1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 package javax.mail.internet;
21
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import javax.activation.DataHandler;
import javax.activation.DataSource;
import javax.mail.MessagingException;

import org.apache.geronimo.mail.util.ASCIIUtil;
import org.apache.geronimo.mail.util.Base64;
import org.apache.geronimo.mail.util.Base64DecoderStream;
import org.apache.geronimo.mail.util.Base64Encoder;
import org.apache.geronimo.mail.util.Base64EncoderStream;
import org.apache.geronimo.mail.util.QuotedPrintable;
import org.apache.geronimo.mail.util.QuotedPrintableDecoderStream;
import org.apache.geronimo.mail.util.QuotedPrintableEncoder;
import org.apache.geronimo.mail.util.QuotedPrintableEncoderStream;
import org.apache.geronimo.mail.util.SessionUtil;
import org.apache.geronimo.mail.util.UUDecoderStream;
import org.apache.geronimo.mail.util.UUEncoderStream;
52
// Supported transfer encodings are "base64", "quoted-printable", "7bit", "8bit" and "binary".
// In addition, "uuencode" (also accepted as "x-uuencode" or "x-uue") is supported.
55
56 /**
57 * @version $Rev: 627556 $ $Date: 2008-02-13 13:27:22 -0500 (Wed, 13 Feb 2008) $
58 */
59 public class MimeUtility {
60
// Names of the properties consulted (through SessionUtil) by the methods of this class.
private static final String MIME_FOLDENCODEDWORDS = "mail.mime.foldencodedwords";
private static final String MIME_DECODE_TEXT_STRICT = "mail.mime.decodetext.strict";
private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
// NOTE(review): presumably the line length at which header text gets folded -- confirm against its users.
private static final int FOLD_THRESHOLD = 76;

// static utility class, never instantiated.
private MimeUtility() {
}

// NOTE(review): public API constant; its users are not in the reviewed portion of the file, keep the value stable.
public static final int ALL = -1;

private static String defaultJavaCharset;
// characters that get a backslash escape inside a quoted string (see quote()).
private static final String escapedChars = "\"\\\r\n";
// characters treated as linear white space when header text is split into tokens.
private static final String linearWhiteSpace = " \t\r\n";

// "specials" sets handed to the quoted-printable encoder: the first when encoding a
// word (encodeWord), the second when encoding free text (encodeText).
private static final String QP_WORD_SPECIALS = "=_?\"#$%&'(),.:;<>@[\\]^`{|}~";
private static final String QP_TEXT_SPECIALS = "=_?";

// the javamail spec includes the ability to map java encoding names to MIME-specified names.  Normally,
// these values are loaded from a character mapping file.  Both tables are (re)created by
// loadCharacterSetMappings(), so they cannot be final.
private static Map java2mime;
private static Map mime2java;

static {
    // we need to load the mapping tables used by javaCharset() and mimeCharset().
    loadCharacterSetMappings();
}
87
88 public static InputStream decode(InputStream in, String encoding) throws MessagingException {
89 encoding = encoding.toLowerCase();
90
91 // some encodies are just pass-throughs, with no real decoding.
92 if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
93 return in;
94 }
95 else if (encoding.equals("base64")) {
96 return new Base64DecoderStream(in);
97 }
98 // UUEncode is known by a couple historical extension names too.
99 else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
100 return new UUDecoderStream(in);
101 }
102 else if (encoding.equals("quoted-printable")) {
103 return new QuotedPrintableDecoderStream(in);
104 }
105 else {
106 throw new MessagingException("Unknown encoding " + encoding);
107 }
108 }
109
110 /**
111 * Decode a string of text obtained from a mail header into
112 * it's proper form. The text generally will consist of a
113 * string of tokens, some of which may be encoded using
114 * base64 encoding.
115 *
116 * @param text The text to decode.
117 *
118 * @return The decoded test string.
119 * @exception UnsupportedEncodingException
120 */
121 public static String decodeText(String text) throws UnsupportedEncodingException {
122 // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
123 // source string doesn't contain that sequent, no decoding is required.
124 if (text.indexOf("=?") < 0) {
125 return text;
126 }
127
128 // we have two sets of rules we can apply.
129 if (!SessionUtil.getBooleanProperty(MIME_DECODE_TEXT_STRICT, true)) {
130 return decodeTextNonStrict(text);
131 }
132
133 int offset = 0;
134 int endOffset = text.length();
135
136 int startWhiteSpace = -1;
137 int endWhiteSpace = -1;
138
139 StringBuffer decodedText = new StringBuffer(text.length());
140
141 boolean previousTokenEncoded = false;
142
143 while (offset < endOffset) {
144 char ch = text.charAt(offset);
145
146 // is this a whitespace character?
147 if (linearWhiteSpace.indexOf(ch) != -1) {
148 startWhiteSpace = offset;
149 while (offset < endOffset) {
150 // step over the white space characters.
151 ch = text.charAt(offset);
152 if (linearWhiteSpace.indexOf(ch) != -1) {
153 offset++;
154 }
155 else {
156 // record the location of the first non lwsp and drop down to process the
157 // token characters.
158 endWhiteSpace = offset;
159 break;
160 }
161 }
162 }
163 else {
164 // we have a word token. We need to scan over the word and then try to parse it.
165 int wordStart = offset;
166
167 while (offset < endOffset) {
168 // step over the white space characters.
169 ch = text.charAt(offset);
170 if (linearWhiteSpace.indexOf(ch) == -1) {
171 offset++;
172 }
173 else {
174 break;
175 }
176
177 //NB: Trailing whitespace on these header strings will just be discarded.
178 }
179 // pull out the word token.
180 String word = text.substring(wordStart, offset);
181 // is the token encoded? decode the word
182 if (word.startsWith("=?")) {
183 try {
184 // if this gives a parsing failure, treat it like a non-encoded word.
185 String decodedWord = decodeWord(word);
186
187 // are any whitespace characters significant? Append 'em if we've got 'em.
188 if (!previousTokenEncoded) {
189 if (startWhiteSpace != -1) {
190 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
191 startWhiteSpace = -1;
192 }
193 }
194 // this is definitely a decoded token.
195 previousTokenEncoded = true;
196 // and add this to the text.
197 decodedText.append(decodedWord);
198 // we continue parsing from here...we allow parsing errors to fall through
199 // and get handled as normal text.
200 continue;
201
202 } catch (ParseException e) {
203 }
204 }
205 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
206 // if we have it.
207 if (startWhiteSpace != -1) {
208 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
209 startWhiteSpace = -1;
210 }
211 // this is not a decoded token.
212 previousTokenEncoded = false;
213 decodedText.append(word);
214 }
215 }
216
217 return decodedText.toString();
218 }
219
220
221 /**
222 * Decode a string of text obtained from a mail header into
223 * it's proper form. The text generally will consist of a
224 * string of tokens, some of which may be encoded using
225 * base64 encoding. This is for non-strict decoded for mailers that
226 * violate the RFC 2047 restriction that decoded tokens must be delimited
227 * by linear white space. This will scan tokens looking for inner tokens
228 * enclosed in "=?" -- "?=" pairs.
229 *
230 * @param text The text to decode.
231 *
232 * @return The decoded test string.
233 * @exception UnsupportedEncodingException
234 */
235 private static String decodeTextNonStrict(String text) throws UnsupportedEncodingException {
236 int offset = 0;
237 int endOffset = text.length();
238
239 int startWhiteSpace = -1;
240 int endWhiteSpace = -1;
241
242 StringBuffer decodedText = new StringBuffer(text.length());
243
244 boolean previousTokenEncoded = false;
245
246 while (offset < endOffset) {
247 char ch = text.charAt(offset);
248
249 // is this a whitespace character?
250 if (linearWhiteSpace.indexOf(ch) != -1) {
251 startWhiteSpace = offset;
252 while (offset < endOffset) {
253 // step over the white space characters.
254 ch = text.charAt(offset);
255 if (linearWhiteSpace.indexOf(ch) != -1) {
256 offset++;
257 }
258 else {
259 // record the location of the first non lwsp and drop down to process the
260 // token characters.
261 endWhiteSpace = offset;
262 break;
263 }
264 }
265 }
266 else {
267 // we're at the start of a word token. We potentially need to break this up into subtokens
268 int wordStart = offset;
269
270 while (offset < endOffset) {
271 // step over the white space characters.
272 ch = text.charAt(offset);
273 if (linearWhiteSpace.indexOf(ch) == -1) {
274 offset++;
275 }
276 else {
277 break;
278 }
279
280 //NB: Trailing whitespace on these header strings will just be discarded.
281 }
282 // pull out the word token.
283 String word = text.substring(wordStart, offset);
284
285 int decodeStart = 0;
286
287 // now scan and process each of the bits within here.
288 while (decodeStart < word.length()) {
289 int tokenStart = word.indexOf("=?", decodeStart);
290 if (tokenStart == -1) {
291 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
292 // if we have it.
293 if (startWhiteSpace != -1) {
294 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
295 startWhiteSpace = -1;
296 }
297 // this is not a decoded token.
298 previousTokenEncoded = false;
299 decodedText.append(word.substring(decodeStart));
300 // we're finished.
301 break;
302 }
303 // we have something to process
304 else {
305 // we might have a normal token preceeding this.
306 if (tokenStart != decodeStart) {
307 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
308 // if we have it.
309 if (startWhiteSpace != -1) {
310 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
311 startWhiteSpace = -1;
312 }
313 // this is not a decoded token.
314 previousTokenEncoded = false;
315 decodedText.append(word.substring(decodeStart, tokenStart));
316 }
317
318 // now find the end marker.
319 int tokenEnd = word.indexOf("?=", tokenStart);
320 // sigh, an invalid token. Treat this as plain text.
321 if (tokenEnd == -1) {
322 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
323 // if we have it.
324 if (startWhiteSpace != -1) {
325 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
326 startWhiteSpace = -1;
327 }
328 // this is not a decoded token.
329 previousTokenEncoded = false;
330 decodedText.append(word.substring(tokenStart));
331 // we're finished.
332 break;
333 }
334 else {
335 // update our ticker
336 decodeStart = tokenEnd + 2;
337
338 String token = word.substring(tokenStart, tokenEnd);
339 try {
340 // if this gives a parsing failure, treat it like a non-encoded word.
341 String decodedWord = decodeWord(token);
342
343 // are any whitespace characters significant? Append 'em if we've got 'em.
344 if (!previousTokenEncoded) {
345 if (startWhiteSpace != -1) {
346 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
347 startWhiteSpace = -1;
348 }
349 }
350 // this is definitely a decoded token.
351 previousTokenEncoded = true;
352 // and add this to the text.
353 decodedText.append(decodedWord);
354 // we continue parsing from here...we allow parsing errors to fall through
355 // and get handled as normal text.
356 continue;
357
358 } catch (ParseException e) {
359 }
360 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
361 // if we have it.
362 if (startWhiteSpace != -1) {
363 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
364 startWhiteSpace = -1;
365 }
366 // this is not a decoded token.
367 previousTokenEncoded = false;
368 decodedText.append(token);
369 }
370 }
371 }
372 }
373 }
374
375 return decodedText.toString();
376 }
377
378 /**
379 * Parse a string using the RFC 2047 rules for an "encoded-word"
380 * type. This encoding has the syntax:
381 *
382 * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
383 *
384 * @param word The possibly encoded word value.
385 *
386 * @return The decoded word.
387 * @exception ParseException
388 * @exception UnsupportedEncodingException
389 */
390 public static String decodeWord(String word) throws ParseException, UnsupportedEncodingException {
391 // encoded words start with the characters "=?". If this not an encoded word, we throw a
392 // ParseException for the caller.
393
394 if (!word.startsWith("=?")) {
395 throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
396 }
397
398 int charsetPos = word.indexOf('?', 2);
399 if (charsetPos == -1) {
400 throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
401 }
402
403 // pull out the character set information (this is the MIME name at this point).
404 String charset = word.substring(2, charsetPos).toLowerCase();
405
406 // now pull out the encoding token the same way.
407 int encodingPos = word.indexOf('?', charsetPos + 1);
408 if (encodingPos == -1) {
409 throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
410 }
411
412 String encoding = word.substring(charsetPos + 1, encodingPos);
413
414 // and finally the encoded text.
415 int encodedTextPos = word.indexOf("?=", encodingPos + 1);
416 if (encodedTextPos == -1) {
417 throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
418 }
419
420 String encodedText = word.substring(encodingPos + 1, encodedTextPos);
421
422 // seems a bit silly to encode a null string, but easy to deal with.
423 if (encodedText.length() == 0) {
424 return "";
425 }
426
427 try {
428 // the decoder writes directly to an output stream.
429 ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
430
431 byte[] encodedData = encodedText.getBytes("US-ASCII");
432
433 // Base64 encoded?
434 if (encoding.equals("B")) {
435 Base64.decode(encodedData, out);
436 }
437 // maybe quoted printable.
438 else if (encoding.equals("Q")) {
439 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder();
440 dataEncoder.decodeWord(encodedData, out);
441 }
442 else {
443 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
444 }
445 // get the decoded byte data and convert into a string.
446 byte[] decodedData = out.toByteArray();
447 return new String(decodedData, javaCharset(charset));
448 } catch (IOException e) {
449 throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
450 }
451
452 }
453
454 /**
455 * Wrap an encoder around a given output stream.
456 *
457 * @param out The output stream to wrap.
458 * @param encoding The name of the encoding.
459 *
460 * @return A instance of FilterOutputStream that manages on the fly
461 * encoding for the requested encoding type.
462 * @exception MessagingException
463 */
464 public static OutputStream encode(OutputStream out, String encoding) throws MessagingException {
465 // no encoding specified, so assume it goes out unchanged.
466 if (encoding == null) {
467 return out;
468 }
469
470 encoding = encoding.toLowerCase();
471
472 // some encodies are just pass-throughs, with no real decoding.
473 if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
474 return out;
475 }
476 else if (encoding.equals("base64")) {
477 return new Base64EncoderStream(out);
478 }
479 // UUEncode is known by a couple historical extension names too.
480 else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
481 return new UUEncoderStream(out);
482 }
483 else if (encoding.equals("quoted-printable")) {
484 return new QuotedPrintableEncoderStream(out);
485 }
486 else {
487 throw new MessagingException("Unknown encoding " + encoding);
488 }
489 }
490
491 /**
492 * Wrap an encoder around a given output stream.
493 *
494 * @param out The output stream to wrap.
495 * @param encoding The name of the encoding.
496 * @param filename The filename of the data being sent (only used for UUEncode).
497 *
498 * @return A instance of FilterOutputStream that manages on the fly
499 * encoding for the requested encoding type.
500 * @exception MessagingException
501 */
502 public static OutputStream encode(OutputStream out, String encoding, String filename) throws MessagingException {
503 encoding = encoding.toLowerCase();
504
505 // some encodies are just pass-throughs, with no real decoding.
506 if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
507 return out;
508 }
509 else if (encoding.equals("base64")) {
510 return new Base64EncoderStream(out);
511 }
512 // UUEncode is known by a couple historical extension names too.
513 else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
514 return new UUEncoderStream(out, filename);
515 }
516 else if (encoding.equals("quoted-printable")) {
517 return new QuotedPrintableEncoderStream(out);
518 }
519 else {
520 throw new MessagingException("Unknown encoding " + encoding);
521 }
522 }
523
524
525 public static String encodeText(String word) throws UnsupportedEncodingException {
526 return encodeText(word, null, null);
527 }
528
529 public static String encodeText(String word, String charset, String encoding) throws UnsupportedEncodingException {
530 return encodeWord(word, charset, encoding, false);
531 }
532
533 public static String encodeWord(String word) throws UnsupportedEncodingException {
534 return encodeWord(word, null, null);
535 }
536
537 public static String encodeWord(String word, String charset, String encoding) throws UnsupportedEncodingException {
538 return encodeWord(word, charset, encoding, true);
539 }
540
541
542 private static String encodeWord(String word, String charset, String encoding, boolean encodingWord) throws UnsupportedEncodingException {
543
544 // figure out what we need to encode this.
545 String encoder = ASCIIUtil.getTextTransferEncoding(word);
546 // all ascii? We can return this directly,
547 if (encoder.equals("7bit")) {
548 return word;
549 }
550
551 // if not given a charset, use the default.
552 if (charset == null) {
553 charset = getDefaultMIMECharset();
554 }
555
556 // sort out the encoder. If not explicitly given, use the best guess we've already established.
557 if (encoding != null) {
558 if (encoding.equalsIgnoreCase("B")) {
559 encoder = "base64";
560 }
561 else if (encoding.equalsIgnoreCase("Q")) {
562 encoder = "quoted-printable";
563 }
564 else {
565 throw new UnsupportedEncodingException("Unknown transfer encoding: " + encoding);
566 }
567 }
568
569 try {
570
571 // we'll format this directly into the string buffer
572 StringBuffer result = new StringBuffer();
573
574 // this is the maximum size of a segment of encoded data, which is based off
575 // of a 75 character size limit and all of the encoding overhead elements.
576 int sizeLimit = 75 - 7 - charset.length();
577
578 // now do the appropriate encoding work
579 if (encoder.equals("base64")) {
580 Base64Encoder dataEncoder = new Base64Encoder();
581 // this may recurse on the encoding if the string is too long. The left-most will not
582 // get a segment delimiter
583 encodeBase64(word, result, sizeLimit, charset, dataEncoder, true, SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false));
584 }
585 else {
586 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder();
587 encodeQuotedPrintable(word, result, sizeLimit, charset, dataEncoder, true,
588 SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false), encodingWord ? QP_WORD_SPECIALS : QP_TEXT_SPECIALS);
589 }
590 return result.toString();
591 } catch (IOException e) {
592 throw new UnsupportedEncodingException("Invalid encoding");
593 }
594 }
595
596
597 /**
598 * Encode a string into base64 encoding, taking into
599 * account the maximum segment length.
600 *
601 * @param data The string data to encode.
602 * @param out The output buffer used for the result.
603 * @param sizeLimit The maximum amount of encoded data we're allowed
604 * to have in a single encoded segment.
605 * @param charset The character set marker that needs to be added to the
606 * encoding header.
607 * @param encoder The encoder instance we're using.
608 * @param firstSegment
609 * If true, this is the first (left-most) segment in the
610 * data. Used to determine if segment delimiters need to
611 * be added between sections.
612 * @param foldSegments
613 * Indicates the type of delimiter to use (blank or newline sequence).
614 */
615 static private void encodeBase64(String data, StringBuffer out, int sizeLimit, String charset, Base64Encoder encoder, boolean firstSegment, boolean foldSegments) throws IOException
616 {
617 // this needs to be converted into the appropriate transfer encoding.
618 byte [] bytes = data.getBytes(javaCharset(charset));
619
620 int estimatedSize = encoder.estimateEncodedLength(bytes);
621
622 // if the estimated encoding size is over our segment limit, split the string in half and
623 // recurse. Eventually we'll reach a point where things are small enough.
624 if (estimatedSize > sizeLimit) {
625 // the first segment indicator travels with the left half.
626 encodeBase64(data.substring(0, data.length() / 2), out, sizeLimit, charset, encoder, firstSegment, foldSegments);
627 // the second half can never be the first segment
628 encodeBase64(data.substring(data.length() / 2), out, sizeLimit, charset, encoder, false, foldSegments);
629 }
630 else
631 {
632 // if this is not the first sement of the encoding, we need to add either a blank or
633 // a newline sequence to the data
634 if (!firstSegment) {
635 if (foldSegments) {
636 out.append("\r\n");
637 }
638 else {
639 out.append(' ');
640 }
641 }
642 // do the encoding of the segment.
643 encoder.encodeWord(bytes, out, charset);
644 }
645 }
646
647
648 /**
649 * Encode a string into quoted printable encoding, taking into
650 * account the maximum segment length.
651 *
652 * @param data The string data to encode.
653 * @param out The output buffer used for the result.
654 * @param sizeLimit The maximum amount of encoded data we're allowed
655 * to have in a single encoded segment.
656 * @param charset The character set marker that needs to be added to the
657 * encoding header.
658 * @param encoder The encoder instance we're using.
659 * @param firstSegment
660 * If true, this is the first (left-most) segment in the
661 * data. Used to determine if segment delimiters need to
662 * be added between sections.
663 * @param foldSegments
664 * Indicates the type of delimiter to use (blank or newline sequence).
665 */
666 static private void encodeQuotedPrintable(String data, StringBuffer out, int sizeLimit, String charset, QuotedPrintableEncoder encoder,
667 boolean firstSegment, boolean foldSegments, String specials) throws IOException
668 {
669 // this needs to be converted into the appropriate transfer encoding.
670 byte [] bytes = data.getBytes(javaCharset(charset));
671
672 int estimatedSize = encoder.estimateEncodedLength(bytes, specials);
673
674 // if the estimated encoding size is over our segment limit, split the string in half and
675 // recurse. Eventually we'll reach a point where things are small enough.
676 if (estimatedSize > sizeLimit) {
677 // the first segment indicator travels with the left half.
678 encodeQuotedPrintable(data.substring(0, data.length() / 2), out, sizeLimit, charset, encoder, firstSegment, foldSegments, specials);
679 // the second half can never be the first segment
680 encodeQuotedPrintable(data.substring(data.length() / 2), out, sizeLimit, charset, encoder, false, foldSegments, specials);
681 }
682 else
683 {
684 // if this is not the first sement of the encoding, we need to add either a blank or
685 // a newline sequence to the data
686 if (!firstSegment) {
687 if (foldSegments) {
688 out.append("\r\n");
689 }
690 else {
691 out.append(' ');
692 }
693 }
694 // do the encoding of the segment.
695 encoder.encodeWord(bytes, out, charset, specials);
696 }
697 }
698
699
700 /**
701 * Examine the content of a data source and decide what type
702 * of transfer encoding should be used. For text streams,
703 * we'll decided between 7bit, quoted-printable, and base64.
704 * For binary content types, we'll use either 7bit or base64.
705 *
706 * @param handler The DataHandler associated with the content.
707 *
708 * @return The string name of an encoding used to transfer the content.
709 */
710 public static String getEncoding(DataHandler handler) {
711
712
713 // if this handler has an associated data source, we can read directly from the
714 // data source to make this judgment. This is generally MUCH faster than asking the
715 // DataHandler to write out the data for us.
716 DataSource ds = handler.getDataSource();
717 if (ds != null) {
718 return getEncoding(ds);
719 }
720
721 try {
722 // get a parser that allows us to make comparisons.
723 ContentType content = new ContentType(ds.getContentType());
724
725 // The only access to the content bytes at this point is by asking the handler to write
726 // the information out to a stream. We're going to pipe this through a special stream
727 // that examines the bytes as they go by.
728 ContentCheckingOutputStream checker = new ContentCheckingOutputStream();
729
730 handler.writeTo(checker);
731
732 // figure this out based on whether we believe this to be a text type or not.
733 if (content.match("text/*")) {
734 return checker.getTextTransferEncoding();
735 }
736 else {
737 return checker.getBinaryTransferEncoding();
738 }
739
740 } catch (Exception e) {
741 // any unexpected I/O exceptions we'll force to a "safe" fallback position.
742 return "base64";
743 }
744 }
745
746
747 /**
748 * Determine the what transfer encoding should be used for
749 * data retrieved from a DataSource.
750 *
751 * @param source The DataSource for the transmitted data.
752 *
753 * @return The string name of the encoding form that should be used for
754 * the data.
755 */
756 public static String getEncoding(DataSource source) {
757 InputStream in = null;
758
759 try {
760 // get a parser that allows us to make comparisons.
761 ContentType content = new ContentType(source.getContentType());
762
763 // we're probably going to have to scan the data.
764 in = source.getInputStream();
765
766 if (!content.match("text/*")) {
767 // Not purporting to be a text type? Examine the content to see we might be able to
768 // at least pretend it is an ascii type.
769 return ASCIIUtil.getBinaryTransferEncoding(in);
770 }
771 else {
772 return ASCIIUtil.getTextTransferEncoding(in);
773 }
774 } catch (Exception e) {
775 // this was a problem...not sure what makes sense here, so we'll assume it's binary
776 // and we need to transfer this using Base64 encoding.
777 return "base64";
778 } finally {
779 // make sure we close the stream
780 try {
781 if (in != null) {
782 in.close();
783 }
784 } catch (IOException e) {
785 }
786 }
787 }
788
789
790 /**
791 * Quote a "word" value. If the word contains any character from
792 * the specified "specials" list, this value is returned as a
793 * quoted strong. Otherwise, it is returned unchanged (an "atom").
794 *
795 * @param word The word requiring quoting.
796 * @param specials The set of special characters that can't appear in an unquoted
797 * string.
798 *
799 * @return The quoted value. This will be unchanged if the word doesn't contain
800 * any of the designated special characters.
801 */
802 public static String quote(String word, String specials) {
803 int wordLength = word.length();
804 boolean requiresQuoting = false;
805 // scan the string looking for problem characters
806 for (int i =0; i < wordLength; i++) {
807 char ch = word.charAt(i);
808 // special escaped characters require escaping, which also implies quoting.
809 if (escapedChars.indexOf(ch) >= 0) {
810 return quoteAndEscapeString(word);
811 }
812 // now check for control characters or the designated special characters.
813 if (ch < 32 || ch >= 127 || specials.indexOf(ch) >= 0) {
814 // we know this requires quoting, but we still need to scan the entire string to
815 // see if contains chars that require escaping. Just go ahead and treat it as if it does.
816 return quoteAndEscapeString(word);
817 }
818 }
819 return word;
820 }
821
822 /**
823 * Take a string and return it as a formatted quoted string, with
824 * all characters requiring escaping handled properly.
825 *
826 * @param word The string to quote.
827 *
828 * @return The quoted string.
829 */
830 private static String quoteAndEscapeString(String word) {
831 int wordLength = word.length();
832 // allocate at least enough for the string and two quotes plus a reasonable number of escaped chars.
833 StringBuffer buffer = new StringBuffer(wordLength + 10);
834 // add the leading quote.
835 buffer.append('"');
836
837 for (int i = 0; i < wordLength; i++) {
838 char ch = word.charAt(i);
839 // is this an escaped char?
840 if (escapedChars.indexOf(ch) >= 0) {
841 // add the escape marker before appending.
842 buffer.append('\\');
843 }
844 buffer.append(ch);
845 }
846 // now the closing quote
847 buffer.append('"');
848 return buffer.toString();
849 }
850
851 /**
852 * Translate a MIME standard character set name into the Java
853 * equivalent.
854 *
855 * @param charset The MIME standard name.
856 *
857 * @return The Java equivalent for this name.
858 */
859 public static String javaCharset(String charset) {
860 // nothing in, nothing out.
861 if (charset == null) {
862 return null;
863 }
864
865 String mappedCharset = (String)mime2java.get(charset.toLowerCase());
866 // if there is no mapping, then the original name is used. Many of the MIME character set
867 // names map directly back into Java. The reverse isn't necessarily true.
868 return mappedCharset == null ? charset : mappedCharset;
869 }
870
871 /**
872 * Map a Java character set name into the MIME equivalent.
873 *
874 * @param charset The java character set name.
875 *
876 * @return The MIME standard equivalent for this character set name.
877 */
878 public static String mimeCharset(String charset) {
879 // nothing in, nothing out.
880 if (charset == null) {
881 return null;
882 }
883
884 String mappedCharset = (String)java2mime.get(charset.toLowerCase());
885 // if there is no mapping, then the original name is used. Many of the MIME character set
886 // names map directly back into Java. The reverse isn't necessarily true.
887 return mappedCharset == null ? charset : mappedCharset;
888 }
889
890
891 /**
892 * Get the default character set to use, in Java name format.
893 * This either be the value set with the mail.mime.charset
894 * system property or obtained from the file.encoding system
895 * property. If neither of these is set, we fall back to
896 * 8859_1 (basically US-ASCII).
897 *
898 * @return The character string value of the default character set.
899 */
900 public static String getDefaultJavaCharset() {
901 String charset = SessionUtil.getProperty("mail.mime.charset");
902 if (charset != null) {
903 return javaCharset(charset);
904 }
905 return SessionUtil.getProperty("file.encoding", "8859_1");
906 }
907
908 /**
909 * Get the default character set to use, in MIME name format.
910 * This either be the value set with the mail.mime.charset
911 * system property or obtained from the file.encoding system
912 * property. If neither of these is set, we fall back to
913 * 8859_1 (basically US-ASCII).
914 *
915 * @return The character string value of the default character set.
916 */
917 static String getDefaultMIMECharset() {
918 // if the property is specified, this can be used directly.
919 String charset = SessionUtil.getProperty("mail.mime.charset");
920 if (charset != null) {
921 return charset;
922 }
923
924 // get the Java-defined default and map back to a MIME name.
925 return mimeCharset(SessionUtil.getProperty("file.encoding", "8859_1"));
926 }
927
928
929 /**
930 * Load the default mapping tables used by the javaCharset()
931 * and mimeCharset() methods. By default, these tables are
932 * loaded from the /META-INF/javamail.charset.map file. If
933 * something goes wrong loading that file, we configure things
934 * with a default mapping table (which just happens to mimic
935 * what's in the default mapping file).
936 */
937 static private void loadCharacterSetMappings() {
938 java2mime = new HashMap();
939 mime2java = new HashMap();
940
941
942 // normally, these come from a character map file contained in the jar file.
943 try {
944 InputStream map = javax.mail.internet.MimeUtility.class.getResourceAsStream("/META-INF/javamail.charset.map");
945
946 if (map != null) {
947 // get a reader for this so we can load.
948 BufferedReader reader = new BufferedReader(new InputStreamReader(map));
949
950 readMappings(reader, java2mime);
951 readMappings(reader, mime2java);
952 }
953 } catch (Exception e) {
954 }
955
956 // if any sort of error occurred reading the preferred file version, we could end up with empty
957 // mapping tables. This could cause all sorts of difficulty, so ensure they are populated with at
958 // least a reasonable set of defaults.
959
960 // these mappings echo what's in the default file.
961 if (java2mime.isEmpty()) {
962 java2mime.put("8859_1", "ISO-8859-1");
963 java2mime.put("iso8859_1", "ISO-8859-1");
964 java2mime.put("iso8859-1", "ISO-8859-1");
965
966 java2mime.put("8859_2", "ISO-8859-2");
967 java2mime.put("iso8859_2", "ISO-8859-2");
968 java2mime.put("iso8859-2", "ISO-8859-2");
969
970 java2mime.put("8859_3", "ISO-8859-3");
971 java2mime.put("iso8859_3", "ISO-8859-3");
972 java2mime.put("iso8859-3", "ISO-8859-3");
973
974 java2mime.put("8859_4", "ISO-8859-4");
975 java2mime.put("iso8859_4", "ISO-8859-4");
976 java2mime.put("iso8859-4", "ISO-8859-4");
977
978 java2mime.put("8859_5", "ISO-8859-5");
979 java2mime.put("iso8859_5", "ISO-8859-5");
980 java2mime.put("iso8859-5", "ISO-8859-5");
981
982 java2mime.put ("8859_6", "ISO-8859-6");
983 java2mime.put("iso8859_6", "ISO-8859-6");
984 java2mime.put("iso8859-6", "ISO-8859-6");
985
986 java2mime.put("8859_7", "ISO-8859-7");
987 java2mime.put("iso8859_7", "ISO-8859-7");
988 java2mime.put("iso8859-7", "ISO-8859-7");
989
990 java2mime.put("8859_8", "ISO-8859-8");
991 java2mime.put("iso8859_8", "ISO-8859-8");
992 java2mime.put("iso8859-8", "ISO-8859-8");
993
994 java2mime.put("8859_9", "ISO-8859-9");
995 java2mime.put("iso8859_9", "ISO-8859-9");
996 java2mime.put("iso8859-9", "ISO-8859-9");
997
998 java2mime.put("sjis", "Shift_JIS");
999 java2mime.put ("jis", "ISO-2022-JP");
1000 java2mime.put("iso2022jp", "ISO-2022-JP");
1001 java2mime.put("euc_jp", "euc-jp");
1002 java2mime.put("koi8_r", "koi8-r");
1003 java2mime.put("euc_cn", "euc-cn");
1004 java2mime.put("euc_tw", "euc-tw");
1005 java2mime.put("euc_kr", "euc-kr");
1006 }
1007
1008 if (mime2java.isEmpty ()) {
1009 mime2java.put("iso-2022-cn", "ISO2022CN");
1010 mime2java.put("iso-2022-kr", "ISO2022KR");
1011 mime2java.put("utf-8", "UTF8");
1012 mime2java.put("utf8", "UTF8");
1013 mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
1014 mime2java.put("ja_jp.eucjp", "EUCJIS");
1015 mime2java.put ("euc-kr", "KSC5601");
1016 mime2java.put("euckr", "KSC5601");
1017 mime2java.put("us-ascii", "ISO-8859-1");
1018 mime2java.put("x-us-ascii", "ISO-8859-1");
1019 }
1020 }
1021
1022
1023 /**
1024 * Read a section of a character map table and populate the
1025 * target mapping table with the information. The table end
1026 * is marked by a line starting with "--" and also ending with
1027 * "--". Blank lines and comment lines (beginning with '#') are
1028 * ignored.
1029 *
1030 * @param reader The source of the file information.
1031 * @param table The mapping table used to store the information.
1032 */
1033 static private void readMappings(BufferedReader reader, Map table) throws IOException {
1034 // process lines to the EOF or the end of table marker.
1035 while (true) {
1036 String line = reader.readLine();
1037 // no line returned is an EOF
1038 if (line == null) {
1039 return;
1040 }
1041
1042 // trim so we're not messed up by trailing blanks
1043 line = line.trim();
1044
1045 if (line.length() == 0 || line.startsWith("#")) {
1046 continue;
1047 }
1048
1049 // stop processing if this is the end-of-table marker.
1050 if (line.startsWith("--") && line.endsWith("--")) {
1051 return;
1052 }
1053
1054 // we allow either blanks or tabs as token delimiters.
1055 StringTokenizer tokenizer = new StringTokenizer(line, " \t");
1056
1057 try {
1058 String from = tokenizer.nextToken().toLowerCase();
1059 String to = tokenizer.nextToken();
1060
1061 table.put(from, to);
1062 } catch (NoSuchElementException e) {
1063 // just ignore the line if invalid.
1064 }
1065 }
1066 }
1067
1068
1069 /**
1070 * Perform RFC 2047 text folding on a string of text.
1071 *
1072 * @param used The amount of text already "used up" on this line. This is
1073 * typically the length of a message header that this text
1074 * get getting added to.
1075 * @param s The text to fold.
1076 *
1077 * @return The input text, with linebreaks inserted at appropriate fold points.
1078 */
1079 public static String fold(int used, String s) {
1080 // if folding is disable, unfolding is also. Return the string unchanged.
1081 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
1082 return s;
1083 }
1084
1085 int end;
1086
1087 // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs,
1088 // and line break characters.
1089 for (end = s.length() - 1; end >= 0; end--) {
1090 int ch = s.charAt(end);
1091 if (ch != ' ' && ch != '\t' ) {
1092 break;
1093 }
1094 }
1095
1096 // did we actually find something to remove? Shorten the String to the trimmed length
1097 if (end != s.length() - 1) {
1098 s = s.substring(0, end + 1);
1099 }
1100
1101 // does the string as it exists now not require folding? We can just had that back right off.
1102 if (s.length() + used <= FOLD_THRESHOLD) {
1103 return s;
1104 }
1105
1106 // get a buffer for the length of the string, plus room for a few line breaks.
1107 // these are soft line breaks, so we generally need more that just the line breaks (an escape +
1108 // CR + LF + leading space on next line);
1109 StringBuffer newString = new StringBuffer(s.length() + 8);
1110
1111
1112 // now keep chopping this down until we've accomplished what we need.
1113 while (used + s.length() > FOLD_THRESHOLD) {
1114 int breakPoint = -1;
1115 char breakChar = 0;
1116
1117 // now scan for the next place where we can break.
1118 for (int i = 0; i < s.length(); i++) {
1119 // have we passed the fold limit?
1120 if (used + i > FOLD_THRESHOLD) {
1121 // if we've already seen a blank, then stop now. Otherwise
1122 // we keep going until we hit a fold point.
1123 if (breakPoint != -1) {
1124 break;
1125 }
1126 }
1127 char ch = s.charAt(i);
1128
1129 // a white space character?
1130 if (ch == ' ' || ch == '\t') {
1131 // this might be a run of white space, so skip over those now.
1132 breakPoint = i;
1133 // we need to maintain the same character type after the inserted linebreak.
1134 breakChar = ch;
1135 i++;
1136 while (i < s.length()) {
1137 ch = s.charAt(i);
1138 if (ch != ' ' && ch != '\t') {
1139 break;
1140 }
1141 i++;
1142 }
1143 }
1144 // found an embedded new line. Escape this so that the unfolding process preserves it.
1145 else if (ch == '\n') {
1146 newString.append('\\');
1147 newString.append('\n');
1148 }
1149 else if (ch == '\r') {
1150 newString.append('\\');
1151 newString.append('\n');
1152 i++;
1153 // if this is a CRLF pair, add the second char also
1154 if (i < s.length() && s.charAt(i) == '\n') {
1155 newString.append('\r');
1156 }
1157 }
1158
1159 }
1160 // no fold point found, we punt, append the remainder and leave.
1161 if (breakPoint == -1) {
1162 newString.append(s);
1163 return newString.toString();
1164 }
1165 newString.append(s.substring(0, breakPoint));
1166 newString.append("\r\n");
1167 newString.append(breakChar);
1168 // chop the string
1169 s = s.substring(breakPoint + 1);
1170 // start again, and we've used the first char of the limit already with the whitespace char.
1171 used = 1;
1172 }
1173
1174 // add on the remainder, and return
1175 newString.append(s);
1176 return newString.toString();
1177 }
1178
1179 /**
1180 * Unfold a folded string. The unfolding process will remove
1181 * any line breaks that are not escaped and which are also followed
1182 * by whitespace characters.
1183 *
1184 * @param s The folded string.
1185 *
1186 * @return A new string with unfolding rules applied.
1187 */
1188 public static String unfold(String s) {
1189 // if folding is disable, unfolding is also. Return the string unchanged.
1190 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
1191 return s;
1192 }
1193
1194 // if there are no line break characters in the string, we can just return this.
1195 if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
1196 return s;
1197 }
1198
1199 // we need to scan and fix things up.
1200 int length = s.length();
1201
1202 StringBuffer newString = new StringBuffer(length);
1203
1204 // scan the entire string
1205 for (int i = 0; i < length; i++) {
1206 char ch = s.charAt(i);
1207
1208 // we have a backslash. In folded strings, escape characters are only processed as such if
1209 // they preceed line breaks. Otherwise, we leave it be.
1210 if (ch == '\\') {
1211 // escape at the very end? Just add the character.
1212 if (i == length - 1) {
1213 newString.append(ch);
1214 }
1215 else {
1216 int nextChar = s.charAt(i + 1);
1217
1218 // naked newline? Add the new line to the buffer, and skip the escape char.
1219 if (nextChar == '\n') {
1220 newString.append('\n');
1221 i++;
1222 }
1223 else if (nextChar == '\r') {
1224 // just the CR left? Add it, removing the escape.
1225 if (i == length - 2 || s.charAt(i + 2) != '\r') {
1226 newString.append('\r');
1227 i++;
1228 }
1229 else {
1230 // toss the escape, add both parts of the CRLF, and skip over two chars.
1231 newString.append('\r');
1232 newString.append('\n');
1233 i += 2;
1234 }
1235 }
1236 else {
1237 // an escape for another purpose, just copy it over.
1238 newString.append(ch);
1239 }
1240 }
1241 }
1242 // we have an unescaped line break
1243 else if (ch == '\n' || ch == '\r') {
1244 // remember the position in case we need to backtrack.
1245 int lineBreak = i;
1246 boolean CRLF = false;
1247
1248 if (ch == '\r') {
1249 // check to see if we need to step over this.
1250 if (i < length - 1 && s.charAt(i + 1) == '\n') {
1251 i++;
1252 // flag the type so we know what we might need to preserve.
1253 CRLF = true;
1254 }
1255 }
1256
1257 // get a temp position scanner.
1258 int scan = i + 1;
1259
1260 // does a blank follow this new line? we need to scrap the new line and reduce the leading blanks
1261 // down to a single blank.
1262 if (scan < length && s.charAt(scan) == ' ') {
1263 // add the character
1264 newString.append(' ');
1265
1266 // scan over the rest of the blanks
1267 i = scan + 1;
1268 while (i < length && s.charAt(i) == ' ') {
1269 i++;
1270 }
1271 // we'll increment down below, so back up to the last blank as the current char.
1272 i--;
1273 }
1274 else {
1275 // we must keep this line break. Append the appropriate style.
1276 if (CRLF) {
1277 newString.append("\r\n");
1278 }
1279 else {
1280 newString.append(ch);
1281 }
1282 }
1283 }
1284 else {
1285 // just a normal, ordinary character
1286 newString.append(ch);
1287 }
1288 }
1289 return newString.toString();
1290 }
1291 }
1292
1293
1294 /**
1295 * Utility class for examining content information written out
1296 * by a DataHandler object. This stream gathers statistics on
1297 * the stream so it can make transfer encoding determinations.
1298 */
1299 class ContentCheckingOutputStream extends OutputStream {
1300 private int asciiChars = 0;
1301 private int nonAsciiChars = 0;
1302 private boolean containsLongLines = false;
1303 private boolean containsMalformedEOL = false;
1304 private int previousChar = 0;
1305 private int span = 0;
1306
1307 ContentCheckingOutputStream() {
1308 }
1309
1310 public void write(byte[] data) throws IOException {
1311 write(data, 0, data.length);
1312 }
1313
1314 public void write(byte[] data, int offset, int length) throws IOException {
1315 for (int i = 0; i < length; i++) {
1316 write(data[offset + i]);
1317 }
1318 }
1319
1320 public void write(int ch) {
1321 // we found a linebreak. Reset the line length counters on either one. We don't
1322 // really need to validate here.
1323 if (ch == '\n' || ch == '\r') {
1324 // we found a newline, this is only valid if the previous char was the '\r'
1325 if (ch == '\n') {
1326 // malformed linebreak? force this to base64 encoding.
1327 if (previousChar != '\r') {
1328 containsMalformedEOL = true;
1329 }
1330 }
1331 // hit a line end, reset our line length counter
1332 span = 0;
1333 }
1334 else {
1335 span++;
1336 // the text has long lines, we can't transfer this as unencoded text.
1337 if (span > 998) {
1338 containsLongLines = true;
1339 }
1340
1341 // non-ascii character, we have to transfer this in binary.
1342 if (!ASCIIUtil.isAscii(ch)) {
1343 nonAsciiChars++;
1344 }
1345 else {
1346 asciiChars++;
1347 }
1348 }
1349 previousChar = ch;
1350 }
1351
1352
1353 public String getBinaryTransferEncoding() {
1354 if (nonAsciiChars != 0 || containsLongLines || containsMalformedEOL) {
1355 return "base64";
1356 }
1357 else {
1358 return "7bit";
1359 }
1360 }
1361
1362 public String getTextTransferEncoding() {
1363 // looking good so far, only valid chars here.
1364 if (nonAsciiChars == 0) {
1365 // does this contain long text lines? We need to use a Q-P encoding which will
1366 // be only slightly longer, but handles folding the longer lines.
1367 if (containsLongLines) {
1368 return "quoted-printable";
1369 }
1370 else {
1371 // ideal! Easiest one to handle.
1372 return "7bit";
1373 }
1374 }
1375 else {
1376 // mostly characters requiring encoding? Base64 is our best bet.
1377 if (nonAsciiChars > asciiChars) {
1378 return "base64";
1379 }
1380 else {
1381 // Q-P encoding will use fewer bytes than the full Base64.
1382 return "quoted-printable";
1383 }
1384 }
1385 }
1386 }