MimeUtility xref

View Javadoc

1   /**
2    *
3    * Copyright 2003-2006 The Apache Software Foundation
4    *
5    *  Licensed under the Apache License, Version 2.0 (the "License");
6    *  you may not use this file except in compliance with the License.
7    *  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  
18  package javax.mail.internet;
19  
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import javax.activation.DataHandler;
import javax.activation.DataSource;
import javax.mail.MessagingException;

import org.apache.geronimo.mail.util.ASCIIUtil;
import org.apache.geronimo.mail.util.Base64;
import org.apache.geronimo.mail.util.Base64DecoderStream;
import org.apache.geronimo.mail.util.Base64Encoder;
import org.apache.geronimo.mail.util.Base64EncoderStream;
import org.apache.geronimo.mail.util.QuotedPrintable;
import org.apache.geronimo.mail.util.QuotedPrintableDecoderStream;
import org.apache.geronimo.mail.util.QuotedPrintableEncoder;
import org.apache.geronimo.mail.util.QuotedPrintableEncoderStream;
import org.apache.geronimo.mail.util.SessionUtil;
import org.apache.geronimo.mail.util.UUDecoderStream;
import org.apache.geronimo.mail.util.UUEncoderStream;
50  
// Supported transfer encodings include "base64", "quoted-printable", "7bit", "8bit" and "binary".
// In addition, "uuencode" is also supported.
53  
54  /**
55   * @version $Rev: 421852 $ $Date: 2006-07-14 03:02:19 -0700 (Fri, 14 Jul 2006) $
56   */
57  public class MimeUtility {
58  
59      private static final String MIME_FOLDENCODEDWORDS = "mail.mime.foldencodedwords";
60      private static final String MIME_DECODE_TEXT_STRICT = "mail.mime.decodetext.strict";
61      private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
62      private static final int FOLD_THRESHOLD = 76;
63  
64      private MimeUtility() {
65      }
66  
67      public static final int ALL = -1;
68  
69      private static String defaultJavaCharset;
70      private static String escapedChars = "\"\\\r\n";
71      private static String linearWhiteSpace = " \t\r\n";
72  
73      private static String QP_WORD_SPECIALS = "=_?\"#$%&'(),.:;<>@[\\]^`{|}~";
74      private static String QP_TEXT_SPECIALS = "=_?";
75  
76      // the javamail spec includes the ability to map java encoding names to MIME-specified names.  Normally,
77      // these values are loaded from a character mapping file.
78      private static Map java2mime;
79      private static Map mime2java;
80  
81      static {
82          // we need to load the mapping tables used by javaCharset() and mimeCharset().
83          loadCharacterSetMappings();
84      }
85  
86      public static InputStream decode(InputStream in, String encoding) throws MessagingException {
87          encoding = encoding.toLowerCase();
88  
89          // some encodies are just pass-throughs, with no real decoding.
90          if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
91              return in;
92          }
93          else if (encoding.equals("base64")) {
94              return new Base64DecoderStream(in);
95          }
96          // UUEncode is known by a couple historical extension names too.
97          else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
98              return new UUDecoderStream(in);
99          }
100         else if (encoding.equals("quoted-printable")) {
101             return new QuotedPrintableDecoderStream(in);
102         }
103         else {
104             throw new MessagingException("Unknown encoding " + encoding);
105         }
106     }
107 
108     /**
109      * Decode a string of text obtained from a mail header into
110      * it's proper form.  The text generally will consist of a
111      * string of tokens, some of which may be encoded using
112      * base64 encoding.
113      *
114      * @param text   The text to decode.
115      *
116      * @return The decoded test string.
117      * @exception UnsupportedEncodingException
118      */
119     public static String decodeText(String text) throws UnsupportedEncodingException {
120         // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the
121         // source string doesn't contain that sequent, no decoding is required.
122         if (text.indexOf("=?") < 0) {
123             return text;
124         }
125 
126         // we have two sets of rules we can apply.
127         if (!SessionUtil.getBooleanProperty(MIME_DECODE_TEXT_STRICT, true)) {
128             return decodeTextNonStrict(text);
129         }
130 
131         int offset = 0;
132         int endOffset = text.length();
133 
134         int startWhiteSpace = -1;
135         int endWhiteSpace = -1;
136 
137         StringBuffer decodedText = new StringBuffer(text.length());
138 
139         boolean previousTokenEncoded = false;
140 
141         while (offset < endOffset) {
142             char ch = text.charAt(offset);
143 
144             // is this a whitespace character?
145             if (linearWhiteSpace.indexOf(ch) != -1) {
146                 startWhiteSpace = offset;
147                 while (offset < endOffset) {
148                     // step over the white space characters.
149                     ch = text.charAt(offset);
150                     if (linearWhiteSpace.indexOf(ch) != -1) {
151                         offset++;
152                     }
153                     else {
154                         // record the location of the first non lwsp and drop down to process the
155                         // token characters.
156                         endWhiteSpace = offset;
157                         break;
158                     }
159                 }
160             }
161             else {
162                 // we have a word token.  We need to scan over the word and then try to parse it.
163                 int wordStart = offset;
164 
165                 while (offset < endOffset) {
166                     // step over the white space characters.
167                     ch = text.charAt(offset);
168                     if (linearWhiteSpace.indexOf(ch) == -1) {
169                         offset++;
170                     }
171                     else {
172                         break;
173                     }
174 
175                     //NB:  Trailing whitespace on these header strings will just be discarded.
176                 }
177                 // pull out the word token.
178                 String word = text.substring(wordStart, offset);
179                 // is the token encoded?  decode the word
180                 if (word.startsWith("=?")) {
181                     try {
182                         // if this gives a parsing failure, treat it like a non-encoded word.
183                         String decodedWord = decodeWord(word);
184 
185                         // are any whitespace characters significant?  Append 'em if we've got 'em.
186                         if (!previousTokenEncoded) {
187                             if (startWhiteSpace != -1) {
188                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
189                                 startWhiteSpace = -1;
190                             }
191                         }
192                         // this is definitely a decoded token.
193                         previousTokenEncoded = true;
194                         // and add this to the text.
195                         decodedText.append(decodedWord);
196                         // we continue parsing from here...we allow parsing errors to fall through
197                         // and get handled as normal text.
198                         continue;
199 
200                     } catch (ParseException e) {
201                     }
202                 }
203                 // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
204                 // if we have it.
205                 if (startWhiteSpace != -1) {
206                     decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
207                     startWhiteSpace = -1;
208                 }
209                 // this is not a decoded token.
210                 previousTokenEncoded = false;
211                 decodedText.append(word);
212             }
213         }
214 
215         return decodedText.toString();
216     }
217 
218 
219     /**
220      * Decode a string of text obtained from a mail header into
221      * it's proper form.  The text generally will consist of a
222      * string of tokens, some of which may be encoded using
223      * base64 encoding.  This is for non-strict decoded for mailers that
224      * violate the RFC 2047 restriction that decoded tokens must be delimited
225      * by linear white space.  This will scan tokens looking for inner tokens
226      * enclosed in "=?" -- "?=" pairs.
227      *
228      * @param text   The text to decode.
229      *
230      * @return The decoded test string.
231      * @exception UnsupportedEncodingException
232      */
233     private static String decodeTextNonStrict(String text) throws UnsupportedEncodingException {
234         int offset = 0;
235         int endOffset = text.length();
236 
237         int startWhiteSpace = -1;
238         int endWhiteSpace = -1;
239 
240         StringBuffer decodedText = new StringBuffer(text.length());
241 
242         boolean previousTokenEncoded = false;
243 
244         while (offset < endOffset) {
245             char ch = text.charAt(offset);
246 
247             // is this a whitespace character?
248             if (linearWhiteSpace.indexOf(ch) != -1) {
249                 startWhiteSpace = offset;
250                 while (offset < endOffset) {
251                     // step over the white space characters.
252                     ch = text.charAt(offset);
253                     if (linearWhiteSpace.indexOf(ch) != -1) {
254                         offset++;
255                     }
256                     else {
257                         // record the location of the first non lwsp and drop down to process the
258                         // token characters.
259                         endWhiteSpace = offset;
260                         break;
261                     }
262                 }
263             }
264             else {
265                 // we're at the start of a word token.  We potentially need to break this up into subtokens
266                 int wordStart = offset;
267 
268                 while (offset < endOffset) {
269                     // step over the white space characters.
270                     ch = text.charAt(offset);
271                     if (linearWhiteSpace.indexOf(ch) == -1) {
272                         offset++;
273                     }
274                     else {
275                         break;
276                     }
277 
278                     //NB:  Trailing whitespace on these header strings will just be discarded.
279                 }
280                 // pull out the word token.
281                 String word = text.substring(wordStart, offset);
282 
283                 int decodeStart = 0;
284 
285                 // now scan and process each of the bits within here.
286                 while (decodeStart < word.length()) {
287                     int tokenStart = word.indexOf("=?", decodeStart);
288                     if (tokenStart == -1) {
289                         // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
290                         // if we have it.
291                         if (startWhiteSpace != -1) {
292                             decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
293                             startWhiteSpace = -1;
294                         }
295                         // this is not a decoded token.
296                         previousTokenEncoded = false;
297                         decodedText.append(word.substring(decodeStart));
298                         // we're finished.
299                         break;
300                     }
301                     // we have something to process
302                     else {
303                         // we might have a normal token preceeding this.
304                         if (tokenStart != decodeStart) {
305                             // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
306                             // if we have it.
307                             if (startWhiteSpace != -1) {
308                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
309                                 startWhiteSpace = -1;
310                             }
311                             // this is not a decoded token.
312                             previousTokenEncoded = false;
313                             decodedText.append(word.substring(decodeStart, tokenStart));
314                         }
315 
316                         // now find the end marker.
317                         int tokenEnd = word.indexOf("?=", tokenStart);
318                         // sigh, an invalid token.  Treat this as plain text.
319                         if (tokenEnd == -1) {
320                             // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
321                             // if we have it.
322                             if (startWhiteSpace != -1) {
323                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
324                                 startWhiteSpace = -1;
325                             }
326                             // this is not a decoded token.
327                             previousTokenEncoded = false;
328                             decodedText.append(word.substring(tokenStart));
329                             // we're finished.
330                             break;
331                         }
332                         else {
333                             // update our ticker
334                             decodeStart = tokenEnd + 2;
335 
336                             String token = word.substring(tokenStart, tokenEnd);
337                             try {
338                                 // if this gives a parsing failure, treat it like a non-encoded word.
339                                 String decodedWord = decodeWord(token);
340 
341                                 // are any whitespace characters significant?  Append 'em if we've got 'em.
342                                 if (!previousTokenEncoded) {
343                                     if (startWhiteSpace != -1) {
344                                         decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
345                                         startWhiteSpace = -1;
346                                     }
347                                 }
348                                 // this is definitely a decoded token.
349                                 previousTokenEncoded = true;
350                                 // and add this to the text.
351                                 decodedText.append(decodedWord);
352                                 // we continue parsing from here...we allow parsing errors to fall through
353                                 // and get handled as normal text.
354                                 continue;
355 
356                             } catch (ParseException e) {
357                             }
358                             // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
359                             // if we have it.
360                             if (startWhiteSpace != -1) {
361                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
362                                 startWhiteSpace = -1;
363                             }
364                             // this is not a decoded token.
365                             previousTokenEncoded = false;
366                             decodedText.append(token);
367                         }
368                     }
369                 }
370             }
371         }
372 
373         return decodedText.toString();
374     }
375 
376     /**
377      * Parse a string using the RFC 2047 rules for an "encoded-word"
378      * type.  This encoding has the syntax:
379      *
380      * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
381      *
382      * @param word   The possibly encoded word value.
383      *
384      * @return The decoded word.
385      * @exception ParseException
386      * @exception UnsupportedEncodingException
387      */
388     public static String decodeWord(String word) throws ParseException, UnsupportedEncodingException {
389         // encoded words start with the characters "=?".  If this not an encoded word, we throw a
390         // ParseException for the caller.
391 
392         if (!word.startsWith("=?")) {
393             throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
394         }
395 
396         int charsetPos = word.indexOf('?', 2);
397         if (charsetPos == -1) {
398             throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
399         }
400 
401         // pull out the character set information (this is the MIME name at this point).
402         String charset = word.substring(2, charsetPos).toLowerCase();
403 
404         // now pull out the encoding token the same way.
405         int encodingPos = word.indexOf('?', charsetPos + 1);
406         if (encodingPos == -1) {
407             throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
408         }
409 
410         String encoding = word.substring(charsetPos + 1, encodingPos);
411 
412         // and finally the encoded text.
413         int encodedTextPos = word.indexOf("?=", encodingPos + 1);
414         if (encodedTextPos == -1) {
415             throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
416         }
417 
418         String encodedText = word.substring(encodingPos + 1, encodedTextPos);
419 
420         // seems a bit silly to encode a null string, but easy to deal with.
421         if (encodedText.length() == 0) {
422             return "";
423         }
424 
425         try {
426             // the decoder writes directly to an output stream.
427             ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
428 
429             byte[] encodedData = encodedText.getBytes("US-ASCII");
430 
431             // Base64 encoded?
432             if (encoding.equals("B")) {
433                 Base64.decode(encodedData, out);
434             }
435             // maybe quoted printable.
436             else if (encoding.equals("Q")) {
437                 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder();
438                 dataEncoder.decodeWord(encodedData, out);
439             }
440             else {
441                 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
442             }
443             // get the decoded byte data and convert into a string.
444             byte[] decodedData = out.toByteArray();
445             return new String(decodedData, javaCharset(charset));
446         } catch (IOException e) {
447             throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
448         }
449 
450     }
451 
452     /**
453      * Wrap an encoder around a given output stream.
454      *
455      * @param out      The output stream to wrap.
456      * @param encoding The name of the encoding.
457      *
458      * @return A instance of FilterOutputStream that manages on the fly
459      *         encoding for the requested encoding type.
460      * @exception MessagingException
461      */
462     public static OutputStream encode(OutputStream out, String encoding) throws MessagingException {
463         // no encoding specified, so assume it goes out unchanged.
464         if (encoding == null) {
465             return out;
466         }
467 
468         encoding = encoding.toLowerCase();
469 
470         // some encodies are just pass-throughs, with no real decoding.
471         if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
472             return out;
473         }
474         else if (encoding.equals("base64")) {
475             return new Base64EncoderStream(out);
476         }
477         // UUEncode is known by a couple historical extension names too.
478         else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
479             return new UUEncoderStream(out);
480         }
481         else if (encoding.equals("quoted-printable")) {
482             return new QuotedPrintableEncoderStream(out);
483         }
484         else {
485             throw new MessagingException("Unknown encoding " + encoding);
486         }
487     }
488 
489     /**
490      * Wrap an encoder around a given output stream.
491      *
492      * @param out      The output stream to wrap.
493      * @param encoding The name of the encoding.
494      * @param filename The filename of the data being sent (only used for UUEncode).
495      *
496      * @return A instance of FilterOutputStream that manages on the fly
497      *         encoding for the requested encoding type.
498      * @exception MessagingException
499      */
500     public static OutputStream encode(OutputStream out, String encoding, String filename) throws MessagingException {
501         encoding = encoding.toLowerCase();
502 
503         // some encodies are just pass-throughs, with no real decoding.
504         if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
505             return out;
506         }
507         else if (encoding.equals("base64")) {
508             return new Base64EncoderStream(out);
509         }
510         // UUEncode is known by a couple historical extension names too.
511         else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
512             return new UUEncoderStream(out, filename);
513         }
514         else if (encoding.equals("quoted-printable")) {
515              return new QuotedPrintableEncoderStream(out);
516         }
517         else {
518             throw new MessagingException("Unknown encoding " + encoding);
519         }
520     }
521 
522 
523     public static String encodeText(String word) throws UnsupportedEncodingException {
524         return encodeText(word, null, null);
525     }
526 
527     public static String encodeText(String word, String charset, String encoding) throws UnsupportedEncodingException {
528         return encodeWord(word, charset, encoding, false);
529     }
530 
531     public static String encodeWord(String word) throws UnsupportedEncodingException {
532         return encodeWord(word, null, null);
533     }
534 
535     public static String encodeWord(String word, String charset, String encoding) throws UnsupportedEncodingException {
536         return encodeWord(word, charset, encoding, true);
537     }
538 
539 
540     private static String encodeWord(String word, String charset, String encoding, boolean encodingWord) throws UnsupportedEncodingException {
541 
542         // figure out what we need to encode this.
543         String encoder = ASCIIUtil.getTextTransferEncoding(word);
544         // all ascii?  We can return this directly,
545         if (encoder.equals("7bit")) {
546             return word;
547         }
548 
549         // if not given a charset, use the default.
550         if (charset == null) {
551             charset = getDefaultMIMECharset();
552         }
553 
554         // sort out the encoder.  If not explicitly given, use the best guess we've already established.
555         if (encoding != null) {
556             if (encoding.equalsIgnoreCase("B")) {
557                 encoder = "base64";
558             }
559             else if (encoding.equalsIgnoreCase("Q")) {
560                 encoder = "quoted-printable";
561             }
562             else {
563                 throw new UnsupportedEncodingException("Unknown transfer encoding: " + encoding);
564             }
565         }
566 
567         try {
568             // get the string bytes in the correct source charset
569             InputStream in = new ByteArrayInputStream(word.getBytes( javaCharset(charset)));
570             ByteArrayOutputStream out = new ByteArrayOutputStream();
571 
572             if (encoder.equals("base64")) {
573                 Base64Encoder dataEncoder = new Base64Encoder();
574                 dataEncoder.encodeWord(in, charset, out, SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false));
575             }
576             else {
577                 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder();
578                 dataEncoder.encodeWord(in, charset, encodingWord ? QP_WORD_SPECIALS : QP_TEXT_SPECIALS, out, SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false));
579             }
580 
581             byte[] bytes = out.toByteArray();
582             return new String(bytes);
583         } catch (IOException e) {
584             throw new UnsupportedEncodingException("Invalid encoding");
585         }
586     }
587 
588 
589     /**
590      * Examine the content of a data source and decide what type
591      * of transfer encoding should be used.  For text streams,
592      * we'll decided between 7bit, quoted-printable, and base64.
593      * For binary content types, we'll use either 7bit or base64.
594      *
595      * @param handler The DataHandler associated with the content.
596      *
597      * @return The string name of an encoding used to transfer the content.
598      */
599     public static String getEncoding(DataHandler handler) {
600 
601 
602         // if this handler has an associated data source, we can read directly from the
603         // data source to make this judgment.  This is generally MUCH faster than asking the
604         // DataHandler to write out the data for us.
605         DataSource ds = handler.getDataSource();
606         if (ds != null) {
607             return getEncoding(ds);
608         }
609 
610         try {
611             // get a parser that allows us to make comparisons.
612             ContentType content = new ContentType(ds.getContentType());
613 
614             // The only access to the content bytes at this point is by asking the handler to write
615             // the information out to a stream.  We're going to pipe this through a special stream
616             // that examines the bytes as they go by.
617             ContentCheckingOutputStream checker = new ContentCheckingOutputStream();
618 
619             handler.writeTo(checker);
620 
621             // figure this out based on whether we believe this to be a text type or not.
622             if (content.match("text/*")) {
623                 return checker.getTextTransferEncoding();
624             }
625             else {
626                 return checker.getBinaryTransferEncoding();
627             }
628 
629         } catch (Exception e) {
630             // any unexpected I/O exceptions we'll force to a "safe" fallback position.
631             return "base64";
632         }
633     }
634 
635 
636     /**
637      * Determine the what transfer encoding should be used for
638      * data retrieved from a DataSource.
639      *
640      * @param source The DataSource for the transmitted data.
641      *
642      * @return The string name of the encoding form that should be used for
643      *         the data.
644      */
645     public static String getEncoding(DataSource source) {
646         InputStream in = null;
647 
648         try {
649             // get a parser that allows us to make comparisons.
650             ContentType content = new ContentType(source.getContentType());
651 
652             // we're probably going to have to scan the data.
653             in = source.getInputStream();
654 
655             if (!content.match("text/*")) {
656                 // Not purporting to be a text type?  Examine the content to see we might be able to
657                 // at least pretend it is an ascii type.
658                 return ASCIIUtil.getBinaryTransferEncoding(in);
659             }
660             else {
661                 return ASCIIUtil.getTextTransferEncoding(in);
662             }
663         } catch (Exception e) {
664             // this was a problem...not sure what makes sense here, so we'll assume it's binary
665             // and we need to transfer this using Base64 encoding.
666             return "base64";
667         } finally {
668             // make sure we close the stream
669             try {
670                 if (in != null) {
671                     in.close();
672                 }
673             } catch (IOException e) {
674             }
675         }
676     }
677 
678 
679     /**
680      * Quote a "word" value.  If the word contains any character from
681      * the specified "specials" list, this value is returned as a
682      * quoted strong.  Otherwise, it is returned unchanged (an "atom").
683      *
684      * @param word     The word requiring quoting.
685      * @param specials The set of special characters that can't appear in an unquoted
686      *                 string.
687      *
688      * @return The quoted value.  This will be unchanged if the word doesn't contain
689      *         any of the designated special characters.
690      */
691     public static String quote(String word, String specials) {
692         int wordLength = word.length();
693         boolean requiresQuoting = false;
694         // scan the string looking for problem characters
695         for (int i =0; i < wordLength; i++) {
696             char ch = word.charAt(i);
697             // special escaped characters require escaping, which also implies quoting.
698             if (escapedChars.indexOf(ch) >= 0) {
699                 return quoteAndEscapeString(word);
700             }
701             // now check for control characters or the designated special characters.
702             if (ch < 32 || ch >= 127 || specials.indexOf(ch) >= 0) {
703                 // we know this requires quoting, but we still need to scan the entire string to
704                 // see if contains chars that require escaping.  Just go ahead and treat it as if it does.
705                 return quoteAndEscapeString(word);
706             }
707         }
708         return word;
709     }
710 
711     /**
712      * Take a string and return it as a formatted quoted string, with
713      * all characters requiring escaping handled properly.
714      *
715      * @param word   The string to quote.
716      *
717      * @return The quoted string.
718      */
719     private static String quoteAndEscapeString(String word) {
720         int wordLength = word.length();
721         // allocate at least enough for the string and two quotes plus a reasonable number of escaped chars.
722         StringBuffer buffer = new StringBuffer(wordLength + 10);
723         // add the leading quote.
724         buffer.append('"');
725 
726         for (int i = 0; i < wordLength; i++) {
727             char ch = word.charAt(i);
728             // is this an escaped char?
729             if (escapedChars.indexOf(ch) >= 0) {
730                 // add the escape marker before appending.
731                 buffer.append('\\');
732             }
733             buffer.append(ch);
734         }
735         // now the closing quote
736         buffer.append('"');
737         return buffer.toString();
738     }
739 
740     /**
741      * Translate a MIME standard character set name into the Java
742      * equivalent.
743      *
744      * @param charset The MIME standard name.
745      *
746      * @return The Java equivalent for this name.
747      */
748     public static String javaCharset(String charset) {
749         // nothing in, nothing out.
750         if (charset == null) {
751             return null;
752         }
753 
754         String mappedCharset = (String)mime2java.get(charset.toLowerCase());
755         // if there is no mapping, then the original name is used.  Many of the MIME character set
756         // names map directly back into Java.  The reverse isn't necessarily true.
757         return mappedCharset == null ? charset : mappedCharset;
758     }
759 
760     /**
761      * Map a Java character set name into the MIME equivalent.
762      *
763      * @param charset The java character set name.
764      *
765      * @return The MIME standard equivalent for this character set name.
766      */
767     public static String mimeCharset(String charset) {
768         // nothing in, nothing out.
769         if (charset == null) {
770             return null;
771         }
772 
773         String mappedCharset = (String)java2mime.get(charset.toLowerCase());
774         // if there is no mapping, then the original name is used.  Many of the MIME character set
775         // names map directly back into Java.  The reverse isn't necessarily true.
776         return mappedCharset == null ? charset : mappedCharset;
777     }
778 
779 
780     /**
781      * Get the default character set to use, in Java name format.
782      * This will either be the value set with the mail.mime.charset
783      * system property or obtained from the file.encoding system
784      * property.  If neither of these is set, we fall back to
785      * 8859_1 (basically US-ASCII).
786      *
787      * @return The character string value of the default character set.
788      */
789     public static String getDefaultJavaCharset() {
790         String charset = SessionUtil.getProperty("mail.mime.charset");
791         if (charset != null) {
792             return javaCharset(charset);
793         }
794         return SessionUtil.getProperty("file.encoding", "8859_1");
795     }
796 
797     /**
798      * Get the default character set to use, in MIME name format.
799      * This will either be the value set with the mail.mime.charset
800      * system property or obtained from the file.encoding system
801      * property.  If neither of these is set, we fall back to
802      * 8859_1 (basically US-ASCII).
803      *
804      * @return The character string value of the default character set.
805      */
806     static String getDefaultMIMECharset() {
807         // if the property is specified, this can be used directly.
808         String charset = SessionUtil.getProperty("mail.mime.charset");
809         if (charset != null) {
810             return charset;
811         }
812 
813         // get the Java-defined default and map back to a MIME name.
814         return mimeCharset(SessionUtil.getProperty("file.encoding", "8859_1"));
815     }
816 
817 
818     /**
819      * Load the default mapping tables used by the javaCharset()
820      * and mimeCharset() methods.  By default, these tables are
821      * loaded from the /META-INF/javamail.charset.map file.  If
822      * something goes wrong loading that file, we configure things
823      * with a default mapping table (which just happens to mimic
824      * what's in the default mapping file).
825      */
826     static private void loadCharacterSetMappings() {
827         java2mime = new HashMap();
828         mime2java = new HashMap();
829 
830 
831         // normally, these come from a character map file contained in the jar file.
832         try {
833             InputStream map = javax.mail.internet.MimeUtility.class.getResourceAsStream("/META-INF/javamail.charset.map");
834 
835             if (map != null) {
836                 // get a reader for this so we can load.
837                 BufferedReader reader = new BufferedReader(new InputStreamReader(map));
838 
839                 readMappings(reader, java2mime);
840                 readMappings(reader, mime2java);
841             }
842         } catch (Exception e) {
843         }
844 
845         // if any sort of error occurred reading the preferred file version, we could end up with empty
846         // mapping tables.  This could cause all sorts of difficulty, so ensure they are populated with at
847         // least a reasonable set of defaults.
848 
849         // these mappings echo what's in the default file.
850         if (java2mime.isEmpty()) {
851             java2mime.put("8859_1", "ISO-8859-1");
852             java2mime.put("iso8859_1", "ISO-8859-1");
853             java2mime.put("iso8859-1", "ISO-8859-1");
854 
855             java2mime.put("8859_2", "ISO-8859-2");
856             java2mime.put("iso8859_2", "ISO-8859-2");
857             java2mime.put("iso8859-2", "ISO-8859-2");
858 
859             java2mime.put("8859_3", "ISO-8859-3");
860             java2mime.put("iso8859_3", "ISO-8859-3");
861             java2mime.put("iso8859-3", "ISO-8859-3");
862 
863             java2mime.put("8859_4", "ISO-8859-4");
864             java2mime.put("iso8859_4", "ISO-8859-4");
865             java2mime.put("iso8859-4", "ISO-8859-4");
866 
867             java2mime.put("8859_5", "ISO-8859-5");
868             java2mime.put("iso8859_5", "ISO-8859-5");
869             java2mime.put("iso8859-5", "ISO-8859-5");
870 
871             java2mime.put ("8859_6", "ISO-8859-6");
872             java2mime.put("iso8859_6", "ISO-8859-6");
873             java2mime.put("iso8859-6", "ISO-8859-6");
874 
875             java2mime.put("8859_7", "ISO-8859-7");
876             java2mime.put("iso8859_7", "ISO-8859-7");
877             java2mime.put("iso8859-7", "ISO-8859-7");
878 
879             java2mime.put("8859_8", "ISO-8859-8");
880             java2mime.put("iso8859_8", "ISO-8859-8");
881             java2mime.put("iso8859-8", "ISO-8859-8");
882 
883             java2mime.put("8859_9", "ISO-8859-9");
884             java2mime.put("iso8859_9", "ISO-8859-9");
885             java2mime.put("iso8859-9", "ISO-8859-9");
886 
887             java2mime.put("sjis", "Shift_JIS");
888             java2mime.put ("jis", "ISO-2022-JP");
889             java2mime.put("iso2022jp", "ISO-2022-JP");
890             java2mime.put("euc_jp", "euc-jp");
891             java2mime.put("koi8_r", "koi8-r");
892             java2mime.put("euc_cn", "euc-cn");
893             java2mime.put("euc_tw", "euc-tw");
894             java2mime.put("euc_kr", "euc-kr");
895         }
896 
897         if (mime2java.isEmpty ()) {
898             mime2java.put("iso-2022-cn", "ISO2022CN");
899             mime2java.put("iso-2022-kr", "ISO2022KR");
900             mime2java.put("utf-8", "UTF8");
901             mime2java.put("utf8", "UTF8");
902             mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
903             mime2java.put("ja_jp.eucjp", "EUCJIS");
904             mime2java.put ("euc-kr", "KSC5601");
905             mime2java.put("euckr", "KSC5601");
906             mime2java.put("us-ascii", "ISO-8859-1");
907             mime2java.put("x-us-ascii", "ISO-8859-1");
908         }
909     }
910 
911 
912     /**
913      * Read a section of a character map table and populate the
914      * target mapping table with the information.  The table end
915      * is marked by a line starting with "--" and also ending with
916      * "--".  Blank lines and comment lines (beginning with '#') are
917      * ignored.
918      *
919      * @param reader The source of the file information.
920      * @param table  The mapping table used to store the information.
921      */
922     static private void readMappings(BufferedReader reader, Map table) throws IOException {
923         // process lines to the EOF or the end of table marker.
924         while (true) {
925             String line = reader.readLine();
926             // no line returned is an EOF
927             if (line == null) {
928                 return;
929             }
930 
931             // trim so we're not messed up by trailing blanks
932             line = line.trim();
933 
934             if (line.length() == 0 || line.startsWith("#")) {
935                 continue;
936             }
937 
938             // stop processing if this is the end-of-table marker.
939             if (line.startsWith("--") && line.endsWith("--")) {
940                 return;
941             }
942 
943             // we allow either blanks or tabs as token delimiters.
944             StringTokenizer tokenizer = new StringTokenizer(line, " \t");
945 
946             try {
947                 String from = tokenizer.nextToken().toLowerCase();
948                 String to = tokenizer.nextToken();
949 
950                 table.put(from, to);
951             } catch (NoSuchElementException e) {
952                 // just ignore the line if invalid.
953             }
954         }
955     }
956 
957 
958     /**
959      * Perform RFC 2047 text folding on a string of text.
960      *
961      * @param used   The amount of text already "used up" on this line.  This is
962      *               typically the length of a message header that this text
963      *               is getting added to.
964      * @param s      The text to fold.
965      *
966      * @return The input text, with linebreaks inserted at appropriate fold points.
967      */
968     public static String fold(int used, String s) {
969         // if folding is disable, unfolding is also.  Return the string unchanged.
970         if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
971             return s;
972         }
973 
974         int end;
975 
976         // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs,
977         // and line break characters.
978         for (end = s.length() - 1; end >= 0; end--) {
979             int ch = s.charAt(end);
980             if (ch != ' ' && ch != '\t' ) {
981                 break;
982             }
983         }
984 
985         // did we actually find something to remove?  Shorten the String to the trimmed length
986         if (end != s.length() - 1) {
987             s = s.substring(0, end + 1);
988         }
989 
990         // does the string as it exists now not require folding?  We can just had that back right off.
991         if (s.length() + used <= FOLD_THRESHOLD) {
992             return s;
993         }
994 
995         // get a buffer for the length of the string, plus room for a few line breaks.
996         // these are soft line breaks, so we generally need more that just the line breaks (an escape +
997         // CR + LF + leading space on next line);
998         StringBuffer newString = new StringBuffer(s.length() + 8);
999 
1000 
1001         // now keep chopping this down until we've accomplished what we need.
1002         while (used + s.length() > FOLD_THRESHOLD) {
1003             int breakPoint = -1;
1004             char breakChar = 0;
1005 
1006             // now scan for the next place where we can break.
1007             for (int i = 0; i < s.length(); i++) {
1008                 // have we passed the fold limit?
1009                 if (used + i > FOLD_THRESHOLD) {
1010                     // if we've already seen a blank, then stop now.  Otherwise
1011                     // we keep going until we hit a fold point.
1012                     if (breakPoint != -1) {
1013                         break;
1014                     }
1015                 }
1016                 char ch = s.charAt(i);
1017 
1018                 // a white space character?
1019                 if (ch == ' ' || ch == '\t') {
1020                     // this might be a run of white space, so skip over those now.
1021                     breakPoint = i;
1022                     // we need to maintain the same character type after the inserted linebreak.
1023                     breakChar = ch;
1024                     i++;
1025                     while (i < s.length()) {
1026                         ch = s.charAt(i);
1027                         if (ch != ' ' && ch != '\t') {
1028                             break;
1029                         }
1030                         i++;
1031                     }
1032                 }
1033                 // found an embedded new line.  Escape this so that the unfolding process preserves it.
1034                 else if (ch == '\n') {
1035                     newString.append('\\');
1036                     newString.append('\n');
1037                 }
1038                 else if (ch == '\r') {
1039                     newString.append('\\');
1040                     newString.append('\n');
1041                     i++;
1042                     // if this is a CRLF pair, add the second char also
1043                     if (i < s.length() && s.charAt(i) == '\n') {
1044                         newString.append('\r');
1045                     }
1046                 }
1047 
1048             }
1049             // no fold point found, we punt, append the remainder and leave.
1050             if (breakPoint == -1) {
1051                 newString.append(s);
1052                 return newString.toString();
1053             }
1054             newString.append(s.substring(0, breakPoint));
1055             newString.append("\r\n");
1056             newString.append(breakChar);
1057             // chop the string
1058             s = s.substring(breakPoint + 1);
1059             // start again, and we've used the first char of the limit already with the whitespace char.
1060             used = 1;
1061         }
1062 
1063         // add on the remainder, and return
1064         newString.append(s);
1065         return newString.toString();
1066     }
1067 
1068     /**
1069      * Unfold a folded string.  The unfolding process will remove
1070      * any line breaks that are not escaped and which are also followed
1071      * by whitespace characters.
1072      *
1073      * @param s      The folded string.
1074      *
1075      * @return A new string with unfolding rules applied.
1076      */
1077     public static String unfold(String s) {
1078         // if folding is disable, unfolding is also.  Return the string unchanged.
1079         if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
1080             return s;
1081         }
1082 
1083         // if there are no line break characters in the string, we can just return this.
1084         if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
1085             return s;
1086         }
1087 
1088         // we need to scan and fix things up.
1089         int length = s.length();
1090 
1091         StringBuffer newString = new StringBuffer(length);
1092 
1093         // scan the entire string
1094         for (int i = 0; i < length; i++) {
1095             char ch = s.charAt(i);
1096 
1097             // we have a backslash.  In folded strings, escape characters are only processed as such if
1098             // they preceed line breaks.  Otherwise, we leave it be.
1099             if (ch == '\\') {
1100                 // escape at the very end?  Just add the character.
1101                 if (i == length - 1) {
1102                     newString.append(ch);
1103                 }
1104                 else {
1105                     int nextChar = s.charAt(i + 1);
1106 
1107                     // naked newline?  Add the new line to the buffer, and skip the escape char.
1108                     if (nextChar == '\n') {
1109                         newString.append('\n');
1110                         i++;
1111                     }
1112                     else if (nextChar == '\r') {
1113                         // just the CR left?  Add it, removing the escape.
1114                         if (i == length - 2 || s.charAt(i + 2) != '\r') {
1115                             newString.append('\r');
1116                             i++;
1117                         }
1118                         else {
1119                             // toss the escape, add both parts of the CRLF, and skip over two chars.
1120                             newString.append('\r');
1121                             newString.append('\n');
1122                             i += 2;
1123                         }
1124                     }
1125                     else {
1126                         // an escape for another purpose, just copy it over.
1127                         newString.append(ch);
1128                     }
1129                 }
1130             }
1131             // we have an unescaped line break
1132             else if (ch == '\n' || ch == '\r') {
1133                 // remember the position in case we need to backtrack.
1134                 int lineBreak = i;
1135                 boolean CRLF = false;
1136 
1137                 if (ch == '\r') {
1138                     // check to see if we need to step over this.
1139                     if (i < length - 1 && s.charAt(i + 1) == '\n') {
1140                         i++;
1141                         // flag the type so we know what we might need to preserve.
1142                         CRLF = true;
1143                     }
1144                 }
1145 
1146                 // get a temp position scanner.
1147                 int scan = i + 1;
1148 
1149                 // does a blank follow this new line?  we need to scrap the new line and reduce the leading blanks
1150                 // down to a single blank.
1151                 if (scan < length && s.charAt(scan) == ' ') {
1152                     // add the character
1153                     newString.append(' ');
1154 
1155                     // scan over the rest of the blanks
1156                     i = scan + 1;
1157                     while (i < length && s.charAt(i) == ' ') {
1158                         i++;
1159                     }
1160                     // we'll increment down below, so back up to the last blank as the current char.
1161                     i--;
1162                 }
1163                 else {
1164                     // we must keep this line break.  Append the appropriate style.
1165                     if (CRLF) {
1166                         newString.append("\r\n");
1167                     }
1168                     else {
1169                         newString.append(ch);
1170                     }
1171                 }
1172             }
1173             else {
1174                 // just a normal, ordinary character
1175                 newString.append(ch);
1176             }
1177         }
1178         return newString.toString();
1179     }
1180 }
1181 
1182 
1183 /**
1184  * Utility class for examining content information written out
1185  * by a DataHandler object.  This stream gathers statistics on
1186  * the stream so it can make transfer encoding determinations.
1187  */
1188 class ContentCheckingOutputStream extends OutputStream {
1189     private int asciiChars = 0;
1190     private int nonAsciiChars = 0;
1191     private boolean containsLongLines = false;
1192     private boolean containsMalformedEOL = false;
1193     private int previousChar = 0;
1194     private int span = 0;
1195 
1196     ContentCheckingOutputStream() {
1197     }
1198 
1199     public void write(byte[] data) throws IOException {
1200         write(data, 0, data.length);
1201     }
1202 
1203     public void write(byte[] data, int offset, int length) throws IOException {
1204         for (int i = 0; i < length; i++) {
1205             write(data[offset + i]);
1206         }
1207     }
1208 
1209     public void write(int ch) {
1210         // we found a linebreak.  Reset the line length counters on either one.  We don't
1211         // really need to validate here.
1212         if (ch == '\n' || ch == '\r') {
1213             // we found a newline, this is only valid if the previous char was the '\r'
1214             if (ch == '\n') {
1215                 // malformed linebreak?  force this to base64 encoding.
1216                 if (previousChar != '\r') {
1217                     containsMalformedEOL = true;
1218                 }
1219             }
1220             // hit a line end, reset our line length counter
1221             span = 0;
1222         }
1223         else {
1224             span++;
1225             // the text has long lines, we can't transfer this as unencoded text.
1226             if (span > 998) {
1227                 containsLongLines = true;
1228             }
1229 
1230             // non-ascii character, we have to transfer this in binary.
1231             if (!ASCIIUtil.isAscii(ch)) {
1232                 nonAsciiChars++;
1233             }
1234             else {
1235                 asciiChars++;
1236             }
1237         }
1238         previousChar = ch;
1239     }
1240 
1241 
1242     public String getBinaryTransferEncoding() {
1243         if (nonAsciiChars != 0 || containsLongLines || containsMalformedEOL) {
1244             return "base64";
1245         }
1246         else {
1247             return "7bit";
1248         }
1249     }
1250 
1251     public String getTextTransferEncoding() {
1252         // looking good so far, only valid chars here.
1253         if (nonAsciiChars == 0) {
1254             // does this contain long text lines?  We need to use a Q-P encoding which will
1255             // be only slightly longer, but handles folding the longer lines.
1256             if (containsLongLines) {
1257                 return "quoted-printable";
1258             }
1259             else {
1260                 // ideal!  Easiest one to handle.
1261                 return "7bit";
1262             }
1263         }
1264         else {
1265             // mostly characters requiring encoding?  Base64 is our best bet.
1266             if (nonAsciiChars > asciiChars) {
1267                 return "base64";
1268             }
1269             else {
1270                 // Q-P encoding will use fewer bytes than the full Base64.
1271                 return "quoted-printable";
1272             }
1273         }
1274     }
1275 }