MimeUtility xref

View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *  http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package javax.mail.internet;
21  
22  import java.io.BufferedInputStream;
23  import java.io.BufferedReader;
24  import java.io.ByteArrayInputStream;
25  import java.io.ByteArrayOutputStream;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.InputStreamReader;
29  import java.io.OutputStream;
30  import java.io.UnsupportedEncodingException;
31  import java.util.HashMap;
32  import java.util.Map;
33  import java.util.NoSuchElementException;
34  import java.util.StringTokenizer;
35  
36  import javax.activation.DataHandler;
37  import javax.activation.DataSource;
38  import javax.mail.MessagingException;
39  
40  import org.apache.geronimo.mail.util.ASCIIUtil;
41  import org.apache.geronimo.mail.util.Base64;
42  import org.apache.geronimo.mail.util.Base64DecoderStream;
43  import org.apache.geronimo.mail.util.Base64Encoder;
44  import org.apache.geronimo.mail.util.Base64EncoderStream;
45  import org.apache.geronimo.mail.util.QuotedPrintableDecoderStream;
46  import org.apache.geronimo.mail.util.QuotedPrintableEncoderStream;
47  import org.apache.geronimo.mail.util.QuotedPrintableEncoder;
48  import org.apache.geronimo.mail.util.QuotedPrintable;
49  import org.apache.geronimo.mail.util.SessionUtil;
50  import org.apache.geronimo.mail.util.UUDecoderStream;
51  import org.apache.geronimo.mail.util.UUEncoderStream;
52  
53  // Supported encodings include "base64", "quoted-printable", "7bit", "8bit" and "binary".
54  // In addition, "uuencode" is also supported.
55  
56  /**
57   * @version $Rev: 627556 $ $Date: 2008-02-13 13:27:22 -0500 (Wed, 13 Feb 2008) $
58   */
59  public class MimeUtility {
60  
61      private static final String MIME_FOLDENCODEDWORDS = "mail.mime.foldencodedwords";
62      private static final String MIME_DECODE_TEXT_STRICT = "mail.mime.decodetext.strict";
63      private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
64      private static final int FOLD_THRESHOLD = 76;
65  
66      private MimeUtility() {
67      }
68  
69      public static final int ALL = -1;
70  
71      private static String defaultJavaCharset;
72      private static String escapedChars = "\"\\\r\n";
73      private static String linearWhiteSpace = " \t\r\n";
74  
75      private static String QP_WORD_SPECIALS = "=_?\"#$%&'(),.:;<>@[\\]^`{|}~";
76      private static String QP_TEXT_SPECIALS = "=_?";
77  
78      // the javamail spec includes the ability to map java encoding names to MIME-specified names.  Normally,
79      // these values are loaded from a character mapping file.
80      private static Map java2mime;
81      private static Map mime2java;
82  
83      static {
84          // we need to load the mapping tables used by javaCharset() and mimeCharset().
85          loadCharacterSetMappings();
86      }
87  
88      public static InputStream decode(InputStream in, String encoding) throws MessagingException {
89          encoding = encoding.toLowerCase();
90  
91          // some encodies are just pass-throughs, with no real decoding.
92          if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
93              return in;
94          }
95          else if (encoding.equals("base64")) {
96              return new Base64DecoderStream(in);
97          }
98          // UUEncode is known by a couple historical extension names too.
99          else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
100             return new UUDecoderStream(in);
101         }
102         else if (encoding.equals("quoted-printable")) {
103             return new QuotedPrintableDecoderStream(in);
104         }
105         else {
106             throw new MessagingException("Unknown encoding " + encoding);
107         }
108     }
109 
110     /**
111      * Decode a string of text obtained from a mail header into
112      * it's proper form.  The text generally will consist of a
113      * string of tokens, some of which may be encoded using
114      * base64 encoding.
115      *
116      * @param text   The text to decode.
117      *
118      * @return The decoded test string.
119      * @exception UnsupportedEncodingException
120      */
121     public static String decodeText(String text) throws UnsupportedEncodingException {
122         // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the
123         // source string doesn't contain that sequent, no decoding is required.
124         if (text.indexOf("=?") < 0) {
125             return text;
126         }
127 
128         // we have two sets of rules we can apply.
129         if (!SessionUtil.getBooleanProperty(MIME_DECODE_TEXT_STRICT, true)) {
130             return decodeTextNonStrict(text);
131         }
132 
133         int offset = 0;
134         int endOffset = text.length();
135 
136         int startWhiteSpace = -1;
137         int endWhiteSpace = -1;
138 
139         StringBuffer decodedText = new StringBuffer(text.length());
140 
141         boolean previousTokenEncoded = false;
142 
143         while (offset < endOffset) {
144             char ch = text.charAt(offset);
145 
146             // is this a whitespace character?
147             if (linearWhiteSpace.indexOf(ch) != -1) {
148                 startWhiteSpace = offset;
149                 while (offset < endOffset) {
150                     // step over the white space characters.
151                     ch = text.charAt(offset);
152                     if (linearWhiteSpace.indexOf(ch) != -1) {
153                         offset++;
154                     }
155                     else {
156                         // record the location of the first non lwsp and drop down to process the
157                         // token characters.
158                         endWhiteSpace = offset;
159                         break;
160                     }
161                 }
162             }
163             else {
164                 // we have a word token.  We need to scan over the word and then try to parse it.
165                 int wordStart = offset;
166 
167                 while (offset < endOffset) {
168                     // step over the white space characters.
169                     ch = text.charAt(offset);
170                     if (linearWhiteSpace.indexOf(ch) == -1) {
171                         offset++;
172                     }
173                     else {
174                         break;
175                     }
176 
177                     //NB:  Trailing whitespace on these header strings will just be discarded.
178                 }
179                 // pull out the word token.
180                 String word = text.substring(wordStart, offset);
181                 // is the token encoded?  decode the word
182                 if (word.startsWith("=?")) {
183                     try {
184                         // if this gives a parsing failure, treat it like a non-encoded word.
185                         String decodedWord = decodeWord(word);
186 
187                         // are any whitespace characters significant?  Append 'em if we've got 'em.
188                         if (!previousTokenEncoded) {
189                             if (startWhiteSpace != -1) {
190                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
191                                 startWhiteSpace = -1;
192                             }
193                         }
194                         // this is definitely a decoded token.
195                         previousTokenEncoded = true;
196                         // and add this to the text.
197                         decodedText.append(decodedWord);
198                         // we continue parsing from here...we allow parsing errors to fall through
199                         // and get handled as normal text.
200                         continue;
201 
202                     } catch (ParseException e) {
203                     }
204                 }
205                 // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
206                 // if we have it.
207                 if (startWhiteSpace != -1) {
208                     decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
209                     startWhiteSpace = -1;
210                 }
211                 // this is not a decoded token.
212                 previousTokenEncoded = false;
213                 decodedText.append(word);
214             }
215         }
216 
217         return decodedText.toString();
218     }
219 
220 
221     /**
222      * Decode a string of text obtained from a mail header into
223      * it's proper form.  The text generally will consist of a
224      * string of tokens, some of which may be encoded using
225      * base64 encoding.  This is for non-strict decoded for mailers that
226      * violate the RFC 2047 restriction that decoded tokens must be delimited
227      * by linear white space.  This will scan tokens looking for inner tokens
228      * enclosed in "=?" -- "?=" pairs.
229      *
230      * @param text   The text to decode.
231      *
232      * @return The decoded test string.
233      * @exception UnsupportedEncodingException
234      */
235     private static String decodeTextNonStrict(String text) throws UnsupportedEncodingException {
236         int offset = 0;
237         int endOffset = text.length();
238 
239         int startWhiteSpace = -1;
240         int endWhiteSpace = -1;
241 
242         StringBuffer decodedText = new StringBuffer(text.length());
243 
244         boolean previousTokenEncoded = false;
245 
246         while (offset < endOffset) {
247             char ch = text.charAt(offset);
248 
249             // is this a whitespace character?
250             if (linearWhiteSpace.indexOf(ch) != -1) {
251                 startWhiteSpace = offset;
252                 while (offset < endOffset) {
253                     // step over the white space characters.
254                     ch = text.charAt(offset);
255                     if (linearWhiteSpace.indexOf(ch) != -1) {
256                         offset++;
257                     }
258                     else {
259                         // record the location of the first non lwsp and drop down to process the
260                         // token characters.
261                         endWhiteSpace = offset;
262                         break;
263                     }
264                 }
265             }
266             else {
267                 // we're at the start of a word token.  We potentially need to break this up into subtokens
268                 int wordStart = offset;
269 
270                 while (offset < endOffset) {
271                     // step over the white space characters.
272                     ch = text.charAt(offset);
273                     if (linearWhiteSpace.indexOf(ch) == -1) {
274                         offset++;
275                     }
276                     else {
277                         break;
278                     }
279 
280                     //NB:  Trailing whitespace on these header strings will just be discarded.
281                 }
282                 // pull out the word token.
283                 String word = text.substring(wordStart, offset);
284 
285                 int decodeStart = 0;
286 
287                 // now scan and process each of the bits within here.
288                 while (decodeStart < word.length()) {
289                     int tokenStart = word.indexOf("=?", decodeStart);
290                     if (tokenStart == -1) {
291                         // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
292                         // if we have it.
293                         if (startWhiteSpace != -1) {
294                             decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
295                             startWhiteSpace = -1;
296                         }
297                         // this is not a decoded token.
298                         previousTokenEncoded = false;
299                         decodedText.append(word.substring(decodeStart));
300                         // we're finished.
301                         break;
302                     }
303                     // we have something to process
304                     else {
305                         // we might have a normal token preceeding this.
306                         if (tokenStart != decodeStart) {
307                             // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
308                             // if we have it.
309                             if (startWhiteSpace != -1) {
310                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
311                                 startWhiteSpace = -1;
312                             }
313                             // this is not a decoded token.
314                             previousTokenEncoded = false;
315                             decodedText.append(word.substring(decodeStart, tokenStart));
316                         }
317 
318                         // now find the end marker.
319                         int tokenEnd = word.indexOf("?=", tokenStart);
320                         // sigh, an invalid token.  Treat this as plain text.
321                         if (tokenEnd == -1) {
322                             // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
323                             // if we have it.
324                             if (startWhiteSpace != -1) {
325                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
326                                 startWhiteSpace = -1;
327                             }
328                             // this is not a decoded token.
329                             previousTokenEncoded = false;
330                             decodedText.append(word.substring(tokenStart));
331                             // we're finished.
332                             break;
333                         }
334                         else {
335                             // update our ticker
336                             decodeStart = tokenEnd + 2;
337 
338                             String token = word.substring(tokenStart, tokenEnd);
339                             try {
340                                 // if this gives a parsing failure, treat it like a non-encoded word.
341                                 String decodedWord = decodeWord(token);
342 
343                                 // are any whitespace characters significant?  Append 'em if we've got 'em.
344                                 if (!previousTokenEncoded) {
345                                     if (startWhiteSpace != -1) {
346                                         decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
347                                         startWhiteSpace = -1;
348                                     }
349                                 }
350                                 // this is definitely a decoded token.
351                                 previousTokenEncoded = true;
352                                 // and add this to the text.
353                                 decodedText.append(decodedWord);
354                                 // we continue parsing from here...we allow parsing errors to fall through
355                                 // and get handled as normal text.
356                                 continue;
357 
358                             } catch (ParseException e) {
359                             }
360                             // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
361                             // if we have it.
362                             if (startWhiteSpace != -1) {
363                                 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
364                                 startWhiteSpace = -1;
365                             }
366                             // this is not a decoded token.
367                             previousTokenEncoded = false;
368                             decodedText.append(token);
369                         }
370                     }
371                 }
372             }
373         }
374 
375         return decodedText.toString();
376     }
377 
378     /**
379      * Parse a string using the RFC 2047 rules for an "encoded-word"
380      * type.  This encoding has the syntax:
381      *
382      * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
383      *
384      * @param word   The possibly encoded word value.
385      *
386      * @return The decoded word.
387      * @exception ParseException
388      * @exception UnsupportedEncodingException
389      */
390     public static String decodeWord(String word) throws ParseException, UnsupportedEncodingException {
391         // encoded words start with the characters "=?".  If this not an encoded word, we throw a
392         // ParseException for the caller.
393 
394         if (!word.startsWith("=?")) {
395             throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
396         }
397 
398         int charsetPos = word.indexOf('?', 2);
399         if (charsetPos == -1) {
400             throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
401         }
402 
403         // pull out the character set information (this is the MIME name at this point).
404         String charset = word.substring(2, charsetPos).toLowerCase();
405 
406         // now pull out the encoding token the same way.
407         int encodingPos = word.indexOf('?', charsetPos + 1);
408         if (encodingPos == -1) {
409             throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
410         }
411 
412         String encoding = word.substring(charsetPos + 1, encodingPos);
413 
414         // and finally the encoded text.
415         int encodedTextPos = word.indexOf("?=", encodingPos + 1);
416         if (encodedTextPos == -1) {
417             throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
418         }
419 
420         String encodedText = word.substring(encodingPos + 1, encodedTextPos);
421 
422         // seems a bit silly to encode a null string, but easy to deal with.
423         if (encodedText.length() == 0) {
424             return "";
425         }
426 
427         try {
428             // the decoder writes directly to an output stream.
429             ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
430 
431             byte[] encodedData = encodedText.getBytes("US-ASCII");
432 
433             // Base64 encoded?
434             if (encoding.equals("B")) {
435                 Base64.decode(encodedData, out);
436             }
437             // maybe quoted printable.
438             else if (encoding.equals("Q")) {
439                 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder();
440                 dataEncoder.decodeWord(encodedData, out);
441             }
442             else {
443                 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
444             }
445             // get the decoded byte data and convert into a string.
446             byte[] decodedData = out.toByteArray();
447             return new String(decodedData, javaCharset(charset));
448         } catch (IOException e) {
449             throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
450         }
451 
452     }
453 
454     /**
455      * Wrap an encoder around a given output stream.
456      *
457      * @param out      The output stream to wrap.
458      * @param encoding The name of the encoding.
459      *
460      * @return A instance of FilterOutputStream that manages on the fly
461      *         encoding for the requested encoding type.
462      * @exception MessagingException
463      */
464     public static OutputStream encode(OutputStream out, String encoding) throws MessagingException {
465         // no encoding specified, so assume it goes out unchanged.
466         if (encoding == null) {
467             return out;
468         }
469 
470         encoding = encoding.toLowerCase();
471 
472         // some encodies are just pass-throughs, with no real decoding.
473         if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
474             return out;
475         }
476         else if (encoding.equals("base64")) {
477             return new Base64EncoderStream(out);
478         }
479         // UUEncode is known by a couple historical extension names too.
480         else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
481             return new UUEncoderStream(out);
482         }
483         else if (encoding.equals("quoted-printable")) {
484             return new QuotedPrintableEncoderStream(out);
485         }
486         else {
487             throw new MessagingException("Unknown encoding " + encoding);
488         }
489     }
490 
491     /**
492      * Wrap an encoder around a given output stream.
493      *
494      * @param out      The output stream to wrap.
495      * @param encoding The name of the encoding.
496      * @param filename The filename of the data being sent (only used for UUEncode).
497      *
498      * @return A instance of FilterOutputStream that manages on the fly
499      *         encoding for the requested encoding type.
500      * @exception MessagingException
501      */
502     public static OutputStream encode(OutputStream out, String encoding, String filename) throws MessagingException {
503         encoding = encoding.toLowerCase();
504 
505         // some encodies are just pass-throughs, with no real decoding.
506         if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) {
507             return out;
508         }
509         else if (encoding.equals("base64")) {
510             return new Base64EncoderStream(out);
511         }
512         // UUEncode is known by a couple historical extension names too.
513         else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) {
514             return new UUEncoderStream(out, filename);
515         }
516         else if (encoding.equals("quoted-printable")) {
517              return new QuotedPrintableEncoderStream(out);
518         }
519         else {
520             throw new MessagingException("Unknown encoding " + encoding);
521         }
522     }
523 
524 
525     public static String encodeText(String word) throws UnsupportedEncodingException {
526         return encodeText(word, null, null);
527     }
528 
529     public static String encodeText(String word, String charset, String encoding) throws UnsupportedEncodingException {
530         return encodeWord(word, charset, encoding, false);
531     }
532 
533     public static String encodeWord(String word) throws UnsupportedEncodingException {
534         return encodeWord(word, null, null);
535     }
536 
537     public static String encodeWord(String word, String charset, String encoding) throws UnsupportedEncodingException {
538         return encodeWord(word, charset, encoding, true);
539     }
540 
541 
542     private static String encodeWord(String word, String charset, String encoding, boolean encodingWord) throws UnsupportedEncodingException {
543 
544         // figure out what we need to encode this.
545         String encoder = ASCIIUtil.getTextTransferEncoding(word);
546         // all ascii?  We can return this directly,
547         if (encoder.equals("7bit")) {
548             return word;
549         }
550 
551         // if not given a charset, use the default.
552         if (charset == null) {
553             charset = getDefaultMIMECharset();
554         }
555 
556         // sort out the encoder.  If not explicitly given, use the best guess we've already established.
557         if (encoding != null) {
558             if (encoding.equalsIgnoreCase("B")) {
559                 encoder = "base64";
560             }
561             else if (encoding.equalsIgnoreCase("Q")) {
562                 encoder = "quoted-printable";
563             }
564             else {
565                 throw new UnsupportedEncodingException("Unknown transfer encoding: " + encoding);
566             }
567         }
568 
569         try {
570             
571             // we'll format this directly into the string buffer 
572             StringBuffer result = new StringBuffer(); 
573             
574             // this is the maximum size of a segment of encoded data, which is based off 
575             // of a 75 character size limit and all of the encoding overhead elements.
576             int sizeLimit = 75 - 7 - charset.length();
577             
578             // now do the appropriate encoding work 
579             if (encoder.equals("base64")) {
580                 Base64Encoder dataEncoder = new Base64Encoder();
581                 // this may recurse on the encoding if the string is too long.  The left-most will not 
582                 // get a segment delimiter 
583                 encodeBase64(word, result, sizeLimit, charset, dataEncoder, true, SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false)); 
584             }
585             else {
586                 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder();
587                 encodeQuotedPrintable(word, result, sizeLimit, charset, dataEncoder, true, 
588                     SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false), encodingWord ? QP_WORD_SPECIALS : QP_TEXT_SPECIALS); 
589             }
590             return result.toString();    
591         } catch (IOException e) {
592             throw new UnsupportedEncodingException("Invalid encoding");
593         }
594     }
595     
596     
597     /**
598      * Encode a string into base64 encoding, taking into 
599      * account the maximum segment length. 
600      * 
601      * @param data      The string data to encode.
602      * @param out       The output buffer used for the result.
603      * @param sizeLimit The maximum amount of encoded data we're allowed
604      *                  to have in a single encoded segment.
605      * @param charset   The character set marker that needs to be added to the
606      *                  encoding header.
607      * @param encoder   The encoder instance we're using.
608      * @param firstSegment
609      *                  If true, this is the first (left-most) segment in the
610      *                  data.  Used to determine if segment delimiters need to
611      *                  be added between sections.
612      * @param foldSegments
613      *                  Indicates the type of delimiter to use (blank or newline sequence).
614      */
615     static private void encodeBase64(String data, StringBuffer out, int sizeLimit, String charset, Base64Encoder encoder, boolean firstSegment, boolean foldSegments) throws IOException
616     {
617         // this needs to be converted into the appropriate transfer encoding. 
618         byte [] bytes = data.getBytes(javaCharset(charset)); 
619         
620         int estimatedSize = encoder.estimateEncodedLength(bytes); 
621         
622         // if the estimated encoding size is over our segment limit, split the string in half and 
623         // recurse.  Eventually we'll reach a point where things are small enough.  
624         if (estimatedSize > sizeLimit) {
625             // the first segment indicator travels with the left half. 
626             encodeBase64(data.substring(0, data.length() / 2), out, sizeLimit, charset, encoder, firstSegment, foldSegments);
627             // the second half can never be the first segment 
628             encodeBase64(data.substring(data.length() / 2), out, sizeLimit, charset, encoder, false, foldSegments);
629         }
630         else 
631         {
632             // if this is not the first sement of the encoding, we need to add either a blank or 
633             // a newline sequence to the data 
634             if (!firstSegment) {
635                 if (foldSegments) {
636                     out.append("\r\n"); 
637                 }
638                 else {
639                     out.append(' '); 
640                 }
641             }
642             // do the encoding of the segment.
643             encoder.encodeWord(bytes, out, charset);
644         }
645     }
646     
647     
648     /**
649      * Encode a string into quoted printable encoding, taking into 
650      * account the maximum segment length. 
651      * 
652      * @param data      The string data to encode.
653      * @param out       The output buffer used for the result.
654      * @param sizeLimit The maximum amount of encoded data we're allowed
655      *                  to have in a single encoded segment.
656      * @param charset   The character set marker that needs to be added to the
657      *                  encoding header.
658      * @param encoder   The encoder instance we're using.
659      * @param firstSegment
660      *                  If true, this is the first (left-most) segment in the
661      *                  data.  Used to determine if segment delimiters need to
662      *                  be added between sections.
663      * @param foldSegments
664      *                  Indicates the type of delimiter to use (blank or newline sequence).
665      */
666     static private void encodeQuotedPrintable(String data, StringBuffer out, int sizeLimit, String charset, QuotedPrintableEncoder encoder, 
667         boolean firstSegment, boolean foldSegments, String specials)  throws IOException 
668     {
669         // this needs to be converted into the appropriate transfer encoding. 
670         byte [] bytes = data.getBytes(javaCharset(charset)); 
671         
672         int estimatedSize = encoder.estimateEncodedLength(bytes, specials); 
673         
674         // if the estimated encoding size is over our segment limit, split the string in half and 
675         // recurse.  Eventually we'll reach a point where things are small enough.  
676         if (estimatedSize > sizeLimit) {
677             // the first segment indicator travels with the left half. 
678             encodeQuotedPrintable(data.substring(0, data.length() / 2), out, sizeLimit, charset, encoder, firstSegment, foldSegments, specials);
679             // the second half can never be the first segment 
680             encodeQuotedPrintable(data.substring(data.length() / 2), out, sizeLimit, charset, encoder, false, foldSegments, specials);
681         }
682         else 
683         {
684             // if this is not the first sement of the encoding, we need to add either a blank or 
685             // a newline sequence to the data 
686             if (!firstSegment) {
687                 if (foldSegments) {
688                     out.append("\r\n"); 
689                 }
690                 else {
691                     out.append(' '); 
692                 }
693             }
694             // do the encoding of the segment.
695             encoder.encodeWord(bytes, out, charset, specials);
696         }
697     }
698 
699 
700     /**
701      * Examine the content of a data source and decide what type
702      * of transfer encoding should be used.  For text streams,
703      * we'll decide between 7bit, quoted-printable, and base64.
704      * For binary content types, we'll use either 7bit or base64.
705      *
706      * @param handler The DataHandler associated with the content.
707      *
708      * @return The string name of an encoding used to transfer the content.
709      */
710     public static String getEncoding(DataHandler handler) {
711 
712 
713         // if this handler has an associated data source, we can read directly from the
714         // data source to make this judgment.  This is generally MUCH faster than asking the
715         // DataHandler to write out the data for us.
716         DataSource ds = handler.getDataSource();
717         if (ds != null) {
718             return getEncoding(ds);
719         }
720 
721         try {
722             // get a parser that allows us to make comparisons.
723             ContentType content = new ContentType(ds.getContentType());
724 
725             // The only access to the content bytes at this point is by asking the handler to write
726             // the information out to a stream.  We're going to pipe this through a special stream
727             // that examines the bytes as they go by.
728             ContentCheckingOutputStream checker = new ContentCheckingOutputStream();
729 
730             handler.writeTo(checker);
731 
732             // figure this out based on whether we believe this to be a text type or not.
733             if (content.match("text/*")) {
734                 return checker.getTextTransferEncoding();
735             }
736             else {
737                 return checker.getBinaryTransferEncoding();
738             }
739 
740         } catch (Exception e) {
741             // any unexpected I/O exceptions we'll force to a "safe" fallback position.
742             return "base64";
743         }
744     }
745 
746 
747     /**
748      * Determine what transfer encoding should be used for
749      * data retrieved from a DataSource.
750      *
751      * @param source The DataSource for the transmitted data.
752      *
753      * @return The string name of the encoding form that should be used for
754      *         the data.
755      */
756     public static String getEncoding(DataSource source) {
757         InputStream in = null;
758 
759         try {
760             // get a parser that allows us to make comparisons.
761             ContentType content = new ContentType(source.getContentType());
762 
763             // we're probably going to have to scan the data.
764             in = source.getInputStream();
765 
766             if (!content.match("text/*")) {
767                 // Not purporting to be a text type?  Examine the content to see we might be able to
768                 // at least pretend it is an ascii type.
769                 return ASCIIUtil.getBinaryTransferEncoding(in);
770             }
771             else {
772                 return ASCIIUtil.getTextTransferEncoding(in);
773             }
774         } catch (Exception e) {
775             // this was a problem...not sure what makes sense here, so we'll assume it's binary
776             // and we need to transfer this using Base64 encoding.
777             return "base64";
778         } finally {
779             // make sure we close the stream
780             try {
781                 if (in != null) {
782                     in.close();
783                 }
784             } catch (IOException e) {
785             }
786         }
787     }
788 
789 
790     /**
791      * Quote a "word" value.  If the word contains any character from
792      * the specified "specials" list, this value is returned as a
793      * quoted string.  Otherwise, it is returned unchanged (an "atom").
794      *
795      * @param word     The word requiring quoting.
796      * @param specials The set of special characters that can't appear in an unquoted
797      *                 string.
798      *
799      * @return The quoted value.  This will be unchanged if the word doesn't contain
800      *         any of the designated special characters.
801      */
802     public static String quote(String word, String specials) {
803         int wordLength = word.length();
804         boolean requiresQuoting = false;
805         // scan the string looking for problem characters
806         for (int i =0; i < wordLength; i++) {
807             char ch = word.charAt(i);
808             // special escaped characters require escaping, which also implies quoting.
809             if (escapedChars.indexOf(ch) >= 0) {
810                 return quoteAndEscapeString(word);
811             }
812             // now check for control characters or the designated special characters.
813             if (ch < 32 || ch >= 127 || specials.indexOf(ch) >= 0) {
814                 // we know this requires quoting, but we still need to scan the entire string to
815                 // see if contains chars that require escaping.  Just go ahead and treat it as if it does.
816                 return quoteAndEscapeString(word);
817             }
818         }
819         return word;
820     }
821 
822     /**
823      * Take a string and return it as a formatted quoted string, with
824      * all characters requiring escaping handled properly.
825      *
826      * @param word   The string to quote.
827      *
828      * @return The quoted string.
829      */
830     private static String quoteAndEscapeString(String word) {
831         int wordLength = word.length();
832         // allocate at least enough for the string and two quotes plus a reasonable number of escaped chars.
833         StringBuffer buffer = new StringBuffer(wordLength + 10);
834         // add the leading quote.
835         buffer.append('"');
836 
837         for (int i = 0; i < wordLength; i++) {
838             char ch = word.charAt(i);
839             // is this an escaped char?
840             if (escapedChars.indexOf(ch) >= 0) {
841                 // add the escape marker before appending.
842                 buffer.append('\\');
843             }
844             buffer.append(ch);
845         }
846         // now the closing quote
847         buffer.append('"');
848         return buffer.toString();
849     }
850 
851     /**
852      * Translate a MIME standard character set name into the Java
853      * equivalent.
854      *
855      * @param charset The MIME standard name.
856      *
857      * @return The Java equivalent for this name.
858      */
859     public static String javaCharset(String charset) {
860         // nothing in, nothing out.
861         if (charset == null) {
862             return null;
863         }
864 
865         String mappedCharset = (String)mime2java.get(charset.toLowerCase());
866         // if there is no mapping, then the original name is used.  Many of the MIME character set
867         // names map directly back into Java.  The reverse isn't necessarily true.
868         return mappedCharset == null ? charset : mappedCharset;
869     }
870 
871     /**
872      * Map a Java character set name into the MIME equivalent.
873      *
874      * @param charset The java character set name.
875      *
876      * @return The MIME standard equivalent for this character set name.
877      */
878     public static String mimeCharset(String charset) {
879         // nothing in, nothing out.
880         if (charset == null) {
881             return null;
882         }
883 
884         String mappedCharset = (String)java2mime.get(charset.toLowerCase());
885         // if there is no mapping, then the original name is used.  Many of the MIME character set
886         // names map directly back into Java.  The reverse isn't necessarily true.
887         return mappedCharset == null ? charset : mappedCharset;
888     }
889 
890 
891     /**
892      * Get the default character set to use, in Java name format.
893      * This will either be the value set with the mail.mime.charset
894      * system property or obtained from the file.encoding system
895      * property.  If neither of these is set, we fall back to
896      * 8859_1 (basically US-ASCII).
897      *
898      * @return The character string value of the default character set.
899      */
900     public static String getDefaultJavaCharset() {
901         String charset = SessionUtil.getProperty("mail.mime.charset");
902         if (charset != null) {
903             return javaCharset(charset);
904         }
905         return SessionUtil.getProperty("file.encoding", "8859_1");
906     }
907 
908     /**
909      * Get the default character set to use, in MIME name format.
910      * This will either be the value set with the mail.mime.charset
911      * system property or obtained from the file.encoding system
912      * property.  If neither of these is set, we fall back to
913      * 8859_1 (basically US-ASCII).
914      *
915      * @return The character string value of the default character set.
916      */
917     static String getDefaultMIMECharset() {
918         // if the property is specified, this can be used directly.
919         String charset = SessionUtil.getProperty("mail.mime.charset");
920         if (charset != null) {
921             return charset;
922         }
923 
924         // get the Java-defined default and map back to a MIME name.
925         return mimeCharset(SessionUtil.getProperty("file.encoding", "8859_1"));
926     }
927 
928 
929     /**
930      * Load the default mapping tables used by the javaCharset()
931      * and mimeCharset() methods.  By default, these tables are
932      * loaded from the /META-INF/javamail.charset.map file.  If
933      * something goes wrong loading that file, we configure things
934      * with a default mapping table (which just happens to mimic
935      * what's in the default mapping file).
936      */
937     static private void loadCharacterSetMappings() {
938         java2mime = new HashMap();
939         mime2java = new HashMap();
940 
941 
942         // normally, these come from a character map file contained in the jar file.
943         try {
944             InputStream map = javax.mail.internet.MimeUtility.class.getResourceAsStream("/META-INF/javamail.charset.map");
945 
946             if (map != null) {
947                 // get a reader for this so we can load.
948                 BufferedReader reader = new BufferedReader(new InputStreamReader(map));
949 
950                 readMappings(reader, java2mime);
951                 readMappings(reader, mime2java);
952             }
953         } catch (Exception e) {
954         }
955 
956         // if any sort of error occurred reading the preferred file version, we could end up with empty
957         // mapping tables.  This could cause all sorts of difficulty, so ensure they are populated with at
958         // least a reasonable set of defaults.
959 
960         // these mappings echo what's in the default file.
961         if (java2mime.isEmpty()) {
962             java2mime.put("8859_1", "ISO-8859-1");
963             java2mime.put("iso8859_1", "ISO-8859-1");
964             java2mime.put("iso8859-1", "ISO-8859-1");
965 
966             java2mime.put("8859_2", "ISO-8859-2");
967             java2mime.put("iso8859_2", "ISO-8859-2");
968             java2mime.put("iso8859-2", "ISO-8859-2");
969 
970             java2mime.put("8859_3", "ISO-8859-3");
971             java2mime.put("iso8859_3", "ISO-8859-3");
972             java2mime.put("iso8859-3", "ISO-8859-3");
973 
974             java2mime.put("8859_4", "ISO-8859-4");
975             java2mime.put("iso8859_4", "ISO-8859-4");
976             java2mime.put("iso8859-4", "ISO-8859-4");
977 
978             java2mime.put("8859_5", "ISO-8859-5");
979             java2mime.put("iso8859_5", "ISO-8859-5");
980             java2mime.put("iso8859-5", "ISO-8859-5");
981 
982             java2mime.put ("8859_6", "ISO-8859-6");
983             java2mime.put("iso8859_6", "ISO-8859-6");
984             java2mime.put("iso8859-6", "ISO-8859-6");
985 
986             java2mime.put("8859_7", "ISO-8859-7");
987             java2mime.put("iso8859_7", "ISO-8859-7");
988             java2mime.put("iso8859-7", "ISO-8859-7");
989 
990             java2mime.put("8859_8", "ISO-8859-8");
991             java2mime.put("iso8859_8", "ISO-8859-8");
992             java2mime.put("iso8859-8", "ISO-8859-8");
993 
994             java2mime.put("8859_9", "ISO-8859-9");
995             java2mime.put("iso8859_9", "ISO-8859-9");
996             java2mime.put("iso8859-9", "ISO-8859-9");
997 
998             java2mime.put("sjis", "Shift_JIS");
999             java2mime.put ("jis", "ISO-2022-JP");
1000             java2mime.put("iso2022jp", "ISO-2022-JP");
1001             java2mime.put("euc_jp", "euc-jp");
1002             java2mime.put("koi8_r", "koi8-r");
1003             java2mime.put("euc_cn", "euc-cn");
1004             java2mime.put("euc_tw", "euc-tw");
1005             java2mime.put("euc_kr", "euc-kr");
1006         }
1007 
1008         if (mime2java.isEmpty ()) {
1009             mime2java.put("iso-2022-cn", "ISO2022CN");
1010             mime2java.put("iso-2022-kr", "ISO2022KR");
1011             mime2java.put("utf-8", "UTF8");
1012             mime2java.put("utf8", "UTF8");
1013             mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
1014             mime2java.put("ja_jp.eucjp", "EUCJIS");
1015             mime2java.put ("euc-kr", "KSC5601");
1016             mime2java.put("euckr", "KSC5601");
1017             mime2java.put("us-ascii", "ISO-8859-1");
1018             mime2java.put("x-us-ascii", "ISO-8859-1");
1019         }
1020     }
1021 
1022 
1023     /**
1024      * Read a section of a character map table and populate the
1025      * target mapping table with the information.  The table end
1026      * is marked by a line starting with "--" and also ending with
1027      * "--".  Blank lines and comment lines (beginning with '#') are
1028      * ignored.
1029      *
1030      * @param reader The source of the file information.
1031      * @param table  The mapping table used to store the information.
1032      */
1033     static private void readMappings(BufferedReader reader, Map table) throws IOException {
1034         // process lines to the EOF or the end of table marker.
1035         while (true) {
1036             String line = reader.readLine();
1037             // no line returned is an EOF
1038             if (line == null) {
1039                 return;
1040             }
1041 
1042             // trim so we're not messed up by trailing blanks
1043             line = line.trim();
1044 
1045             if (line.length() == 0 || line.startsWith("#")) {
1046                 continue;
1047             }
1048 
1049             // stop processing if this is the end-of-table marker.
1050             if (line.startsWith("--") && line.endsWith("--")) {
1051                 return;
1052             }
1053 
1054             // we allow either blanks or tabs as token delimiters.
1055             StringTokenizer tokenizer = new StringTokenizer(line, " \t");
1056 
1057             try {
1058                 String from = tokenizer.nextToken().toLowerCase();
1059                 String to = tokenizer.nextToken();
1060 
1061                 table.put(from, to);
1062             } catch (NoSuchElementException e) {
1063                 // just ignore the line if invalid.
1064             }
1065         }
1066     }
1067 
1068 
1069     /**
1070      * Perform RFC 2047 text folding on a string of text.
1071      *
1072      * @param used   The amount of text already "used up" on this line.  This is
1073      *               typically the length of the message header that this text
1074      *               is being added to.
1075      * @param s      The text to fold.
1076      *
1077      * @return The input text, with linebreaks inserted at appropriate fold points.
1078      */
1079     public static String fold(int used, String s) {
1080         // if folding is disable, unfolding is also.  Return the string unchanged.
1081         if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
1082             return s;
1083         }
1084 
1085         int end;
1086 
1087         // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs,
1088         // and line break characters.
1089         for (end = s.length() - 1; end >= 0; end--) {
1090             int ch = s.charAt(end);
1091             if (ch != ' ' && ch != '\t' ) {
1092                 break;
1093             }
1094         }
1095 
1096         // did we actually find something to remove?  Shorten the String to the trimmed length
1097         if (end != s.length() - 1) {
1098             s = s.substring(0, end + 1);
1099         }
1100 
1101         // does the string as it exists now not require folding?  We can just had that back right off.
1102         if (s.length() + used <= FOLD_THRESHOLD) {
1103             return s;
1104         }
1105 
1106         // get a buffer for the length of the string, plus room for a few line breaks.
1107         // these are soft line breaks, so we generally need more that just the line breaks (an escape +
1108         // CR + LF + leading space on next line);
1109         StringBuffer newString = new StringBuffer(s.length() + 8);
1110 
1111 
1112         // now keep chopping this down until we've accomplished what we need.
1113         while (used + s.length() > FOLD_THRESHOLD) {
1114             int breakPoint = -1;
1115             char breakChar = 0;
1116 
1117             // now scan for the next place where we can break.
1118             for (int i = 0; i < s.length(); i++) {
1119                 // have we passed the fold limit?
1120                 if (used + i > FOLD_THRESHOLD) {
1121                     // if we've already seen a blank, then stop now.  Otherwise
1122                     // we keep going until we hit a fold point.
1123                     if (breakPoint != -1) {
1124                         break;
1125                     }
1126                 }
1127                 char ch = s.charAt(i);
1128 
1129                 // a white space character?
1130                 if (ch == ' ' || ch == '\t') {
1131                     // this might be a run of white space, so skip over those now.
1132                     breakPoint = i;
1133                     // we need to maintain the same character type after the inserted linebreak.
1134                     breakChar = ch;
1135                     i++;
1136                     while (i < s.length()) {
1137                         ch = s.charAt(i);
1138                         if (ch != ' ' && ch != '\t') {
1139                             break;
1140                         }
1141                         i++;
1142                     }
1143                 }
1144                 // found an embedded new line.  Escape this so that the unfolding process preserves it.
1145                 else if (ch == '\n') {
1146                     newString.append('\\');
1147                     newString.append('\n');
1148                 }
1149                 else if (ch == '\r') {
1150                     newString.append('\\');
1151                     newString.append('\n');
1152                     i++;
1153                     // if this is a CRLF pair, add the second char also
1154                     if (i < s.length() && s.charAt(i) == '\n') {
1155                         newString.append('\r');
1156                     }
1157                 }
1158 
1159             }
1160             // no fold point found, we punt, append the remainder and leave.
1161             if (breakPoint == -1) {
1162                 newString.append(s);
1163                 return newString.toString();
1164             }
1165             newString.append(s.substring(0, breakPoint));
1166             newString.append("\r\n");
1167             newString.append(breakChar);
1168             // chop the string
1169             s = s.substring(breakPoint + 1);
1170             // start again, and we've used the first char of the limit already with the whitespace char.
1171             used = 1;
1172         }
1173 
1174         // add on the remainder, and return
1175         newString.append(s);
1176         return newString.toString();
1177     }
1178 
1179     /**
1180      * Unfold a folded string.  The unfolding process will remove
1181      * any line breaks that are not escaped and which are also followed
1182      * by whitespace characters.
1183      *
1184      * @param s      The folded string.
1185      *
1186      * @return A new string with unfolding rules applied.
1187      */
1188     public static String unfold(String s) {
1189         // if folding is disable, unfolding is also.  Return the string unchanged.
1190         if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
1191             return s;
1192         }
1193 
1194         // if there are no line break characters in the string, we can just return this.
1195         if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
1196             return s;
1197         }
1198 
1199         // we need to scan and fix things up.
1200         int length = s.length();
1201 
1202         StringBuffer newString = new StringBuffer(length);
1203 
1204         // scan the entire string
1205         for (int i = 0; i < length; i++) {
1206             char ch = s.charAt(i);
1207 
1208             // we have a backslash.  In folded strings, escape characters are only processed as such if
1209             // they preceed line breaks.  Otherwise, we leave it be.
1210             if (ch == '\\') {
1211                 // escape at the very end?  Just add the character.
1212                 if (i == length - 1) {
1213                     newString.append(ch);
1214                 }
1215                 else {
1216                     int nextChar = s.charAt(i + 1);
1217 
1218                     // naked newline?  Add the new line to the buffer, and skip the escape char.
1219                     if (nextChar == '\n') {
1220                         newString.append('\n');
1221                         i++;
1222                     }
1223                     else if (nextChar == '\r') {
1224                         // just the CR left?  Add it, removing the escape.
1225                         if (i == length - 2 || s.charAt(i + 2) != '\r') {
1226                             newString.append('\r');
1227                             i++;
1228                         }
1229                         else {
1230                             // toss the escape, add both parts of the CRLF, and skip over two chars.
1231                             newString.append('\r');
1232                             newString.append('\n');
1233                             i += 2;
1234                         }
1235                     }
1236                     else {
1237                         // an escape for another purpose, just copy it over.
1238                         newString.append(ch);
1239                     }
1240                 }
1241             }
1242             // we have an unescaped line break
1243             else if (ch == '\n' || ch == '\r') {
1244                 // remember the position in case we need to backtrack.
1245                 int lineBreak = i;
1246                 boolean CRLF = false;
1247 
1248                 if (ch == '\r') {
1249                     // check to see if we need to step over this.
1250                     if (i < length - 1 && s.charAt(i + 1) == '\n') {
1251                         i++;
1252                         // flag the type so we know what we might need to preserve.
1253                         CRLF = true;
1254                     }
1255                 }
1256 
1257                 // get a temp position scanner.
1258                 int scan = i + 1;
1259 
1260                 // does a blank follow this new line?  we need to scrap the new line and reduce the leading blanks
1261                 // down to a single blank.
1262                 if (scan < length && s.charAt(scan) == ' ') {
1263                     // add the character
1264                     newString.append(' ');
1265 
1266                     // scan over the rest of the blanks
1267                     i = scan + 1;
1268                     while (i < length && s.charAt(i) == ' ') {
1269                         i++;
1270                     }
1271                     // we'll increment down below, so back up to the last blank as the current char.
1272                     i--;
1273                 }
1274                 else {
1275                     // we must keep this line break.  Append the appropriate style.
1276                     if (CRLF) {
1277                         newString.append("\r\n");
1278                     }
1279                     else {
1280                         newString.append(ch);
1281                     }
1282                 }
1283             }
1284             else {
1285                 // just a normal, ordinary character
1286                 newString.append(ch);
1287             }
1288         }
1289         return newString.toString();
1290     }
1291 }
1292 
1293 
1294 /**
1295  * Utility class for examining content information written out
1296  * by a DataHandler object.  This stream gathers statistics on
1297  * the stream so it can make transfer encoding determinations.
1298  */
1299 class ContentCheckingOutputStream extends OutputStream {
1300     private int asciiChars = 0;
1301     private int nonAsciiChars = 0;
1302     private boolean containsLongLines = false;
1303     private boolean containsMalformedEOL = false;
1304     private int previousChar = 0;
1305     private int span = 0;
1306 
1307     ContentCheckingOutputStream() {
1308     }
1309 
1310     public void write(byte[] data) throws IOException {
1311         write(data, 0, data.length);
1312     }
1313 
1314     public void write(byte[] data, int offset, int length) throws IOException {
1315         for (int i = 0; i < length; i++) {
1316             write(data[offset + i]);
1317         }
1318     }
1319 
1320     public void write(int ch) {
1321         // we found a linebreak.  Reset the line length counters on either one.  We don't
1322         // really need to validate here.
1323         if (ch == '\n' || ch == '\r') {
1324             // we found a newline, this is only valid if the previous char was the '\r'
1325             if (ch == '\n') {
1326                 // malformed linebreak?  force this to base64 encoding.
1327                 if (previousChar != '\r') {
1328                     containsMalformedEOL = true;
1329                 }
1330             }
1331             // hit a line end, reset our line length counter
1332             span = 0;
1333         }
1334         else {
1335             span++;
1336             // the text has long lines, we can't transfer this as unencoded text.
1337             if (span > 998) {
1338                 containsLongLines = true;
1339             }
1340 
1341             // non-ascii character, we have to transfer this in binary.
1342             if (!ASCIIUtil.isAscii(ch)) {
1343                 nonAsciiChars++;
1344             }
1345             else {
1346                 asciiChars++;
1347             }
1348         }
1349         previousChar = ch;
1350     }
1351 
1352 
1353     public String getBinaryTransferEncoding() {
1354         if (nonAsciiChars != 0 || containsLongLines || containsMalformedEOL) {
1355             return "base64";
1356         }
1357         else {
1358             return "7bit";
1359         }
1360     }
1361 
1362     public String getTextTransferEncoding() {
1363         // looking good so far, only valid chars here.
1364         if (nonAsciiChars == 0) {
1365             // does this contain long text lines?  We need to use a Q-P encoding which will
1366             // be only slightly longer, but handles folding the longer lines.
1367             if (containsLongLines) {
1368                 return "quoted-printable";
1369             }
1370             else {
1371                 // ideal!  Easiest one to handle.
1372                 return "7bit";
1373             }
1374         }
1375         else {
1376             // mostly characters requiring encoding?  Base64 is our best bet.
1377             if (nonAsciiChars > asciiChars) {
1378                 return "base64";
1379             }
1380             else {
1381                 // Q-P encoding will use fewer bytes than the full Base64.
1382                 return "quoted-printable";
1383             }
1384         }
1385     }
1386 }