001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019 020 package javax.mail.internet; 021 022 import java.io.BufferedInputStream; 023 import java.io.BufferedReader; 024 import java.io.ByteArrayInputStream; 025 import java.io.ByteArrayOutputStream; 026 import java.io.IOException; 027 import java.io.InputStream; 028 import java.io.InputStreamReader; 029 import java.io.OutputStream; 030 import java.io.UnsupportedEncodingException; 031 import java.util.HashMap; 032 import java.util.Map; 033 import java.util.NoSuchElementException; 034 import java.util.StringTokenizer; 035 036 import javax.activation.DataHandler; 037 import javax.activation.DataSource; 038 import javax.mail.MessagingException; 039 040 import org.apache.geronimo.mail.util.ASCIIUtil; 041 import org.apache.geronimo.mail.util.Base64; 042 import org.apache.geronimo.mail.util.Base64DecoderStream; 043 import org.apache.geronimo.mail.util.Base64Encoder; 044 import org.apache.geronimo.mail.util.Base64EncoderStream; 045 import org.apache.geronimo.mail.util.QuotedPrintableDecoderStream; 046 import org.apache.geronimo.mail.util.QuotedPrintableEncoderStream; 047 import org.apache.geronimo.mail.util.QuotedPrintableEncoder; 048 import org.apache.geronimo.mail.util.QuotedPrintable; 049 import org.apache.geronimo.mail.util.SessionUtil; 050 import org.apache.geronimo.mail.util.UUDecoderStream; 051 import org.apache.geronimo.mail.util.UUEncoderStream; 052 053 // encodings include "base64", "quoted-printable", "7bit", "8bit" and "binary". 054 // In addition, "uuencode" is also supported. The 055 056 /** 057 * @version $Rev: 627556 $ $Date: 2008-02-13 13:27:22 -0500 (Wed, 13 Feb 2008) $ 058 */ 059 public class MimeUtility { 060 061 private static final String MIME_FOLDENCODEDWORDS = "mail.mime.foldencodedwords"; 062 private static final String MIME_DECODE_TEXT_STRICT = "mail.mime.decodetext.strict"; 063 private static final String MIME_FOLDTEXT = "mail.mime.foldtext"; 064 private static final int FOLD_THRESHOLD = 76; 065 066 private MimeUtility() { 067 } 068 069 public static final int ALL = -1; 070 071 private static String defaultJavaCharset; 072 private static String escapedChars = "\"\\\r\n"; 073 private static String linearWhiteSpace = " \t\r\n"; 074 075 private static String QP_WORD_SPECIALS = "=_?\"#$%&'(),.:;<>@[\\]^`{|}~"; 076 private static String QP_TEXT_SPECIALS = "=_?"; 077 078 // the javamail spec includes the ability to map java encoding names to MIME-specified names. Normally, 079 // these values are loaded from a character mapping file. 080 private static Map java2mime; 081 private static Map mime2java; 082 083 static { 084 // we need to load the mapping tables used by javaCharset() and mimeCharset(). 085 loadCharacterSetMappings(); 086 } 087 088 public static InputStream decode(InputStream in, String encoding) throws MessagingException { 089 encoding = encoding.toLowerCase(); 090 091 // some encodies are just pass-throughs, with no real decoding. 092 if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) { 093 return in; 094 } 095 else if (encoding.equals("base64")) { 096 return new Base64DecoderStream(in); 097 } 098 // UUEncode is known by a couple historical extension names too. 099 else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) { 100 return new UUDecoderStream(in); 101 } 102 else if (encoding.equals("quoted-printable")) { 103 return new QuotedPrintableDecoderStream(in); 104 } 105 else { 106 throw new MessagingException("Unknown encoding " + encoding); 107 } 108 } 109 110 /** 111 * Decode a string of text obtained from a mail header into 112 * it's proper form. The text generally will consist of a 113 * string of tokens, some of which may be encoded using 114 * base64 encoding. 115 * 116 * @param text The text to decode. 117 * 118 * @return The decoded test string. 119 * @exception UnsupportedEncodingException 120 */ 121 public static String decodeText(String text) throws UnsupportedEncodingException { 122 // if the text contains any encoded tokens, those tokens will be marked with "=?". If the 123 // source string doesn't contain that sequent, no decoding is required. 124 if (text.indexOf("=?") < 0) { 125 return text; 126 } 127 128 // we have two sets of rules we can apply. 129 if (!SessionUtil.getBooleanProperty(MIME_DECODE_TEXT_STRICT, true)) { 130 return decodeTextNonStrict(text); 131 } 132 133 int offset = 0; 134 int endOffset = text.length(); 135 136 int startWhiteSpace = -1; 137 int endWhiteSpace = -1; 138 139 StringBuffer decodedText = new StringBuffer(text.length()); 140 141 boolean previousTokenEncoded = false; 142 143 while (offset < endOffset) { 144 char ch = text.charAt(offset); 145 146 // is this a whitespace character? 147 if (linearWhiteSpace.indexOf(ch) != -1) { 148 startWhiteSpace = offset; 149 while (offset < endOffset) { 150 // step over the white space characters. 151 ch = text.charAt(offset); 152 if (linearWhiteSpace.indexOf(ch) != -1) { 153 offset++; 154 } 155 else { 156 // record the location of the first non lwsp and drop down to process the 157 // token characters. 158 endWhiteSpace = offset; 159 break; 160 } 161 } 162 } 163 else { 164 // we have a word token. We need to scan over the word and then try to parse it. 165 int wordStart = offset; 166 167 while (offset < endOffset) { 168 // step over the white space characters. 169 ch = text.charAt(offset); 170 if (linearWhiteSpace.indexOf(ch) == -1) { 171 offset++; 172 } 173 else { 174 break; 175 } 176 177 //NB: Trailing whitespace on these header strings will just be discarded. 178 } 179 // pull out the word token. 180 String word = text.substring(wordStart, offset); 181 // is the token encoded? decode the word 182 if (word.startsWith("=?")) { 183 try { 184 // if this gives a parsing failure, treat it like a non-encoded word. 185 String decodedWord = decodeWord(word); 186 187 // are any whitespace characters significant? Append 'em if we've got 'em. 188 if (!previousTokenEncoded) { 189 if (startWhiteSpace != -1) { 190 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 191 startWhiteSpace = -1; 192 } 193 } 194 // this is definitely a decoded token. 195 previousTokenEncoded = true; 196 // and add this to the text. 197 decodedText.append(decodedWord); 198 // we continue parsing from here...we allow parsing errors to fall through 199 // and get handled as normal text. 200 continue; 201 202 } catch (ParseException e) { 203 } 204 } 205 // this is a normal token, so it doesn't matter what the previous token was. Add the white space 206 // if we have it. 207 if (startWhiteSpace != -1) { 208 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 209 startWhiteSpace = -1; 210 } 211 // this is not a decoded token. 212 previousTokenEncoded = false; 213 decodedText.append(word); 214 } 215 } 216 217 return decodedText.toString(); 218 } 219 220 221 /** 222 * Decode a string of text obtained from a mail header into 223 * it's proper form. The text generally will consist of a 224 * string of tokens, some of which may be encoded using 225 * base64 encoding. This is for non-strict decoded for mailers that 226 * violate the RFC 2047 restriction that decoded tokens must be delimited 227 * by linear white space. This will scan tokens looking for inner tokens 228 * enclosed in "=?" -- "?=" pairs. 229 * 230 * @param text The text to decode. 231 * 232 * @return The decoded test string. 233 * @exception UnsupportedEncodingException 234 */ 235 private static String decodeTextNonStrict(String text) throws UnsupportedEncodingException { 236 int offset = 0; 237 int endOffset = text.length(); 238 239 int startWhiteSpace = -1; 240 int endWhiteSpace = -1; 241 242 StringBuffer decodedText = new StringBuffer(text.length()); 243 244 boolean previousTokenEncoded = false; 245 246 while (offset < endOffset) { 247 char ch = text.charAt(offset); 248 249 // is this a whitespace character? 250 if (linearWhiteSpace.indexOf(ch) != -1) { 251 startWhiteSpace = offset; 252 while (offset < endOffset) { 253 // step over the white space characters. 254 ch = text.charAt(offset); 255 if (linearWhiteSpace.indexOf(ch) != -1) { 256 offset++; 257 } 258 else { 259 // record the location of the first non lwsp and drop down to process the 260 // token characters. 261 endWhiteSpace = offset; 262 break; 263 } 264 } 265 } 266 else { 267 // we're at the start of a word token. We potentially need to break this up into subtokens 268 int wordStart = offset; 269 270 while (offset < endOffset) { 271 // step over the white space characters. 272 ch = text.charAt(offset); 273 if (linearWhiteSpace.indexOf(ch) == -1) { 274 offset++; 275 } 276 else { 277 break; 278 } 279 280 //NB: Trailing whitespace on these header strings will just be discarded. 281 } 282 // pull out the word token. 283 String word = text.substring(wordStart, offset); 284 285 int decodeStart = 0; 286 287 // now scan and process each of the bits within here. 288 while (decodeStart < word.length()) { 289 int tokenStart = word.indexOf("=?", decodeStart); 290 if (tokenStart == -1) { 291 // this is a normal token, so it doesn't matter what the previous token was. Add the white space 292 // if we have it. 293 if (startWhiteSpace != -1) { 294 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 295 startWhiteSpace = -1; 296 } 297 // this is not a decoded token. 298 previousTokenEncoded = false; 299 decodedText.append(word.substring(decodeStart)); 300 // we're finished. 301 break; 302 } 303 // we have something to process 304 else { 305 // we might have a normal token preceeding this. 306 if (tokenStart != decodeStart) { 307 // this is a normal token, so it doesn't matter what the previous token was. Add the white space 308 // if we have it. 309 if (startWhiteSpace != -1) { 310 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 311 startWhiteSpace = -1; 312 } 313 // this is not a decoded token. 314 previousTokenEncoded = false; 315 decodedText.append(word.substring(decodeStart, tokenStart)); 316 } 317 318 // now find the end marker. 319 int tokenEnd = word.indexOf("?=", tokenStart); 320 // sigh, an invalid token. Treat this as plain text. 321 if (tokenEnd == -1) { 322 // this is a normal token, so it doesn't matter what the previous token was. Add the white space 323 // if we have it. 324 if (startWhiteSpace != -1) { 325 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 326 startWhiteSpace = -1; 327 } 328 // this is not a decoded token. 329 previousTokenEncoded = false; 330 decodedText.append(word.substring(tokenStart)); 331 // we're finished. 332 break; 333 } 334 else { 335 // update our ticker 336 decodeStart = tokenEnd + 2; 337 338 String token = word.substring(tokenStart, tokenEnd); 339 try { 340 // if this gives a parsing failure, treat it like a non-encoded word. 341 String decodedWord = decodeWord(token); 342 343 // are any whitespace characters significant? Append 'em if we've got 'em. 344 if (!previousTokenEncoded) { 345 if (startWhiteSpace != -1) { 346 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 347 startWhiteSpace = -1; 348 } 349 } 350 // this is definitely a decoded token. 351 previousTokenEncoded = true; 352 // and add this to the text. 353 decodedText.append(decodedWord); 354 // we continue parsing from here...we allow parsing errors to fall through 355 // and get handled as normal text. 356 continue; 357 358 } catch (ParseException e) { 359 } 360 // this is a normal token, so it doesn't matter what the previous token was. Add the white space 361 // if we have it. 362 if (startWhiteSpace != -1) { 363 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 364 startWhiteSpace = -1; 365 } 366 // this is not a decoded token. 367 previousTokenEncoded = false; 368 decodedText.append(token); 369 } 370 } 371 } 372 } 373 } 374 375 return decodedText.toString(); 376 } 377 378 /** 379 * Parse a string using the RFC 2047 rules for an "encoded-word" 380 * type. This encoding has the syntax: 381 * 382 * encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" 383 * 384 * @param word The possibly encoded word value. 385 * 386 * @return The decoded word. 387 * @exception ParseException 388 * @exception UnsupportedEncodingException 389 */ 390 public static String decodeWord(String word) throws ParseException, UnsupportedEncodingException { 391 // encoded words start with the characters "=?". If this not an encoded word, we throw a 392 // ParseException for the caller. 393 394 if (!word.startsWith("=?")) { 395 throw new ParseException("Invalid RFC 2047 encoded-word: " + word); 396 } 397 398 int charsetPos = word.indexOf('?', 2); 399 if (charsetPos == -1) { 400 throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word); 401 } 402 403 // pull out the character set information (this is the MIME name at this point). 404 String charset = word.substring(2, charsetPos).toLowerCase(); 405 406 // now pull out the encoding token the same way. 407 int encodingPos = word.indexOf('?', charsetPos + 1); 408 if (encodingPos == -1) { 409 throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word); 410 } 411 412 String encoding = word.substring(charsetPos + 1, encodingPos); 413 414 // and finally the encoded text. 415 int encodedTextPos = word.indexOf("?=", encodingPos + 1); 416 if (encodedTextPos == -1) { 417 throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word); 418 } 419 420 String encodedText = word.substring(encodingPos + 1, encodedTextPos); 421 422 // seems a bit silly to encode a null string, but easy to deal with. 423 if (encodedText.length() == 0) { 424 return ""; 425 } 426 427 try { 428 // the decoder writes directly to an output stream. 429 ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length()); 430 431 byte[] encodedData = encodedText.getBytes("US-ASCII"); 432 433 // Base64 encoded? 434 if (encoding.equals("B")) { 435 Base64.decode(encodedData, out); 436 } 437 // maybe quoted printable. 438 else if (encoding.equals("Q")) { 439 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder(); 440 dataEncoder.decodeWord(encodedData, out); 441 } 442 else { 443 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding); 444 } 445 // get the decoded byte data and convert into a string. 446 byte[] decodedData = out.toByteArray(); 447 return new String(decodedData, javaCharset(charset)); 448 } catch (IOException e) { 449 throw new UnsupportedEncodingException("Invalid RFC 2047 encoding"); 450 } 451 452 } 453 454 /** 455 * Wrap an encoder around a given output stream. 456 * 457 * @param out The output stream to wrap. 458 * @param encoding The name of the encoding. 459 * 460 * @return A instance of FilterOutputStream that manages on the fly 461 * encoding for the requested encoding type. 462 * @exception MessagingException 463 */ 464 public static OutputStream encode(OutputStream out, String encoding) throws MessagingException { 465 // no encoding specified, so assume it goes out unchanged. 466 if (encoding == null) { 467 return out; 468 } 469 470 encoding = encoding.toLowerCase(); 471 472 // some encodies are just pass-throughs, with no real decoding. 473 if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) { 474 return out; 475 } 476 else if (encoding.equals("base64")) { 477 return new Base64EncoderStream(out); 478 } 479 // UUEncode is known by a couple historical extension names too. 480 else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) { 481 return new UUEncoderStream(out); 482 } 483 else if (encoding.equals("quoted-printable")) { 484 return new QuotedPrintableEncoderStream(out); 485 } 486 else { 487 throw new MessagingException("Unknown encoding " + encoding); 488 } 489 } 490 491 /** 492 * Wrap an encoder around a given output stream. 493 * 494 * @param out The output stream to wrap. 495 * @param encoding The name of the encoding. 496 * @param filename The filename of the data being sent (only used for UUEncode). 497 * 498 * @return A instance of FilterOutputStream that manages on the fly 499 * encoding for the requested encoding type. 500 * @exception MessagingException 501 */ 502 public static OutputStream encode(OutputStream out, String encoding, String filename) throws MessagingException { 503 encoding = encoding.toLowerCase(); 504 505 // some encodies are just pass-throughs, with no real decoding. 506 if (encoding.equals("binary") || encoding.equals("7bit") || encoding.equals("8bit")) { 507 return out; 508 } 509 else if (encoding.equals("base64")) { 510 return new Base64EncoderStream(out); 511 } 512 // UUEncode is known by a couple historical extension names too. 513 else if (encoding.equals("uuencode") || encoding.equals("x-uuencode") || encoding.equals("x-uue")) { 514 return new UUEncoderStream(out, filename); 515 } 516 else if (encoding.equals("quoted-printable")) { 517 return new QuotedPrintableEncoderStream(out); 518 } 519 else { 520 throw new MessagingException("Unknown encoding " + encoding); 521 } 522 } 523 524 525 public static String encodeText(String word) throws UnsupportedEncodingException { 526 return encodeText(word, null, null); 527 } 528 529 public static String encodeText(String word, String charset, String encoding) throws UnsupportedEncodingException { 530 return encodeWord(word, charset, encoding, false); 531 } 532 533 public static String encodeWord(String word) throws UnsupportedEncodingException { 534 return encodeWord(word, null, null); 535 } 536 537 public static String encodeWord(String word, String charset, String encoding) throws UnsupportedEncodingException { 538 return encodeWord(word, charset, encoding, true); 539 } 540 541 542 private static String encodeWord(String word, String charset, String encoding, boolean encodingWord) throws UnsupportedEncodingException { 543 544 // figure out what we need to encode this. 545 String encoder = ASCIIUtil.getTextTransferEncoding(word); 546 // all ascii? We can return this directly, 547 if (encoder.equals("7bit")) { 548 return word; 549 } 550 551 // if not given a charset, use the default. 552 if (charset == null) { 553 charset = getDefaultMIMECharset(); 554 } 555 556 // sort out the encoder. If not explicitly given, use the best guess we've already established. 557 if (encoding != null) { 558 if (encoding.equalsIgnoreCase("B")) { 559 encoder = "base64"; 560 } 561 else if (encoding.equalsIgnoreCase("Q")) { 562 encoder = "quoted-printable"; 563 } 564 else { 565 throw new UnsupportedEncodingException("Unknown transfer encoding: " + encoding); 566 } 567 } 568 569 try { 570 571 // we'll format this directly into the string buffer 572 StringBuffer result = new StringBuffer(); 573 574 // this is the maximum size of a segment of encoded data, which is based off 575 // of a 75 character size limit and all of the encoding overhead elements. 576 int sizeLimit = 75 - 7 - charset.length(); 577 578 // now do the appropriate encoding work 579 if (encoder.equals("base64")) { 580 Base64Encoder dataEncoder = new Base64Encoder(); 581 // this may recurse on the encoding if the string is too long. The left-most will not 582 // get a segment delimiter 583 encodeBase64(word, result, sizeLimit, charset, dataEncoder, true, SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false)); 584 } 585 else { 586 QuotedPrintableEncoder dataEncoder = new QuotedPrintableEncoder(); 587 encodeQuotedPrintable(word, result, sizeLimit, charset, dataEncoder, true, 588 SessionUtil.getBooleanProperty(MIME_FOLDENCODEDWORDS, false), encodingWord ? QP_WORD_SPECIALS : QP_TEXT_SPECIALS); 589 } 590 return result.toString(); 591 } catch (IOException e) { 592 throw new UnsupportedEncodingException("Invalid encoding"); 593 } 594 } 595 596 597 /** 598 * Encode a string into base64 encoding, taking into 599 * account the maximum segment length. 600 * 601 * @param data The string data to encode. 602 * @param out The output buffer used for the result. 603 * @param sizeLimit The maximum amount of encoded data we're allowed 604 * to have in a single encoded segment. 605 * @param charset The character set marker that needs to be added to the 606 * encoding header. 607 * @param encoder The encoder instance we're using. 608 * @param firstSegment 609 * If true, this is the first (left-most) segment in the 610 * data. Used to determine if segment delimiters need to 611 * be added between sections. 612 * @param foldSegments 613 * Indicates the type of delimiter to use (blank or newline sequence). 614 */ 615 static private void encodeBase64(String data, StringBuffer out, int sizeLimit, String charset, Base64Encoder encoder, boolean firstSegment, boolean foldSegments) throws IOException 616 { 617 // this needs to be converted into the appropriate transfer encoding. 618 byte [] bytes = data.getBytes(javaCharset(charset)); 619 620 int estimatedSize = encoder.estimateEncodedLength(bytes); 621 622 // if the estimated encoding size is over our segment limit, split the string in half and 623 // recurse. Eventually we'll reach a point where things are small enough. 624 if (estimatedSize > sizeLimit) { 625 // the first segment indicator travels with the left half. 626 encodeBase64(data.substring(0, data.length() / 2), out, sizeLimit, charset, encoder, firstSegment, foldSegments); 627 // the second half can never be the first segment 628 encodeBase64(data.substring(data.length() / 2), out, sizeLimit, charset, encoder, false, foldSegments); 629 } 630 else 631 { 632 // if this is not the first sement of the encoding, we need to add either a blank or 633 // a newline sequence to the data 634 if (!firstSegment) { 635 if (foldSegments) { 636 out.append("\r\n"); 637 } 638 else { 639 out.append(' '); 640 } 641 } 642 // do the encoding of the segment. 643 encoder.encodeWord(bytes, out, charset); 644 } 645 } 646 647 648 /** 649 * Encode a string into quoted printable encoding, taking into 650 * account the maximum segment length. 651 * 652 * @param data The string data to encode. 653 * @param out The output buffer used for the result. 654 * @param sizeLimit The maximum amount of encoded data we're allowed 655 * to have in a single encoded segment. 656 * @param charset The character set marker that needs to be added to the 657 * encoding header. 658 * @param encoder The encoder instance we're using. 659 * @param firstSegment 660 * If true, this is the first (left-most) segment in the 661 * data. Used to determine if segment delimiters need to 662 * be added between sections. 663 * @param foldSegments 664 * Indicates the type of delimiter to use (blank or newline sequence). 665 */ 666 static private void encodeQuotedPrintable(String data, StringBuffer out, int sizeLimit, String charset, QuotedPrintableEncoder encoder, 667 boolean firstSegment, boolean foldSegments, String specials) throws IOException 668 { 669 // this needs to be converted into the appropriate transfer encoding. 670 byte [] bytes = data.getBytes(javaCharset(charset)); 671 672 int estimatedSize = encoder.estimateEncodedLength(bytes, specials); 673 674 // if the estimated encoding size is over our segment limit, split the string in half and 675 // recurse. Eventually we'll reach a point where things are small enough. 676 if (estimatedSize > sizeLimit) { 677 // the first segment indicator travels with the left half. 678 encodeQuotedPrintable(data.substring(0, data.length() / 2), out, sizeLimit, charset, encoder, firstSegment, foldSegments, specials); 679 // the second half can never be the first segment 680 encodeQuotedPrintable(data.substring(data.length() / 2), out, sizeLimit, charset, encoder, false, foldSegments, specials); 681 } 682 else 683 { 684 // if this is not the first sement of the encoding, we need to add either a blank or 685 // a newline sequence to the data 686 if (!firstSegment) { 687 if (foldSegments) { 688 out.append("\r\n"); 689 } 690 else { 691 out.append(' '); 692 } 693 } 694 // do the encoding of the segment. 695 encoder.encodeWord(bytes, out, charset, specials); 696 } 697 } 698 699 700 /** 701 * Examine the content of a data source and decide what type 702 * of transfer encoding should be used. For text streams, 703 * we'll decided between 7bit, quoted-printable, and base64. 704 * For binary content types, we'll use either 7bit or base64. 705 * 706 * @param handler The DataHandler associated with the content. 707 * 708 * @return The string name of an encoding used to transfer the content. 709 */ 710 public static String getEncoding(DataHandler handler) { 711 712 713 // if this handler has an associated data source, we can read directly from the 714 // data source to make this judgment. This is generally MUCH faster than asking the 715 // DataHandler to write out the data for us. 716 DataSource ds = handler.getDataSource(); 717 if (ds != null) { 718 return getEncoding(ds); 719 } 720 721 try { 722 // get a parser that allows us to make comparisons. 723 ContentType content = new ContentType(ds.getContentType()); 724 725 // The only access to the content bytes at this point is by asking the handler to write 726 // the information out to a stream. We're going to pipe this through a special stream 727 // that examines the bytes as they go by. 728 ContentCheckingOutputStream checker = new ContentCheckingOutputStream(); 729 730 handler.writeTo(checker); 731 732 // figure this out based on whether we believe this to be a text type or not. 733 if (content.match("text/*")) { 734 return checker.getTextTransferEncoding(); 735 } 736 else { 737 return checker.getBinaryTransferEncoding(); 738 } 739 740 } catch (Exception e) { 741 // any unexpected I/O exceptions we'll force to a "safe" fallback position. 742 return "base64"; 743 } 744 } 745 746 747 /** 748 * Determine the what transfer encoding should be used for 749 * data retrieved from a DataSource. 750 * 751 * @param source The DataSource for the transmitted data. 752 * 753 * @return The string name of the encoding form that should be used for 754 * the data. 755 */ 756 public static String getEncoding(DataSource source) { 757 InputStream in = null; 758 759 try { 760 // get a parser that allows us to make comparisons. 761 ContentType content = new ContentType(source.getContentType()); 762 763 // we're probably going to have to scan the data. 764 in = source.getInputStream(); 765 766 if (!content.match("text/*")) { 767 // Not purporting to be a text type? Examine the content to see we might be able to 768 // at least pretend it is an ascii type. 769 return ASCIIUtil.getBinaryTransferEncoding(in); 770 } 771 else { 772 return ASCIIUtil.getTextTransferEncoding(in); 773 } 774 } catch (Exception e) { 775 // this was a problem...not sure what makes sense here, so we'll assume it's binary 776 // and we need to transfer this using Base64 encoding. 777 return "base64"; 778 } finally { 779 // make sure we close the stream 780 try { 781 if (in != null) { 782 in.close(); 783 } 784 } catch (IOException e) { 785 } 786 } 787 } 788 789 790 /** 791 * Quote a "word" value. If the word contains any character from 792 * the specified "specials" list, this value is returned as a 793 * quoted strong. Otherwise, it is returned unchanged (an "atom"). 794 * 795 * @param word The word requiring quoting. 796 * @param specials The set of special characters that can't appear in an unquoted 797 * string. 798 * 799 * @return The quoted value. This will be unchanged if the word doesn't contain 800 * any of the designated special characters. 801 */ 802 public static String quote(String word, String specials) { 803 int wordLength = word.length(); 804 boolean requiresQuoting = false; 805 // scan the string looking for problem characters 806 for (int i =0; i < wordLength; i++) { 807 char ch = word.charAt(i); 808 // special escaped characters require escaping, which also implies quoting. 809 if (escapedChars.indexOf(ch) >= 0) { 810 return quoteAndEscapeString(word); 811 } 812 // now check for control characters or the designated special characters. 813 if (ch < 32 || ch >= 127 || specials.indexOf(ch) >= 0) { 814 // we know this requires quoting, but we still need to scan the entire string to 815 // see if contains chars that require escaping. Just go ahead and treat it as if it does. 816 return quoteAndEscapeString(word); 817 } 818 } 819 return word; 820 } 821 822 /** 823 * Take a string and return it as a formatted quoted string, with 824 * all characters requiring escaping handled properly. 825 * 826 * @param word The string to quote. 827 * 828 * @return The quoted string. 829 */ 830 private static String quoteAndEscapeString(String word) { 831 int wordLength = word.length(); 832 // allocate at least enough for the string and two quotes plus a reasonable number of escaped chars. 833 StringBuffer buffer = new StringBuffer(wordLength + 10); 834 // add the leading quote. 835 buffer.append('"'); 836 837 for (int i = 0; i < wordLength; i++) { 838 char ch = word.charAt(i); 839 // is this an escaped char? 840 if (escapedChars.indexOf(ch) >= 0) { 841 // add the escape marker before appending. 842 buffer.append('\\'); 843 } 844 buffer.append(ch); 845 } 846 // now the closing quote 847 buffer.append('"'); 848 return buffer.toString(); 849 } 850 851 /** 852 * Translate a MIME standard character set name into the Java 853 * equivalent. 854 * 855 * @param charset The MIME standard name. 856 * 857 * @return The Java equivalent for this name. 858 */ 859 public static String javaCharset(String charset) { 860 // nothing in, nothing out. 861 if (charset == null) { 862 return null; 863 } 864 865 String mappedCharset = (String)mime2java.get(charset.toLowerCase()); 866 // if there is no mapping, then the original name is used. Many of the MIME character set 867 // names map directly back into Java. The reverse isn't necessarily true. 868 return mappedCharset == null ? charset : mappedCharset; 869 } 870 871 /** 872 * Map a Java character set name into the MIME equivalent. 873 * 874 * @param charset The java character set name. 875 * 876 * @return The MIME standard equivalent for this character set name. 877 */ 878 public static String mimeCharset(String charset) { 879 // nothing in, nothing out. 880 if (charset == null) { 881 return null; 882 } 883 884 String mappedCharset = (String)java2mime.get(charset.toLowerCase()); 885 // if there is no mapping, then the original name is used. Many of the MIME character set 886 // names map directly back into Java. The reverse isn't necessarily true. 887 return mappedCharset == null ? charset : mappedCharset; 888 } 889 890 891 /** 892 * Get the default character set to use, in Java name format. 893 * This either be the value set with the mail.mime.charset 894 * system property or obtained from the file.encoding system 895 * property. If neither of these is set, we fall back to 896 * 8859_1 (basically US-ASCII). 897 * 898 * @return The character string value of the default character set. 899 */ 900 public static String getDefaultJavaCharset() { 901 String charset = SessionUtil.getProperty("mail.mime.charset"); 902 if (charset != null) { 903 return javaCharset(charset); 904 } 905 return SessionUtil.getProperty("file.encoding", "8859_1"); 906 } 907 908 /** 909 * Get the default character set to use, in MIME name format. 910 * This either be the value set with the mail.mime.charset 911 * system property or obtained from the file.encoding system 912 * property. If neither of these is set, we fall back to 913 * 8859_1 (basically US-ASCII). 914 * 915 * @return The character string value of the default character set. 916 */ 917 static String getDefaultMIMECharset() { 918 // if the property is specified, this can be used directly. 919 String charset = SessionUtil.getProperty("mail.mime.charset"); 920 if (charset != null) { 921 return charset; 922 } 923 924 // get the Java-defined default and map back to a MIME name. 925 return mimeCharset(SessionUtil.getProperty("file.encoding", "8859_1")); 926 } 927 928 929 /** 930 * Load the default mapping tables used by the javaCharset() 931 * and mimeCharset() methods. By default, these tables are 932 * loaded from the /META-INF/javamail.charset.map file. If 933 * something goes wrong loading that file, we configure things 934 * with a default mapping table (which just happens to mimic 935 * what's in the default mapping file). 936 */ 937 static private void loadCharacterSetMappings() { 938 java2mime = new HashMap(); 939 mime2java = new HashMap(); 940 941 942 // normally, these come from a character map file contained in the jar file. 943 try { 944 InputStream map = javax.mail.internet.MimeUtility.class.getResourceAsStream("/META-INF/javamail.charset.map"); 945 946 if (map != null) { 947 // get a reader for this so we can load. 948 BufferedReader reader = new BufferedReader(new InputStreamReader(map)); 949 950 readMappings(reader, java2mime); 951 readMappings(reader, mime2java); 952 } 953 } catch (Exception e) { 954 } 955 956 // if any sort of error occurred reading the preferred file version, we could end up with empty 957 // mapping tables. This could cause all sorts of difficulty, so ensure they are populated with at 958 // least a reasonable set of defaults. 959 960 // these mappings echo what's in the default file. 961 if (java2mime.isEmpty()) { 962 java2mime.put("8859_1", "ISO-8859-1"); 963 java2mime.put("iso8859_1", "ISO-8859-1"); 964 java2mime.put("iso8859-1", "ISO-8859-1"); 965 966 java2mime.put("8859_2", "ISO-8859-2"); 967 java2mime.put("iso8859_2", "ISO-8859-2"); 968 java2mime.put("iso8859-2", "ISO-8859-2"); 969 970 java2mime.put("8859_3", "ISO-8859-3"); 971 java2mime.put("iso8859_3", "ISO-8859-3"); 972 java2mime.put("iso8859-3", "ISO-8859-3"); 973 974 java2mime.put("8859_4", "ISO-8859-4"); 975 java2mime.put("iso8859_4", "ISO-8859-4"); 976 java2mime.put("iso8859-4", "ISO-8859-4"); 977 978 java2mime.put("8859_5", "ISO-8859-5"); 979 java2mime.put("iso8859_5", "ISO-8859-5"); 980 java2mime.put("iso8859-5", "ISO-8859-5"); 981 982 java2mime.put ("8859_6", "ISO-8859-6"); 983 java2mime.put("iso8859_6", "ISO-8859-6"); 984 java2mime.put("iso8859-6", "ISO-8859-6"); 985 986 java2mime.put("8859_7", "ISO-8859-7"); 987 java2mime.put("iso8859_7", "ISO-8859-7"); 988 java2mime.put("iso8859-7", "ISO-8859-7"); 989 990 java2mime.put("8859_8", "ISO-8859-8"); 991 java2mime.put("iso8859_8", "ISO-8859-8"); 992 java2mime.put("iso8859-8", "ISO-8859-8"); 993 994 java2mime.put("8859_9", "ISO-8859-9"); 995 java2mime.put("iso8859_9", "ISO-8859-9"); 996 java2mime.put("iso8859-9", "ISO-8859-9"); 997 998 java2mime.put("sjis", "Shift_JIS"); 999 java2mime.put ("jis", "ISO-2022-JP"); 1000 java2mime.put("iso2022jp", "ISO-2022-JP"); 1001 java2mime.put("euc_jp", "euc-jp"); 1002 java2mime.put("koi8_r", "koi8-r"); 1003 java2mime.put("euc_cn", "euc-cn"); 1004 java2mime.put("euc_tw", "euc-tw"); 1005 java2mime.put("euc_kr", "euc-kr"); 1006 } 1007 1008 if (mime2java.isEmpty ()) { 1009 mime2java.put("iso-2022-cn", "ISO2022CN"); 1010 mime2java.put("iso-2022-kr", "ISO2022KR"); 1011 mime2java.put("utf-8", "UTF8"); 1012 mime2java.put("utf8", "UTF8"); 1013 mime2java.put("ja_jp.iso2022-7", "ISO2022JP"); 1014 mime2java.put("ja_jp.eucjp", "EUCJIS"); 1015 mime2java.put ("euc-kr", "KSC5601"); 1016 mime2java.put("euckr", "KSC5601"); 1017 mime2java.put("us-ascii", "ISO-8859-1"); 1018 mime2java.put("x-us-ascii", "ISO-8859-1"); 1019 } 1020 } 1021 1022 1023 /** 1024 * Read a section of a character map table and populate the 1025 * target mapping table with the information. The table end 1026 * is marked by a line starting with "--" and also ending with 1027 * "--". Blank lines and comment lines (beginning with '#') are 1028 * ignored. 1029 * 1030 * @param reader The source of the file information. 1031 * @param table The mapping table used to store the information. 1032 */ 1033 static private void readMappings(BufferedReader reader, Map table) throws IOException { 1034 // process lines to the EOF or the end of table marker. 1035 while (true) { 1036 String line = reader.readLine(); 1037 // no line returned is an EOF 1038 if (line == null) { 1039 return; 1040 } 1041 1042 // trim so we're not messed up by trailing blanks 1043 line = line.trim(); 1044 1045 if (line.length() == 0 || line.startsWith("#")) { 1046 continue; 1047 } 1048 1049 // stop processing if this is the end-of-table marker. 1050 if (line.startsWith("--") && line.endsWith("--")) { 1051 return; 1052 } 1053 1054 // we allow either blanks or tabs as token delimiters. 1055 StringTokenizer tokenizer = new StringTokenizer(line, " \t"); 1056 1057 try { 1058 String from = tokenizer.nextToken().toLowerCase(); 1059 String to = tokenizer.nextToken(); 1060 1061 table.put(from, to); 1062 } catch (NoSuchElementException e) { 1063 // just ignore the line if invalid. 1064 } 1065 } 1066 } 1067 1068 1069 /** 1070 * Perform RFC 2047 text folding on a string of text. 1071 * 1072 * @param used The amount of text already "used up" on this line. This is 1073 * typically the length of a message header that this text 1074 * get getting added to. 1075 * @param s The text to fold. 1076 * 1077 * @return The input text, with linebreaks inserted at appropriate fold points. 1078 */ 1079 public static String fold(int used, String s) { 1080 // if folding is disable, unfolding is also. Return the string unchanged. 1081 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) { 1082 return s; 1083 } 1084 1085 int end; 1086 1087 // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs, 1088 // and line break characters. 1089 for (end = s.length() - 1; end >= 0; end--) { 1090 int ch = s.charAt(end); 1091 if (ch != ' ' && ch != '\t' ) { 1092 break; 1093 } 1094 } 1095 1096 // did we actually find something to remove? Shorten the String to the trimmed length 1097 if (end != s.length() - 1) { 1098 s = s.substring(0, end + 1); 1099 } 1100 1101 // does the string as it exists now not require folding? We can just had that back right off. 1102 if (s.length() + used <= FOLD_THRESHOLD) { 1103 return s; 1104 } 1105 1106 // get a buffer for the length of the string, plus room for a few line breaks. 1107 // these are soft line breaks, so we generally need more that just the line breaks (an escape + 1108 // CR + LF + leading space on next line); 1109 StringBuffer newString = new StringBuffer(s.length() + 8); 1110 1111 1112 // now keep chopping this down until we've accomplished what we need. 1113 while (used + s.length() > FOLD_THRESHOLD) { 1114 int breakPoint = -1; 1115 char breakChar = 0; 1116 1117 // now scan for the next place where we can break. 1118 for (int i = 0; i < s.length(); i++) { 1119 // have we passed the fold limit? 1120 if (used + i > FOLD_THRESHOLD) { 1121 // if we've already seen a blank, then stop now. Otherwise 1122 // we keep going until we hit a fold point. 1123 if (breakPoint != -1) { 1124 break; 1125 } 1126 } 1127 char ch = s.charAt(i); 1128 1129 // a white space character? 1130 if (ch == ' ' || ch == '\t') { 1131 // this might be a run of white space, so skip over those now. 1132 breakPoint = i; 1133 // we need to maintain the same character type after the inserted linebreak. 1134 breakChar = ch; 1135 i++; 1136 while (i < s.length()) { 1137 ch = s.charAt(i); 1138 if (ch != ' ' && ch != '\t') { 1139 break; 1140 } 1141 i++; 1142 } 1143 } 1144 // found an embedded new line. Escape this so that the unfolding process preserves it. 1145 else if (ch == '\n') { 1146 newString.append('\\'); 1147 newString.append('\n'); 1148 } 1149 else if (ch == '\r') { 1150 newString.append('\\'); 1151 newString.append('\n'); 1152 i++; 1153 // if this is a CRLF pair, add the second char also 1154 if (i < s.length() && s.charAt(i) == '\n') { 1155 newString.append('\r'); 1156 } 1157 } 1158 1159 } 1160 // no fold point found, we punt, append the remainder and leave. 1161 if (breakPoint == -1) { 1162 newString.append(s); 1163 return newString.toString(); 1164 } 1165 newString.append(s.substring(0, breakPoint)); 1166 newString.append("\r\n"); 1167 newString.append(breakChar); 1168 // chop the string 1169 s = s.substring(breakPoint + 1); 1170 // start again, and we've used the first char of the limit already with the whitespace char. 1171 used = 1; 1172 } 1173 1174 // add on the remainder, and return 1175 newString.append(s); 1176 return newString.toString(); 1177 } 1178 1179 /** 1180 * Unfold a folded string. The unfolding process will remove 1181 * any line breaks that are not escaped and which are also followed 1182 * by whitespace characters. 1183 * 1184 * @param s The folded string. 1185 * 1186 * @return A new string with unfolding rules applied. 1187 */ 1188 public static String unfold(String s) { 1189 // if folding is disable, unfolding is also. Return the string unchanged. 1190 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) { 1191 return s; 1192 } 1193 1194 // if there are no line break characters in the string, we can just return this. 1195 if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) { 1196 return s; 1197 } 1198 1199 // we need to scan and fix things up. 1200 int length = s.length(); 1201 1202 StringBuffer newString = new StringBuffer(length); 1203 1204 // scan the entire string 1205 for (int i = 0; i < length; i++) { 1206 char ch = s.charAt(i); 1207 1208 // we have a backslash. In folded strings, escape characters are only processed as such if 1209 // they preceed line breaks. Otherwise, we leave it be. 1210 if (ch == '\\') { 1211 // escape at the very end? Just add the character. 1212 if (i == length - 1) { 1213 newString.append(ch); 1214 } 1215 else { 1216 int nextChar = s.charAt(i + 1); 1217 1218 // naked newline? Add the new line to the buffer, and skip the escape char. 1219 if (nextChar == '\n') { 1220 newString.append('\n'); 1221 i++; 1222 } 1223 else if (nextChar == '\r') { 1224 // just the CR left? Add it, removing the escape. 1225 if (i == length - 2 || s.charAt(i + 2) != '\r') { 1226 newString.append('\r'); 1227 i++; 1228 } 1229 else { 1230 // toss the escape, add both parts of the CRLF, and skip over two chars. 1231 newString.append('\r'); 1232 newString.append('\n'); 1233 i += 2; 1234 } 1235 } 1236 else { 1237 // an escape for another purpose, just copy it over. 1238 newString.append(ch); 1239 } 1240 } 1241 } 1242 // we have an unescaped line break 1243 else if (ch == '\n' || ch == '\r') { 1244 // remember the position in case we need to backtrack. 1245 int lineBreak = i; 1246 boolean CRLF = false; 1247 1248 if (ch == '\r') { 1249 // check to see if we need to step over this. 1250 if (i < length - 1 && s.charAt(i + 1) == '\n') { 1251 i++; 1252 // flag the type so we know what we might need to preserve. 1253 CRLF = true; 1254 } 1255 } 1256 1257 // get a temp position scanner. 1258 int scan = i + 1; 1259 1260 // does a blank follow this new line? we need to scrap the new line and reduce the leading blanks 1261 // down to a single blank. 1262 if (scan < length && s.charAt(scan) == ' ') { 1263 // add the character 1264 newString.append(' '); 1265 1266 // scan over the rest of the blanks 1267 i = scan + 1; 1268 while (i < length && s.charAt(i) == ' ') { 1269 i++; 1270 } 1271 // we'll increment down below, so back up to the last blank as the current char. 1272 i--; 1273 } 1274 else { 1275 // we must keep this line break. Append the appropriate style. 1276 if (CRLF) { 1277 newString.append("\r\n"); 1278 } 1279 else { 1280 newString.append(ch); 1281 } 1282 } 1283 } 1284 else { 1285 // just a normal, ordinary character 1286 newString.append(ch); 1287 } 1288 } 1289 return newString.toString(); 1290 } 1291 } 1292 1293 1294 /** 1295 * Utility class for examining content information written out 1296 * by a DataHandler object. This stream gathers statistics on 1297 * the stream so it can make transfer encoding determinations. 1298 */ 1299 class ContentCheckingOutputStream extends OutputStream { 1300 private int asciiChars = 0; 1301 private int nonAsciiChars = 0; 1302 private boolean containsLongLines = false; 1303 private boolean containsMalformedEOL = false; 1304 private int previousChar = 0; 1305 private int span = 0; 1306 1307 ContentCheckingOutputStream() { 1308 } 1309 1310 public void write(byte[] data) throws IOException { 1311 write(data, 0, data.length); 1312 } 1313 1314 public void write(byte[] data, int offset, int length) throws IOException { 1315 for (int i = 0; i < length; i++) { 1316 write(data[offset + i]); 1317 } 1318 } 1319 1320 public void write(int ch) { 1321 // we found a linebreak. Reset the line length counters on either one. We don't 1322 // really need to validate here. 1323 if (ch == '\n' || ch == '\r') { 1324 // we found a newline, this is only valid if the previous char was the '\r' 1325 if (ch == '\n') { 1326 // malformed linebreak? force this to base64 encoding. 1327 if (previousChar != '\r') { 1328 containsMalformedEOL = true; 1329 } 1330 } 1331 // hit a line end, reset our line length counter 1332 span = 0; 1333 } 1334 else { 1335 span++; 1336 // the text has long lines, we can't transfer this as unencoded text. 1337 if (span > 998) { 1338 containsLongLines = true; 1339 } 1340 1341 // non-ascii character, we have to transfer this in binary. 1342 if (!ASCIIUtil.isAscii(ch)) { 1343 nonAsciiChars++; 1344 } 1345 else { 1346 asciiChars++; 1347 } 1348 } 1349 previousChar = ch; 1350 } 1351 1352 1353 public String getBinaryTransferEncoding() { 1354 if (nonAsciiChars != 0 || containsLongLines || containsMalformedEOL) { 1355 return "base64"; 1356 } 1357 else { 1358 return "7bit"; 1359 } 1360 } 1361 1362 public String getTextTransferEncoding() { 1363 // looking good so far, only valid chars here. 1364 if (nonAsciiChars == 0) { 1365 // does this contain long text lines? We need to use a Q-P encoding which will 1366 // be only slightly longer, but handles folding the longer lines. 1367 if (containsLongLines) { 1368 return "quoted-printable"; 1369 } 1370 else { 1371 // ideal! Easiest one to handle. 1372 return "7bit"; 1373 } 1374 } 1375 else { 1376 // mostly characters requiring encoding? Base64 is our best bet. 1377 if (nonAsciiChars > asciiChars) { 1378 return "base64"; 1379 } 1380 else { 1381 // Q-P encoding will use fewer bytes than the full Base64. 1382 return "quoted-printable"; 1383 } 1384 } 1385 } 1386 }