/**
 *
 * Copyright 2003-2004 The Apache Software Foundation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.geronimo.mail.util;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;


/**
 * Set of utility methods for handling common encoding-related
 * manipulations.
 */
public class ASCIIUtil {
    private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
    private static final int FOLD_THRESHOLD = 76;

    /**
     * Test to see if this string contains only US-ASCII (i.e., 7-bit
     * ASCII) characters, as judged by {@link #isAscii(int)}.
     *
     * @param s      The test string.
     *
     * @return true if every character passes {@link #isAscii(int)} (an
     *         empty string therefore returns true), false if the string
     *         contains any character that does not.
     */
    public static boolean isAscii(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!isAscii(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Test to see if a given character can be considered "valid" ASCII.
     * The excluded characters are the control characters less than
     * 32 and everything from 127 (DEL) upwards, EXCEPT that the CR, LF and
     * tab characters ARE considered valid (all less than 32).
     *
     * @param ch     The test character.
     *
     * @return true if this character meets the "ascii-ness" criteria, false
     *         otherwise.
     */
    public static boolean isAscii(int ch) {
        // these are explicitly considered valid.
        if (ch == '\r' || ch == '\n' || ch == '\t') {
            return true;
        }

        // anything else must be in the printable range 32..126.
        return ch >= 32 && ch < 127;
    }


    /**
     * Examine a stream of text and make a judgement on what transfer
     * encoding should be used for it.  Ideally the text can go out as
     * 7bit, but it may need either quoted-printable or base64.  The choice
     * between those two is made on the ratio of 7-bit characters to
     * non-7bit ones.  Line break characters (CR and LF) reset the line
     * length counter and are not counted on either side of that ratio.
     *
     * The stream is read to EOF through an internal buffer; it is not
     * closed by this method.
     *
     * @param content  An input stream for the content we're examining.
     *
     * @return "7bit" when all characters are valid and no line is longer
     *         than 998 characters, "quoted-printable" when all characters
     *         are valid but a long line exists or when the characters needing
     *         encoding do not outnumber the valid ones, "base64" otherwise.
     *
     * @throws IOException if reading the stream fails.
     */
    public static String getTextTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int span = 0;            // span of characters without a line break.
        boolean containsLongLines = false;
        int asciiChars = 0;
        int nonAsciiChars = 0;

        int ch;
        while ((ch = in.read()) != -1) {
            // we found a linebreak.  Reset the line length counter on either one.  We don't
            // really need to validate the pairing here.
            if (ch == '\n' || ch == '\r') {
                span = 0;
                continue;
            }

            span++;
            // the text has long lines, we can't transfer this as unencoded text.
            if (span > 998) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return chooseTextEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }


    /**
     * Examine a string of text and make a judgement on what transfer
     * encoding should be used for it.  The choice between quoted-printable
     * and base64 is made on the ratio of 7-bit characters to non-7bit ones.
     * Unlike the stream version, this method does not look at line lengths,
     * and CR/LF characters are counted as ordinary valid characters.
     *
     * @param content  A string for the content we're examining.
     *
     * @return "7bit" when every character is valid, "base64" when the
     *         characters needing encoding outnumber the valid ones, and
     *         "quoted-printable" otherwise.
     */
    public static String getTextTransferEncoding(String content) {

        int asciiChars = 0;
        int nonAsciiChars = 0;

        for (int i = 0; i < content.length(); i++) {
            if (isAscii(content.charAt(i))) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return chooseTextEncoding(asciiChars, nonAsciiChars, false);
    }


    /**
     * Shared decision logic for the text transfer encoding methods.
     *
     * @param asciiChars        Count of characters that need no encoding.
     * @param nonAsciiChars     Count of characters that do need encoding.
     * @param containsLongLines true if a line longer than the unencoded
     *                          limit was seen.
     *
     * @return The name of the transfer encoding to use.
     */
    private static String chooseTextEncoding(int asciiChars, int nonAsciiChars, boolean containsLongLines) {
        // only valid chars here.
        if (nonAsciiChars == 0) {
            // long text lines need Q-P, which will be only slightly longer but
            // handles folding the longer lines.  Otherwise 7bit is the ideal.
            return containsLongLines ? "quoted-printable" : "7bit";
        }

        // mostly characters requiring encoding?  Base64 is our best bet.  Otherwise
        // Q-P encoding will use fewer bytes than the full Base64.
        return nonAsciiChars > asciiChars ? "base64" : "quoted-printable";
    }


    /**
     * Determine if the content looks like it might be valid ascii text,
     * and thus transferable as 7bit.  In order for this to be true, all
     * characters must be valid 7-bit ASCII AND all line breaks must be
     * properly formed (JUST '\r\n' sequences).  7-bit transfers also
     * typically have a line limit of 1000 bytes (998 + the CRLF), so any
     * stretch of characters longer than that will also force Base64 encoding.
     * The span counter used for that limit also counts the CR of a line
     * terminator.
     *
     * The stream is read through an internal buffer until a decision can be
     * made; it is not closed by this method.
     *
     * @param content  An input stream for the content we're examining.
     *
     * @return "7bit" if the entire stream passed all of the checks,
     *         "base64" as soon as any check fails.
     *
     * @throws IOException if reading the stream fails.
     */
    public static String getBinaryTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int previousChar = 0;
        int span = 0;            // span of characters without a line break.

        while (true) {
            int ch = in.read();

            // a CR that is not immediately followed by an LF (including a CR at the very
            // end of the data) is a malformed linebreak too.
            if (previousChar == '\r' && ch != '\n') {
                return "base64";
            }

            // if we hit an EOF here, we've only found valid text so far, so we can transfer this as
            // 7-bit ascii.
            if (ch == -1) {
                return "7bit";
            }

            // we found a newline, this is only valid if the previous char was the '\r'
            if (ch == '\n') {
                // malformed linebreak?  force this to base64 encoding.
                if (previousChar != '\r') {
                    return "base64";
                }
                // hit a line end, reset our line length counter
                span = 0;
            }
            else {
                span++;
                // the text has long lines, we can't transfer this as unencoded text.
                if (span > 998) {
                    return "base64";
                }

                // non-ascii character, we have to transfer this in binary.
                if (!isAscii(ch)) {
                    return "base64";
                }
            }
            previousChar = ch;
        }
    }


    /**
     * Perform text folding on a string of text.  Trailing blanks and tabs
     * are removed first.  If the result fits on the current line it is
     * returned as-is (no escaping is applied).  Otherwise a CRLF is inserted
     * in front of a blank or tab chosen as the fold point, and any line
     * break already embedded in the text is escaped with a backslash so
     * that {@link #unfold(String)} preserves it.
     *
     * Folding is skipped entirely, and the string returned unchanged, when
     * the "mail.mime.foldtext" property is false.
     *
     * @param used   The amount of text already "used up" on this line.  This is
     *               typically the length of a message header that this text
     *               is getting added to.
     * @param s      The text to fold.
     *
     * @return The input text, with linebreaks inserted at appropriate fold points.
     */
    public static String fold(int used, String s) {
        // if folding is disabled, unfolding is also.  Return the string unchanged.
        if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
            return s;
        }

        // strip off any trailing blanks and tabs.
        int end = s.length() - 1;
        while (end >= 0 && isFoldWhitespace(s.charAt(end))) {
            end--;
        }

        // did we actually find something to remove?  Shorten the String to the trimmed length
        if (end != s.length() - 1) {
            s = s.substring(0, end + 1);
        }

        // does the string as it exists now not require folding?  We can just hand that back right off.
        if (s.length() + used <= FOLD_THRESHOLD) {
            return s;
        }

        // get a buffer for the length of the string, plus room for a few line breaks
        // (CR + LF per fold, plus an escape for each embedded line break).
        StringBuffer newString = new StringBuffer(s.length() + 8);

        // now keep chopping this down until we've accomplished what we need.
        while (used + s.length() > FOLD_THRESHOLD) {
            int breakPoint = -1;
            char breakChar = 0;

            // now scan for the next place where we can break.
            for (int i = 0; i < s.length(); i++) {
                // once we've passed the fold limit, stop as soon as we have a fold point.
                // Without one, we keep going until we hit the first candidate.
                if (used + i > FOLD_THRESHOLD && breakPoint != -1) {
                    break;
                }

                char ch = s.charAt(i);
                if (isFoldWhitespace(ch)) {
                    // the fold goes in front of the first character of a whitespace run,
                    // and we need to maintain the same character type after the inserted linebreak.
                    breakPoint = i;
                    breakChar = ch;
                    // skip over the rest of the run so it is not taken as a new fold point.
                    while (i + 1 < s.length() && isFoldWhitespace(s.charAt(i + 1))) {
                        i++;
                    }
                }
            }

            // no fold point found, we punt, append the remainder and leave.
            if (breakPoint == -1) {
                appendEscaped(newString, s);
                return newString.toString();
            }

            // fold points are only ever blanks or tabs, so a CRLF pair can never be split
            // across two segments.
            appendEscaped(newString, s.substring(0, breakPoint));
            newString.append("\r\n");
            newString.append(breakChar);
            // chop the string
            s = s.substring(breakPoint + 1);
            // start again, and we've used the first char of the limit already with the whitespace char.
            used = 1;
        }

        // add on the remainder, and return
        appendEscaped(newString, s);
        return newString.toString();
    }

    /**
     * Test for the characters that can serve as a fold point.
     *
     * @param ch     The character to test.
     *
     * @return true for a blank or a tab, false for anything else.
     */
    private static boolean isFoldWhitespace(char ch) {
        return ch == ' ' || ch == '\t';
    }

    /**
     * Copy text to a buffer, placing a backslash in front of every embedded
     * line break (a lone LF, a lone CR, or a CRLF pair, which gets a single
     * escape) so that unfolding keeps the break.
     *
     * @param target The buffer receiving the text.
     * @param text   The text to copy.
     */
    private static void appendEscaped(StringBuffer target, String text) {
        int length = text.length();
        for (int i = 0; i < length; i++) {
            char ch = text.charAt(i);
            if (ch == '\n') {
                target.append('\\');
                target.append('\n');
            }
            else if (ch == '\r') {
                target.append('\\');
                target.append('\r');
                // if this is a CRLF pair, add the second char also
                if (i + 1 < length && text.charAt(i + 1) == '\n') {
                    target.append('\n');
                    i++;
                }
            }
            else {
                target.append(ch);
            }
        }
    }

    /**
     * Unfold a folded string.  The unfolding process removes any line
     * break that is not escaped and which is also followed by a blank,
     * reducing the blanks that follow it to a single blank.  A line break
     * escaped with a backslash is kept, with the backslash removed.  Only
     * the blank character (not a tab) is recognized after a line break.
     *
     * Unfolding is skipped entirely, and the string returned unchanged, when
     * the "mail.mime.foldtext" property is false.
     *
     * @param s      The folded string.
     *
     * @return A new string with unfolding rules applied.
     */
    public static String unfold(String s) {
        // if folding is disabled, unfolding is also.  Return the string unchanged.
        if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
            return s;
        }

        // if there are no line break characters in the string, we can just return this.
        if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
            return s;
        }

        // we need to scan and fix things up.
        int length = s.length();

        StringBuffer newString = new StringBuffer(length);

        // scan the entire string.  The characters are held as char (not int) so that
        // append() adds the character itself rather than its numeric value.
        for (int i = 0; i < length; i++) {
            char ch = s.charAt(i);

            // we have a backslash.  In folded strings, escape characters are only processed as such if
            // they precede line breaks.  Otherwise, we leave it be.
            if (ch == '\\') {
                // escape at the very end?  Just add the character.
                if (i == length - 1) {
                    newString.append(ch);
                }
                else {
                    char nextChar = s.charAt(i + 1);

                    // naked newline?  Add the new line to the buffer, and skip the escape char.
                    if (nextChar == '\n') {
                        newString.append('\n');
                        i++;
                    }
                    else if (nextChar == '\r') {
                        // just the CR (no LF after it)?  Add it, removing the escape.
                        if (i == length - 2 || s.charAt(i + 2) != '\n') {
                            newString.append('\r');
                            i++;
                        }
                        else {
                            // toss the escape, add both parts of the CRLF, and skip over two chars.
                            newString.append('\r');
                            newString.append('\n');
                            i += 2;
                        }
                    }
                    else {
                        // an escape for another purpose, just copy it over.
                        newString.append(ch);
                    }
                }
            }
            // we have an unescaped line break
            else if (ch == '\n' || ch == '\r') {
                boolean crlf = false;

                // step over the LF of a CRLF pair, and flag the type so we know
                // what we might need to preserve.
                if (ch == '\r' && i < length - 1 && s.charAt(i + 1) == '\n') {
                    i++;
                    crlf = true;
                }

                // get a temp position scanner.
                int scan = i + 1;

                // does a blank follow this new line?  we need to scrap the new line and reduce the leading blanks
                // down to a single blank.
                if (scan < length && s.charAt(scan) == ' ') {
                    // add the character
                    newString.append(' ');

                    // scan over the rest of the blanks
                    i = scan + 1;
                    while (i < length && s.charAt(i) == ' ') {
                        i++;
                    }
                    // the loop increments for us, so back up to the last blank as the current char.
                    i--;
                }
                else if (crlf) {
                    // we must keep this line break.  Append the appropriate style.
                    newString.append("\r\n");
                }
                else {
                    newString.append(ch);
                }
            }
            else {
                // just a normal, ordinary character
                newString.append(ch);
            }
        }
        return newString.toString();
    }
}