001 /**
002 *
003 * Copyright 2003-2004 The Apache Software Foundation
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.geronimo.mail.util;
019
020 import java.io.BufferedInputStream;
021 import java.io.InputStream;
022 import java.io.IOException;
023
024
025 /**
026 * Set of utility classes for handling common encoding-related
027 * manipulations.
028 */
029 public class ASCIIUtil {
030 private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
031 private static final int FOLD_THRESHOLD = 76;
032
033 /**
034 * Test to see if this string contains only US-ASCII (i.e., 7-bit
035 * ASCII) charactes.
036 *
037 * @param s The test string.
038 *
039 * @return true if this is a valid 7-bit ASCII encoding, false if it
040 * contains any non-US ASCII characters.
041 */
042 static public boolean isAscii(String s) {
043 for (int i = 0; i < s.length(); i++) {
044 if (!isAscii(s.charAt(i))) {
045 return false;
046 }
047 }
048 return true;
049 }
050
051 /**
052 * Test to see if a given character can be considered "valid" ASCII.
053 * The excluded characters are the control characters less than
054 * 32, 8-bit characters greater than 127, EXCEPT the CR, LF and
055 * tab characters ARE considered value (all less than 32).
056 *
057 * @param ch The test character.
058 *
059 * @return true if this character meets the "ascii-ness" criteria, false
060 * otherwise.
061 */
062 static public boolean isAscii(int ch) {
063 // these are explicitly considered valid.
064 if (ch == '\r' || ch == '\n' || ch == '\t') {
065 return true;
066 }
067
068 // anything else outside the range is just plain wrong.
069 if (ch >= 127 || ch < 32) {
070 return false;
071 }
072 return true;
073 }
074
075
076 /**
077 * Examine a stream of text and make a judgement on what encoding
078 * type should be used for the text. Ideally, we want to use 7bit
079 * encoding to determine this, but we may need to use either quoted-printable
080 * or base64. The choice is made on the ratio of 7-bit characters to non-7bit.
081 *
082 * @param content An input stream for the content we're examining.
083 *
084 * @exception IOException
085 */
086 public static String getTextTransferEncoding(InputStream content) throws IOException {
087
088 // for efficiency, we'll read in blocks.
089 BufferedInputStream in = new BufferedInputStream(content, 4096);
090
091 int span = 0; // span of characters without a line break.
092 boolean containsLongLines = false;
093 int asciiChars = 0;
094 int nonAsciiChars = 0;
095
096 while (true) {
097 int ch = in.read();
098 // if we hit an EOF here, go decide what type we've actually found.
099 if (ch == -1) {
100 break;
101 }
102
103 // we found a linebreak. Reset the line length counters on either one. We don't
104 // really need to validate here.
105 if (ch == '\n' || ch == '\r') {
106 // hit a line end, reset our line length counter
107 span = 0;
108 }
109 else {
110 span++;
111 // the text has long lines, we can't transfer this as unencoded text.
112 if (span > 998) {
113 containsLongLines = true;
114 }
115
116 // non-ascii character, we have to transfer this in binary.
117 if (!isAscii(ch)) {
118 nonAsciiChars++;
119 }
120 else {
121 asciiChars++;
122 }
123 }
124 }
125
126 // looking good so far, only valid chars here.
127 if (nonAsciiChars == 0) {
128 // does this contain long text lines? We need to use a Q-P encoding which will
129 // be only slightly longer, but handles folding the longer lines.
130 if (containsLongLines) {
131 return "quoted-printable";
132 }
133 else {
134 // ideal! Easiest one to handle.
135 return "7bit";
136 }
137 }
138 else {
139 // mostly characters requiring encoding? Base64 is our best bet.
140 if (nonAsciiChars > asciiChars) {
141 return "base64";
142 }
143 else {
144 // Q-P encoding will use fewer bytes than the full Base64.
145 return "quoted-printable";
146 }
147 }
148 }
149
150
151 /**
152 * Examine a stream of text and make a judgement on what encoding
153 * type should be used for the text. Ideally, we want to use 7bit
154 * encoding to determine this, but we may need to use either quoted-printable
155 * or base64. The choice is made on the ratio of 7-bit characters to non-7bit.
156 *
157 * @param content A string for the content we're examining.
158 */
159 public static String getTextTransferEncoding(String content) {
160
161 int asciiChars = 0;
162 int nonAsciiChars = 0;
163
164 for (int i = 0; i < content.length(); i++) {
165 int ch = content.charAt(i);
166
167 // non-ascii character, we have to transfer this in binary.
168 if (!isAscii(ch)) {
169 nonAsciiChars++;
170 }
171 else {
172 asciiChars++;
173 }
174 }
175
176 // looking good so far, only valid chars here.
177 if (nonAsciiChars == 0) {
178 // ideal! Easiest one to handle.
179 return "7bit";
180 }
181 else {
182 // mostly characters requiring encoding? Base64 is our best bet.
183 if (nonAsciiChars > asciiChars) {
184 return "base64";
185 }
186 else {
187 // Q-P encoding will use fewer bytes than the full Base64.
188 return "quoted-printable";
189 }
190 }
191 }
192
193
194 /**
195 * Determine if the transfer encoding looks like it might be
196 * valid ascii text, and thus transferable as 7bit code. In
197 * order for this to be true, all characters must be valid
198 * 7-bit ASCII code AND all line breaks must be properly formed
199 * (JUST '\r\n' sequences). 7-bit transfers also
200 * typically have a line limit of 1000 bytes (998 + the CRLF), so any
201 * stretch of charactes longer than that will also force Base64 encoding.
202 *
203 * @param content An input stream for the content we're examining.
204 *
205 * @exception IOException
206 */
207 public static String getBinaryTransferEncoding(InputStream content) throws IOException {
208
209 // for efficiency, we'll read in blocks.
210 BufferedInputStream in = new BufferedInputStream(content, 4096);
211
212 int previousChar = 0;
213 int span = 0; // span of characters without a line break.
214
215 while (true) {
216 int ch = in.read();
217 // if we hit an EOF here, we've only found valid text so far, so we can transfer this as
218 // 7-bit ascii.
219 if (ch == -1) {
220 return "7bit";
221 }
222
223 // we found a newline, this is only valid if the previous char was the '\r'
224 if (ch == '\n') {
225 // malformed linebreak? force this to base64 encoding.
226 if (previousChar != '\r') {
227 return "base64";
228 }
229 // hit a line end, reset our line length counter
230 span = 0;
231 }
232 else {
233 span++;
234 // the text has long lines, we can't transfer this as unencoded text.
235 if (span > 998) {
236 return "base64";
237 }
238
239 // non-ascii character, we have to transfer this in binary.
240 if (!isAscii(ch)) {
241 return "base64";
242 }
243 }
244 previousChar = ch;
245 }
246 }
247
248
249 /**
250 * Perform RFC 2047 text folding on a string of text.
251 *
252 * @param used The amount of text already "used up" on this line. This is
253 * typically the length of a message header that this text
254 * get getting added to.
255 * @param s The text to fold.
256 *
257 * @return The input text, with linebreaks inserted at appropriate fold points.
258 */
259 public static String fold(int used, String s) {
260 // if folding is disable, unfolding is also. Return the string unchanged.
261 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
262 return s;
263 }
264
265 int end;
266
267 // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs,
268 // and line break characters.
269 for (end = s.length() - 1; end >= 0; end--) {
270 int ch = s.charAt(end);
271 if (ch != ' ' && ch != '\t' ) {
272 break;
273 }
274 }
275
276 // did we actually find something to remove? Shorten the String to the trimmed length
277 if (end != s.length() - 1) {
278 s = s.substring(0, end + 1);
279 }
280
281 // does the string as it exists now not require folding? We can just had that back right off.
282 if (s.length() + used <= FOLD_THRESHOLD) {
283 return s;
284 }
285
286 // get a buffer for the length of the string, plus room for a few line breaks.
287 // these are soft line breaks, so we generally need more that just the line breaks (an escape +
288 // CR + LF + leading space on next line);
289 StringBuffer newString = new StringBuffer(s.length() + 8);
290
291
292 // now keep chopping this down until we've accomplished what we need.
293 while (used + s.length() > FOLD_THRESHOLD) {
294 int breakPoint = -1;
295 char breakChar = 0;
296
297 // now scan for the next place where we can break.
298 for (int i = 0; i < s.length(); i++) {
299 // have we passed the fold limit?
300 if (used + i > FOLD_THRESHOLD) {
301 // if we've already seen a blank, then stop now. Otherwise
302 // we keep going until we hit a fold point.
303 if (breakPoint != -1) {
304 break;
305 }
306 }
307 char ch = s.charAt(i);
308
309 // a white space character?
310 if (ch == ' ' || ch == '\t') {
311 // this might be a run of white space, so skip over those now.
312 breakPoint = i;
313 // we need to maintain the same character type after the inserted linebreak.
314 breakChar = ch;
315 i++;
316 while (i < s.length()) {
317 ch = s.charAt(i);
318 if (ch != ' ' && ch != '\t') {
319 break;
320 }
321 i++;
322 }
323 }
324 // found an embedded new line. Escape this so that the unfolding process preserves it.
325 else if (ch == '\n') {
326 newString.append('\\');
327 newString.append('\n');
328 }
329 else if (ch == '\r') {
330 newString.append('\\');
331 newString.append('\n');
332 i++;
333 // if this is a CRLF pair, add the second char also
334 if (i < s.length() && s.charAt(i) == '\n') {
335 newString.append('\r');
336 }
337 }
338
339 }
340 // no fold point found, we punt, append the remainder and leave.
341 if (breakPoint == -1) {
342 newString.append(s);
343 return newString.toString();
344 }
345 newString.append(s.substring(0, breakPoint));
346 newString.append("\r\n");
347 newString.append(breakChar);
348 // chop the string
349 s = s.substring(breakPoint + 1);
350 // start again, and we've used the first char of the limit already with the whitespace char.
351 used = 1;
352 }
353
354 // add on the remainder, and return
355 newString.append(s);
356 return newString.toString();
357 }
358
359 /**
360 * Unfold a folded string. The unfolding process will remove
361 * any line breaks that are not escaped and which are also followed
362 * by whitespace characters.
363 *
364 * @param s The folded string.
365 *
366 * @return A new string with unfolding rules applied.
367 */
368 public static String unfold(String s) {
369 // if folding is disable, unfolding is also. Return the string unchanged.
370 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
371 return s;
372 }
373
374 // if there are no line break characters in the string, we can just return this.
375 if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
376 return s;
377 }
378
379 // we need to scan and fix things up.
380 int length = s.length();
381
382 StringBuffer newString = new StringBuffer(length);
383
384 // scan the entire string
385 for (int i = 0; i < length; i++) {
386 int ch = s.charAt(i);
387
388 // we have a backslash. In folded strings, escape characters are only processed as such if
389 // they preceed line breaks. Otherwise, we leave it be.
390 if (ch == '\\') {
391 // escape at the very end? Just add the character.
392 if (i == length - 1) {
393 newString.append(ch);
394 }
395 else {
396 int nextChar = s.charAt(i + 1);
397
398 // naked newline? Add the new line to the buffer, and skip the escape char.
399 if (nextChar == '\n') {
400 newString.append('\n');
401 i++;
402 }
403 else if (nextChar == '\r') {
404 // just the CR left? Add it, removing the escape.
405 if (i == length - 2 || s.charAt(i + 2) != '\r') {
406 newString.append('\r');
407 i++;
408 }
409 else {
410 // toss the escape, add both parts of the CRLF, and skip over two chars.
411 newString.append('\r');
412 newString.append('\n');
413 i += 2;
414 }
415 }
416 else {
417 // an escape for another purpose, just copy it over.
418 newString.append(ch);
419 }
420 }
421 }
422 // we have an unescaped line break
423 else if (ch == '\n' || ch == '\r') {
424 // remember the position in case we need to backtrack.
425 int lineBreak = i;
426 boolean CRLF = false;
427
428 if (ch == '\r') {
429 // check to see if we need to step over this.
430 if (i < length - 1 && s.charAt(i + 1) == '\n') {
431 i++;
432 // flag the type so we know what we might need to preserve.
433 CRLF = true;
434 }
435 }
436
437 // get a temp position scanner.
438 int scan = i + 1;
439
440 // does a blank follow this new line? we need to scrap the new line and reduce the leading blanks
441 // down to a single blank.
442 if (scan < length && s.charAt(scan) == ' ') {
443 // add the character
444 newString.append(' ');
445
446 // scan over the rest of the blanks
447 i = scan + 1;
448 while (i < length && s.charAt(i) == ' ') {
449 i++;
450 }
451 // we'll increment down below, so back up to the last blank as the current char.
452 i--;
453 }
454 else {
455 // we must keep this line break. Append the appropriate style.
456 if (CRLF) {
457 newString.append("\r\n");
458 }
459 else {
460 newString.append(ch);
461 }
462 }
463 }
464 else {
465 // just a normal, ordinary character
466 newString.append(ch);
467 }
468 }
469 return newString.toString();
470 }
471 }