ASCIIUtil xref

View Javadoc

1   /**
2    *
3    * Copyright 2003-2004 The Apache Software Foundation
4    *
5    *  Licensed under the Apache License, Version 2.0 (the "License");
6    *  you may not use this file except in compliance with the License.
7    *  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  
18  package org.apache.geronimo.mail.util;
19  
20  import java.io.BufferedInputStream;
21  import java.io.InputStream;
22  import java.io.IOException;
23  
24  
25  /**
26   * Set of utility classes for handling common encoding-related
27   * manipulations.
28   */
29  public class ASCIIUtil {
30      private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
31      private static final int FOLD_THRESHOLD = 76;
32  
33      /**
34       * Test to see if this string contains only US-ASCII (i.e., 7-bit
35       * ASCII) charactes.
36       *
37       * @param s      The test string.
38       *
39       * @return true if this is a valid 7-bit ASCII encoding, false if it
40       *         contains any non-US ASCII characters.
41       */
42      static public boolean isAscii(String s) {
43          for (int i = 0; i < s.length(); i++) {
44              if (!isAscii(s.charAt(i))) {
45                  return false;
46              }
47          }
48          return true;
49      }
50  
51      /**
52       * Test to see if a given character can be considered "valid" ASCII.
53       * The excluded characters are the control characters less than
54       * 32, 8-bit characters greater than 127, EXCEPT the CR, LF and
55       * tab characters ARE considered value (all less than 32).
56       *
57       * @param ch     The test character.
58       *
59       * @return true if this character meets the "ascii-ness" criteria, false
60       *         otherwise.
61       */
62      static public boolean isAscii(int ch) {
63          // these are explicitly considered valid.
64          if (ch == '\r' || ch == '\n' || ch == '\t') {
65              return true;
66          }
67  
68          // anything else outside the range is just plain wrong.
69          if (ch >= 127 || ch < 32) {
70              return false;
71          }
72          return true;
73      }
74  
75  
76      /**
77       * Examine a stream of text and make a judgement on what encoding
78       * type should be used for the text.  Ideally, we want to use 7bit
79       * encoding to determine this, but we may need to use either quoted-printable
80       * or base64.  The choice is made on the ratio of 7-bit characters to non-7bit.
81       *
82       * @param content     An input stream for the content we're examining.
83       *
84       * @exception IOException
85       */
86      public static String getTextTransferEncoding(InputStream content) throws IOException {
87  
88          // for efficiency, we'll read in blocks.
89          BufferedInputStream in = new BufferedInputStream(content, 4096);
90  
91          int span = 0;            // span of characters without a line break.
92          boolean containsLongLines = false;
93          int asciiChars = 0;
94          int nonAsciiChars = 0;
95  
96          while (true) {
97              int ch = in.read();
98              // if we hit an EOF here, go decide what type we've actually found.
99              if (ch == -1) {
100                 break;
101             }
102 
103             // we found a linebreak.  Reset the line length counters on either one.  We don't
104             // really need to validate here.
105             if (ch == '\n' || ch == '\r') {
106                 // hit a line end, reset our line length counter
107                 span = 0;
108             }
109             else {
110                 span++;
111                 // the text has long lines, we can't transfer this as unencoded text.
112                 if (span > 998) {
113                     containsLongLines = true;
114                 }
115 
116                 // non-ascii character, we have to transfer this in binary.
117                 if (!isAscii(ch)) {
118                     nonAsciiChars++;
119                 }
120                 else {
121                     asciiChars++;
122                 }
123             }
124         }
125 
126         // looking good so far, only valid chars here.
127         if (nonAsciiChars == 0) {
128             // does this contain long text lines?  We need to use a Q-P encoding which will
129             // be only slightly longer, but handles folding the longer lines.
130             if (containsLongLines) {
131                 return "quoted-printable";
132             }
133             else {
134                 // ideal!  Easiest one to handle.
135                 return "7bit";
136             }
137         }
138         else {
139             // mostly characters requiring encoding?  Base64 is our best bet.
140             if (nonAsciiChars > asciiChars) {
141                 return "base64";
142             }
143             else {
144                 // Q-P encoding will use fewer bytes than the full Base64.
145                 return "quoted-printable";
146             }
147         }
148     }
149 
150 
151     /**
152      * Examine a stream of text and make a judgement on what encoding
153      * type should be used for the text.  Ideally, we want to use 7bit
154      * encoding to determine this, but we may need to use either quoted-printable
155      * or base64.  The choice is made on the ratio of 7-bit characters to non-7bit.
156      *
157      * @param content     A string for the content we're examining.
158      */
159     public static String getTextTransferEncoding(String content) {
160 
161         int asciiChars = 0;
162         int nonAsciiChars = 0;
163 
164         for (int i = 0; i < content.length(); i++) {
165             int ch = content.charAt(i);
166 
167             // non-ascii character, we have to transfer this in binary.
168             if (!isAscii(ch)) {
169                 nonAsciiChars++;
170             }
171             else {
172                 asciiChars++;
173             }
174         }
175 
176         // looking good so far, only valid chars here.
177         if (nonAsciiChars == 0) {
178             // ideal!  Easiest one to handle.
179             return "7bit";
180         }
181         else {
182             // mostly characters requiring encoding?  Base64 is our best bet.
183             if (nonAsciiChars > asciiChars) {
184                 return "base64";
185             }
186             else {
187                 // Q-P encoding will use fewer bytes than the full Base64.
188                 return "quoted-printable";
189             }
190         }
191     }
192 
193 
194     /**
195      * Determine if the transfer encoding looks like it might be
196      * valid ascii text, and thus transferable as 7bit code.  In
197      * order for this to be true, all characters must be valid
198      * 7-bit ASCII code AND all line breaks must be properly formed
199      * (JUST '\r\n' sequences).  7-bit transfers also
200      * typically have a line limit of 1000 bytes (998 + the CRLF), so any
201      * stretch of charactes longer than that will also force Base64 encoding.
202      *
203      * @param content     An input stream for the content we're examining.
204      *
205      * @exception IOException
206      */
207     public static String getBinaryTransferEncoding(InputStream content) throws IOException {
208 
209         // for efficiency, we'll read in blocks.
210         BufferedInputStream in = new BufferedInputStream(content, 4096);
211 
212         int previousChar = 0;
213         int span = 0;            // span of characters without a line break.
214 
215         while (true) {
216             int ch = in.read();
217             // if we hit an EOF here, we've only found valid text so far, so we can transfer this as
218             // 7-bit ascii.
219             if (ch == -1) {
220                 return "7bit";
221             }
222 
223             // we found a newline, this is only valid if the previous char was the '\r'
224             if (ch == '\n') {
225                 // malformed linebreak?  force this to base64 encoding.
226                 if (previousChar != '\r') {
227                     return "base64";
228                 }
229                 // hit a line end, reset our line length counter
230                 span = 0;
231             }
232             else {
233                 span++;
234                 // the text has long lines, we can't transfer this as unencoded text.
235                 if (span > 998) {
236                     return "base64";
237                 }
238 
239                 // non-ascii character, we have to transfer this in binary.
240                 if (!isAscii(ch)) {
241                     return "base64";
242                 }
243             }
244             previousChar = ch;
245         }
246     }
247 
248 
249     /**
250      * Perform RFC 2047 text folding on a string of text.
251      *
252      * @param used   The amount of text already "used up" on this line.  This is
253      *               typically the length of a message header that this text
254      *               get getting added to.
255      * @param s      The text to fold.
256      *
257      * @return The input text, with linebreaks inserted at appropriate fold points.
258      */
259     public static String fold(int used, String s) {
260         // if folding is disable, unfolding is also.  Return the string unchanged.
261         if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
262             return s;
263         }
264 
265         int end;
266 
267         // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs,
268         // and line break characters.
269         for (end = s.length() - 1; end >= 0; end--) {
270             int ch = s.charAt(end);
271             if (ch != ' ' && ch != '\t' ) {
272                 break;
273             }
274         }
275 
276         // did we actually find something to remove?  Shorten the String to the trimmed length
277         if (end != s.length() - 1) {
278             s = s.substring(0, end + 1);
279         }
280 
281         // does the string as it exists now not require folding?  We can just had that back right off.
282         if (s.length() + used <= FOLD_THRESHOLD) {
283             return s;
284         }
285 
286         // get a buffer for the length of the string, plus room for a few line breaks.
287         // these are soft line breaks, so we generally need more that just the line breaks (an escape +
288         // CR + LF + leading space on next line);
289         StringBuffer newString = new StringBuffer(s.length() + 8);
290 
291 
292         // now keep chopping this down until we've accomplished what we need.
293         while (used + s.length() > FOLD_THRESHOLD) {
294             int breakPoint = -1;
295             char breakChar = 0;
296 
297             // now scan for the next place where we can break.
298             for (int i = 0; i < s.length(); i++) {
299                 // have we passed the fold limit?
300                 if (used + i > FOLD_THRESHOLD) {
301                     // if we've already seen a blank, then stop now.  Otherwise
302                     // we keep going until we hit a fold point.
303                     if (breakPoint != -1) {
304                         break;
305                     }
306                 }
307                 char ch = s.charAt(i);
308 
309                 // a white space character?
310                 if (ch == ' ' || ch == '\t') {
311                     // this might be a run of white space, so skip over those now.
312                     breakPoint = i;
313                     // we need to maintain the same character type after the inserted linebreak.
314                     breakChar = ch;
315                     i++;
316                     while (i < s.length()) {
317                         ch = s.charAt(i);
318                         if (ch != ' ' && ch != '\t') {
319                             break;
320                         }
321                         i++;
322                     }
323                 }
324                 // found an embedded new line.  Escape this so that the unfolding process preserves it.
325                 else if (ch == '\n') {
326                     newString.append('\\');
327                     newString.append('\n');
328                 }
329                 else if (ch == '\r') {
330                     newString.append('\\');
331                     newString.append('\n');
332                     i++;
333                     // if this is a CRLF pair, add the second char also
334                     if (i < s.length() && s.charAt(i) == '\n') {
335                         newString.append('\r');
336                     }
337                 }
338 
339             }
340             // no fold point found, we punt, append the remainder and leave.
341             if (breakPoint == -1) {
342                 newString.append(s);
343                 return newString.toString();
344             }
345             newString.append(s.substring(0, breakPoint));
346             newString.append("\r\n");
347             newString.append(breakChar);
348             // chop the string
349             s = s.substring(breakPoint + 1);
350             // start again, and we've used the first char of the limit already with the whitespace char.
351             used = 1;
352         }
353 
354         // add on the remainder, and return
355         newString.append(s);
356         return newString.toString();
357     }
358 
359     /**
360      * Unfold a folded string.  The unfolding process will remove
361      * any line breaks that are not escaped and which are also followed
362      * by whitespace characters.
363      *
364      * @param s      The folded string.
365      *
366      * @return A new string with unfolding rules applied.
367      */
368     public static String unfold(String s) {
369         // if folding is disable, unfolding is also.  Return the string unchanged.
370         if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
371             return s;
372         }
373 
374         // if there are no line break characters in the string, we can just return this.
375         if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
376             return s;
377         }
378 
379         // we need to scan and fix things up.
380         int length = s.length();
381 
382         StringBuffer newString = new StringBuffer(length);
383 
384         // scan the entire string
385         for (int i = 0; i < length; i++) {
386             int ch = s.charAt(i);
387 
388             // we have a backslash.  In folded strings, escape characters are only processed as such if
389             // they preceed line breaks.  Otherwise, we leave it be.
390             if (ch == '\\') {
391                 // escape at the very end?  Just add the character.
392                 if (i == length - 1) {
393                     newString.append(ch);
394                 }
395                 else {
396                     int nextChar = s.charAt(i + 1);
397 
398                     // naked newline?  Add the new line to the buffer, and skip the escape char.
399                     if (nextChar == '\n') {
400                         newString.append('\n');
401                         i++;
402                     }
403                     else if (nextChar == '\r') {
404                         // just the CR left?  Add it, removing the escape.
405                         if (i == length - 2 || s.charAt(i + 2) != '\r') {
406                             newString.append('\r');
407                             i++;
408                         }
409                         else {
410                             // toss the escape, add both parts of the CRLF, and skip over two chars.
411                             newString.append('\r');
412                             newString.append('\n');
413                             i += 2;
414                         }
415                     }
416                     else {
417                         // an escape for another purpose, just copy it over.
418                         newString.append(ch);
419                     }
420                 }
421             }
422             // we have an unescaped line break
423             else if (ch == '\n' || ch == '\r') {
424                 // remember the position in case we need to backtrack.
425                 int lineBreak = i;
426                 boolean CRLF = false;
427 
428                 if (ch == '\r') {
429                     // check to see if we need to step over this.
430                     if (i < length - 1 && s.charAt(i + 1) == '\n') {
431                         i++;
432                         // flag the type so we know what we might need to preserve.
433                         CRLF = true;
434                     }
435                 }
436 
437                 // get a temp position scanner.
438                 int scan = i + 1;
439 
440                 // does a blank follow this new line?  we need to scrap the new line and reduce the leading blanks
441                 // down to a single blank.
442                 if (scan < length && s.charAt(scan) == ' ') {
443                     // add the character
444                     newString.append(' ');
445 
446                     // scan over the rest of the blanks
447                     i = scan + 1;
448                     while (i < length && s.charAt(i) == ' ') {
449                         i++;
450                     }
451                     // we'll increment down below, so back up to the last blank as the current char.
452                     i--;
453                 }
454                 else {
455                     // we must keep this line break.  Append the appropriate style.
456                     if (CRLF) {
457                         newString.append("\r\n");
458                     }
459                     else {
460                         newString.append(ch);
461                     }
462                 }
463             }
464             else {
465                 // just a normal, ordinary character
466                 newString.append(ch);
467             }
468         }
469         return newString.toString();
470     }
471 }