001    /**
002     *
003     * Copyright 2003-2004 The Apache Software Foundation
004     *
005     *  Licensed under the Apache License, Version 2.0 (the "License");
006     *  you may not use this file except in compliance with the License.
007     *  You may obtain a copy of the License at
008     *
009     *     http://www.apache.org/licenses/LICENSE-2.0
010     *
011     *  Unless required by applicable law or agreed to in writing, software
012     *  distributed under the License is distributed on an "AS IS" BASIS,
013     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     *  See the License for the specific language governing permissions and
015     *  limitations under the License.
016     */
017    
018    package org.apache.geronimo.mail.util;
019    
020    import java.io.BufferedInputStream;
021    import java.io.InputStream;
022    import java.io.IOException;
023    
024    
025    /**
026     * Set of utility classes for handling common encoding-related
027     * manipulations.
028     */
029    public class ASCIIUtil {
030        private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
031        private static final int FOLD_THRESHOLD = 76;
032    
033        /**
034         * Test to see if this string contains only US-ASCII (i.e., 7-bit
035         * ASCII) charactes.
036         *
037         * @param s      The test string.
038         *
039         * @return true if this is a valid 7-bit ASCII encoding, false if it
040         *         contains any non-US ASCII characters.
041         */
042        static public boolean isAscii(String s) {
043            for (int i = 0; i < s.length(); i++) {
044                if (!isAscii(s.charAt(i))) {
045                    return false;
046                }
047            }
048            return true;
049        }
050    
051        /**
052         * Test to see if a given character can be considered "valid" ASCII.
053         * The excluded characters are the control characters less than
054         * 32, 8-bit characters greater than 127, EXCEPT the CR, LF and
055         * tab characters ARE considered value (all less than 32).
056         *
057         * @param ch     The test character.
058         *
059         * @return true if this character meets the "ascii-ness" criteria, false
060         *         otherwise.
061         */
062        static public boolean isAscii(int ch) {
063            // these are explicitly considered valid.
064            if (ch == '\r' || ch == '\n' || ch == '\t') {
065                return true;
066            }
067    
068            // anything else outside the range is just plain wrong.
069            if (ch >= 127 || ch < 32) {
070                return false;
071            }
072            return true;
073        }
074    
075    
076        /**
077         * Examine a stream of text and make a judgement on what encoding
078         * type should be used for the text.  Ideally, we want to use 7bit
079         * encoding to determine this, but we may need to use either quoted-printable
080         * or base64.  The choice is made on the ratio of 7-bit characters to non-7bit.
081         *
082         * @param content     An input stream for the content we're examining.
083         *
084         * @exception IOException
085         */
086        public static String getTextTransferEncoding(InputStream content) throws IOException {
087    
088            // for efficiency, we'll read in blocks.
089            BufferedInputStream in = new BufferedInputStream(content, 4096);
090    
091            int span = 0;            // span of characters without a line break.
092            boolean containsLongLines = false;
093            int asciiChars = 0;
094            int nonAsciiChars = 0;
095    
096            while (true) {
097                int ch = in.read();
098                // if we hit an EOF here, go decide what type we've actually found.
099                if (ch == -1) {
100                    break;
101                }
102    
103                // we found a linebreak.  Reset the line length counters on either one.  We don't
104                // really need to validate here.
105                if (ch == '\n' || ch == '\r') {
106                    // hit a line end, reset our line length counter
107                    span = 0;
108                }
109                else {
110                    span++;
111                    // the text has long lines, we can't transfer this as unencoded text.
112                    if (span > 998) {
113                        containsLongLines = true;
114                    }
115    
116                    // non-ascii character, we have to transfer this in binary.
117                    if (!isAscii(ch)) {
118                        nonAsciiChars++;
119                    }
120                    else {
121                        asciiChars++;
122                    }
123                }
124            }
125    
126            // looking good so far, only valid chars here.
127            if (nonAsciiChars == 0) {
128                // does this contain long text lines?  We need to use a Q-P encoding which will
129                // be only slightly longer, but handles folding the longer lines.
130                if (containsLongLines) {
131                    return "quoted-printable";
132                }
133                else {
134                    // ideal!  Easiest one to handle.
135                    return "7bit";
136                }
137            }
138            else {
139                // mostly characters requiring encoding?  Base64 is our best bet.
140                if (nonAsciiChars > asciiChars) {
141                    return "base64";
142                }
143                else {
144                    // Q-P encoding will use fewer bytes than the full Base64.
145                    return "quoted-printable";
146                }
147            }
148        }
149    
150    
151        /**
152         * Examine a stream of text and make a judgement on what encoding
153         * type should be used for the text.  Ideally, we want to use 7bit
154         * encoding to determine this, but we may need to use either quoted-printable
155         * or base64.  The choice is made on the ratio of 7-bit characters to non-7bit.
156         *
157         * @param content     A string for the content we're examining.
158         */
159        public static String getTextTransferEncoding(String content) {
160    
161            int asciiChars = 0;
162            int nonAsciiChars = 0;
163    
164            for (int i = 0; i < content.length(); i++) {
165                int ch = content.charAt(i);
166    
167                // non-ascii character, we have to transfer this in binary.
168                if (!isAscii(ch)) {
169                    nonAsciiChars++;
170                }
171                else {
172                    asciiChars++;
173                }
174            }
175    
176            // looking good so far, only valid chars here.
177            if (nonAsciiChars == 0) {
178                // ideal!  Easiest one to handle.
179                return "7bit";
180            }
181            else {
182                // mostly characters requiring encoding?  Base64 is our best bet.
183                if (nonAsciiChars > asciiChars) {
184                    return "base64";
185                }
186                else {
187                    // Q-P encoding will use fewer bytes than the full Base64.
188                    return "quoted-printable";
189                }
190            }
191        }
192    
193    
194        /**
195         * Determine if the transfer encoding looks like it might be
196         * valid ascii text, and thus transferable as 7bit code.  In
197         * order for this to be true, all characters must be valid
198         * 7-bit ASCII code AND all line breaks must be properly formed
199         * (JUST '\r\n' sequences).  7-bit transfers also
200         * typically have a line limit of 1000 bytes (998 + the CRLF), so any
201         * stretch of charactes longer than that will also force Base64 encoding.
202         *
203         * @param content     An input stream for the content we're examining.
204         *
205         * @exception IOException
206         */
207        public static String getBinaryTransferEncoding(InputStream content) throws IOException {
208    
209            // for efficiency, we'll read in blocks.
210            BufferedInputStream in = new BufferedInputStream(content, 4096);
211    
212            int previousChar = 0;
213            int span = 0;            // span of characters without a line break.
214    
215            while (true) {
216                int ch = in.read();
217                // if we hit an EOF here, we've only found valid text so far, so we can transfer this as
218                // 7-bit ascii.
219                if (ch == -1) {
220                    return "7bit";
221                }
222    
223                // we found a newline, this is only valid if the previous char was the '\r'
224                if (ch == '\n') {
225                    // malformed linebreak?  force this to base64 encoding.
226                    if (previousChar != '\r') {
227                        return "base64";
228                    }
229                    // hit a line end, reset our line length counter
230                    span = 0;
231                }
232                else {
233                    span++;
234                    // the text has long lines, we can't transfer this as unencoded text.
235                    if (span > 998) {
236                        return "base64";
237                    }
238    
239                    // non-ascii character, we have to transfer this in binary.
240                    if (!isAscii(ch)) {
241                        return "base64";
242                    }
243                }
244                previousChar = ch;
245            }
246        }
247    
248    
249        /**
250         * Perform RFC 2047 text folding on a string of text.
251         *
252         * @param used   The amount of text already "used up" on this line.  This is
253         *               typically the length of a message header that this text
254         *               get getting added to.
255         * @param s      The text to fold.
256         *
257         * @return The input text, with linebreaks inserted at appropriate fold points.
258         */
259        public static String fold(int used, String s) {
260            // if folding is disable, unfolding is also.  Return the string unchanged.
261            if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
262                return s;
263            }
264    
265            int end;
266    
267            // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs,
268            // and line break characters.
269            for (end = s.length() - 1; end >= 0; end--) {
270                int ch = s.charAt(end);
271                if (ch != ' ' && ch != '\t' ) {
272                    break;
273                }
274            }
275    
276            // did we actually find something to remove?  Shorten the String to the trimmed length
277            if (end != s.length() - 1) {
278                s = s.substring(0, end + 1);
279            }
280    
281            // does the string as it exists now not require folding?  We can just had that back right off.
282            if (s.length() + used <= FOLD_THRESHOLD) {
283                return s;
284            }
285    
286            // get a buffer for the length of the string, plus room for a few line breaks.
287            // these are soft line breaks, so we generally need more that just the line breaks (an escape +
288            // CR + LF + leading space on next line);
289            StringBuffer newString = new StringBuffer(s.length() + 8);
290    
291    
292            // now keep chopping this down until we've accomplished what we need.
293            while (used + s.length() > FOLD_THRESHOLD) {
294                int breakPoint = -1;
295                char breakChar = 0;
296    
297                // now scan for the next place where we can break.
298                for (int i = 0; i < s.length(); i++) {
299                    // have we passed the fold limit?
300                    if (used + i > FOLD_THRESHOLD) {
301                        // if we've already seen a blank, then stop now.  Otherwise
302                        // we keep going until we hit a fold point.
303                        if (breakPoint != -1) {
304                            break;
305                        }
306                    }
307                    char ch = s.charAt(i);
308    
309                    // a white space character?
310                    if (ch == ' ' || ch == '\t') {
311                        // this might be a run of white space, so skip over those now.
312                        breakPoint = i;
313                        // we need to maintain the same character type after the inserted linebreak.
314                        breakChar = ch;
315                        i++;
316                        while (i < s.length()) {
317                            ch = s.charAt(i);
318                            if (ch != ' ' && ch != '\t') {
319                                break;
320                            }
321                            i++;
322                        }
323                    }
324                    // found an embedded new line.  Escape this so that the unfolding process preserves it.
325                    else if (ch == '\n') {
326                        newString.append('\\');
327                        newString.append('\n');
328                    }
329                    else if (ch == '\r') {
330                        newString.append('\\');
331                        newString.append('\n');
332                        i++;
333                        // if this is a CRLF pair, add the second char also
334                        if (i < s.length() && s.charAt(i) == '\n') {
335                            newString.append('\r');
336                        }
337                    }
338    
339                }
340                // no fold point found, we punt, append the remainder and leave.
341                if (breakPoint == -1) {
342                    newString.append(s);
343                    return newString.toString();
344                }
345                newString.append(s.substring(0, breakPoint));
346                newString.append("\r\n");
347                newString.append(breakChar);
348                // chop the string
349                s = s.substring(breakPoint + 1);
350                // start again, and we've used the first char of the limit already with the whitespace char.
351                used = 1;
352            }
353    
354            // add on the remainder, and return
355            newString.append(s);
356            return newString.toString();
357        }
358    
359        /**
360         * Unfold a folded string.  The unfolding process will remove
361         * any line breaks that are not escaped and which are also followed
362         * by whitespace characters.
363         *
364         * @param s      The folded string.
365         *
366         * @return A new string with unfolding rules applied.
367         */
368        public static String unfold(String s) {
369            // if folding is disable, unfolding is also.  Return the string unchanged.
370            if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
371                return s;
372            }
373    
374            // if there are no line break characters in the string, we can just return this.
375            if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
376                return s;
377            }
378    
379            // we need to scan and fix things up.
380            int length = s.length();
381    
382            StringBuffer newString = new StringBuffer(length);
383    
384            // scan the entire string
385            for (int i = 0; i < length; i++) {
386                int ch = s.charAt(i);
387    
388                // we have a backslash.  In folded strings, escape characters are only processed as such if
389                // they preceed line breaks.  Otherwise, we leave it be.
390                if (ch == '\\') {
391                    // escape at the very end?  Just add the character.
392                    if (i == length - 1) {
393                        newString.append(ch);
394                    }
395                    else {
396                        int nextChar = s.charAt(i + 1);
397    
398                        // naked newline?  Add the new line to the buffer, and skip the escape char.
399                        if (nextChar == '\n') {
400                            newString.append('\n');
401                            i++;
402                        }
403                        else if (nextChar == '\r') {
404                            // just the CR left?  Add it, removing the escape.
405                            if (i == length - 2 || s.charAt(i + 2) != '\r') {
406                                newString.append('\r');
407                                i++;
408                            }
409                            else {
410                                // toss the escape, add both parts of the CRLF, and skip over two chars.
411                                newString.append('\r');
412                                newString.append('\n');
413                                i += 2;
414                            }
415                        }
416                        else {
417                            // an escape for another purpose, just copy it over.
418                            newString.append(ch);
419                        }
420                    }
421                }
422                // we have an unescaped line break
423                else if (ch == '\n' || ch == '\r') {
424                    // remember the position in case we need to backtrack.
425                    int lineBreak = i;
426                    boolean CRLF = false;
427    
428                    if (ch == '\r') {
429                        // check to see if we need to step over this.
430                        if (i < length - 1 && s.charAt(i + 1) == '\n') {
431                            i++;
432                            // flag the type so we know what we might need to preserve.
433                            CRLF = true;
434                        }
435                    }
436    
437                    // get a temp position scanner.
438                    int scan = i + 1;
439    
440                    // does a blank follow this new line?  we need to scrap the new line and reduce the leading blanks
441                    // down to a single blank.
442                    if (scan < length && s.charAt(scan) == ' ') {
443                        // add the character
444                        newString.append(' ');
445    
446                        // scan over the rest of the blanks
447                        i = scan + 1;
448                        while (i < length && s.charAt(i) == ' ') {
449                            i++;
450                        }
451                        // we'll increment down below, so back up to the last blank as the current char.
452                        i--;
453                    }
454                    else {
455                        // we must keep this line break.  Append the appropriate style.
456                        if (CRLF) {
457                            newString.append("\r\n");
458                        }
459                        else {
460                            newString.append(ch);
461                        }
462                    }
463                }
464                else {
465                    // just a normal, ordinary character
466                    newString.append(ch);
467                }
468            }
469            return newString.toString();
470        }
471    }