001    /**
002     *
003     * Copyright 2003-2006 The Apache Software Foundation
004     *
005     *  Licensed under the Apache License, Version 2.0 (the "License");
006     *  you may not use this file except in compliance with the License.
007     *  You may obtain a copy of the License at
008     *
009     *     http://www.apache.org/licenses/LICENSE-2.0
010     *
011     *  Unless required by applicable law or agreed to in writing, software
012     *  distributed under the License is distributed on an "AS IS" BASIS,
013     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     *  See the License for the specific language governing permissions and
015     *  limitations under the License.
016     */
017    
018    package org.apache.geronimo.mail.util;
019    
020    import java.io.BufferedInputStream;
021    import java.io.InputStream;
022    import java.io.IOException;
023    
024    
/**
 * Set of utility methods for handling common encoding-related
 * manipulations.
 */
public class ASCIIUtil {

    /**
     * Maximum number of characters allowed on a single line, not counting
     * the terminating CRLF, for content transferred without encoding.
     */
    private static final int MAX_LINE_LENGTH = 998;

    /**
     * Test to see if this string contains only US-ASCII (i.e., 7-bit
     * ASCII) characters, as judged by {@link #isAscii(int)}.
     *
     * @param s      The test string.
     *
     * @return true if every character passes {@link #isAscii(int)} (an
     *         empty string therefore returns true), false if the string
     *         contains any character that does not.
     */
    public static boolean isAscii(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!isAscii(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Test to see if a given character can be considered "valid" ASCII.
     * The accepted characters are the printable range 32 through 126,
     * plus the CR, LF and tab control characters.  All other control
     * characters below 32, the DEL character (127) and everything above
     * it are rejected.
     *
     * @param ch     The test character.
     *
     * @return true if this character meets the "ascii-ness" criteria, false
     *         otherwise.
     */
    public static boolean isAscii(int ch) {
        // these control characters are explicitly considered valid.
        if (ch == '\r' || ch == '\n' || ch == '\t') {
            return true;
        }
        // anything else must fall inside the printable range.
        return ch >= 32 && ch < 127;
    }


    /**
     * Examine a stream of text and make a judgement on what encoding
     * type should be used for the text.  Ideally the text can be sent
     * as 7bit, but quoted-printable or base64 may be required.  The choice
     * between those two is made on the ratio of 7-bit characters to non-7bit
     * characters; CR and LF are treated purely as line terminators and are
     * not included in either count.
     *
     * The stream is read through to its end.  It is not closed by this method.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return "7bit" if all characters are valid ASCII and no line exceeds
     *         998 characters; "quoted-printable" if the content is all ASCII
     *         but has long lines, or if non-ASCII characters do not outnumber
     *         the ASCII ones; "base64" otherwise.
     *
     * @exception IOException if reading from the stream fails.
     */
    public static String getTextTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int span = 0;            // span of characters without a line break.
        boolean containsLongLines = false;
        // long counters so that very large streams cannot overflow the tallies.
        long asciiChars = 0;
        long nonAsciiChars = 0;

        int ch;
        while ((ch = in.read()) != -1) {
            // either line break character resets the line length counter.  We don't
            // really need to validate the line break form here.
            if (ch == '\n' || ch == '\r') {
                span = 0;
                continue;
            }

            // only bother counting until the first over-long line has been seen.
            if (!containsLongLines && ++span > MAX_LINE_LENGTH) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return selectTextEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }


    /**
     * Examine a string of text and make a judgement on what encoding
     * type should be used for the text.  Ideally the text can be sent
     * as 7bit, but quoted-printable or base64 may be required.  The choice
     * between those two is made on the ratio of 7-bit characters to non-7bit
     * characters.  In this variant every character, including CR and LF,
     * takes part in the ratio.
     *
     * @param content     A string for the content we're examining.
     *
     * @return "7bit" if all characters are valid ASCII and no line exceeds
     *         998 characters; "quoted-printable" if the content is all ASCII
     *         but has long lines, or if non-ASCII characters do not outnumber
     *         the ASCII ones; "base64" otherwise.
     */
    public static String getTextTransferEncoding(String content) {

        int span = 0;            // span of characters without a line break.
        boolean containsLongLines = false;
        int asciiChars = 0;
        int nonAsciiChars = 0;

        for (int i = 0; i < content.length(); i++) {
            int ch = content.charAt(i);

            // track line lengths the same way the stream version does, so that
            // over-long lines are never reported as transferable in 7bit form.
            if (ch == '\n' || ch == '\r') {
                span = 0;
            }
            else if (!containsLongLines && ++span > MAX_LINE_LENGTH) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return selectTextEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }


    /**
     * Pick the transfer encoding for text content from the collected statistics.
     *
     * @param asciiChars        Count of characters that are valid ASCII.
     * @param nonAsciiChars     Count of characters that are not valid ASCII.
     * @param containsLongLines true if some line is longer than 998 characters.
     *
     * @return "7bit", "quoted-printable" or "base64".
     */
    private static String selectTextEncoding(long asciiChars, long nonAsciiChars, boolean containsLongLines) {
        if (nonAsciiChars == 0) {
            // only valid characters.  Long lines still need Q-P encoding, which is
            // only slightly longer but handles folding the longer lines.
            return containsLongLines ? "quoted-printable" : "7bit";
        }
        // mostly characters requiring encoding?  Base64 is our best bet.  Otherwise
        // Q-P encoding will use fewer bytes than the full Base64.
        return nonAsciiChars > asciiChars ? "base64" : "quoted-printable";
    }


    /**
     * Determine if the content looks like it might be valid ascii text,
     * and thus transferable as 7bit code.  In order for this to be true,
     * all characters must be valid 7-bit ASCII code AND all line breaks
     * must be properly formed (JUST '\r\n' sequences; a CR or LF on its
     * own forces Base64).  7-bit transfers also have a line limit of
     * 1000 bytes (998 + the CRLF), so any line with more than 998
     * characters before its CRLF will also force Base64 encoding.
     *
     * Reading stops at the first character that forces Base64.  The stream
     * is not closed by this method.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return "7bit" if the entire stream satisfies the rules above,
     *         "base64" otherwise.
     *
     * @exception IOException if reading from the stream fails.
     */
    public static String getBinaryTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int previousChar = 0;
        int span = 0;            // span of characters without a line break.

        while (true) {
            int ch = in.read();
            if (ch == -1) {
                // a CR left dangling at the end is a malformed line break.  Otherwise
                // we've only found valid text, so this can be transferred as 7-bit ascii.
                return previousChar == '\r' ? "base64" : "7bit";
            }

            if (ch == '\n') {
                // a newline is only valid if the previous char was the '\r'
                if (previousChar != '\r') {
                    return "base64";
                }
                // hit a line end, reset our line length counter
                span = 0;
            }
            else {
                // a '\r' must be immediately followed by '\n'.
                if (previousChar == '\r') {
                    return "base64";
                }
                // the '\r' belongs to the line terminator, so it does not count
                // against the 998 character limit.
                if (ch != '\r') {
                    span++;
                    // the text has long lines, we can't transfer this as unencoded text.
                    if (span > MAX_LINE_LENGTH) {
                        return "base64";
                    }
                    // non-ascii character, we have to transfer this in binary.
                    if (!isAscii(ch)) {
                        return "base64";
                    }
                }
            }
            previousChar = ch;
        }
    }
}