001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *  http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing,
013     * software distributed under the License is distributed on an
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015     * KIND, either express or implied.  See the License for the
016     * specific language governing permissions and limitations
017     * under the License.
018     */
019    
020    package org.apache.geronimo.mail.util;
021    
022    import java.io.BufferedInputStream;
023    import java.io.InputStream;
024    import java.io.IOException;
025    
/**
 * Static utility methods for testing whether content is plain 7-bit
 * US-ASCII text and for choosing a MIME Content-Transfer-Encoding
 * ("7bit", "quoted-printable" or "base64") for it.
 */
public class ASCIIUtil {

    /**
     * Maximum number of characters allowed on one line, NOT counting the
     * terminating CRLF, for content transferred without encoding.
     */
    private static final int MAX_LINE_LENGTH = 998;

    /**
     * Test to see if this string contains only US-ASCII (i.e., 7-bit
     * ASCII) characters, as judged by {@link #isAscii(int)}.
     *
     * @param s      The test string.
     *
     * @return true if every character is acceptable 7-bit ASCII (an empty
     *         string is trivially true), false if the string contains any
     *         character rejected by {@link #isAscii(int)}.
     */
    public static boolean isAscii(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!isAscii(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Test to see if a given character can be considered "valid" ASCII.
     * Valid characters are the printable range 32..126 plus the CR, LF
     * and tab control characters.  All other control characters below 32,
     * DEL (127) and anything above 127 are rejected.
     *
     * @param ch     The test character.
     *
     * @return true if this character meets the "ascii-ness" criteria, false
     *         otherwise.
     */
    public static boolean isAscii(int ch) {
        // these control characters are explicitly considered valid.
        if (ch == '\r' || ch == '\n' || ch == '\t') {
            return true;
        }
        // otherwise only the printable range qualifies.
        return ch >= 32 && ch < 127;
    }


    /**
     * Examine a stream of text and make a judgement on what encoding
     * type should be used for the text.  Ideally we want "7bit", but
     * quoted-printable is chosen when the text has over-long lines or a
     * minority of characters needing encoding, and base64 when the
     * characters needing encoding outnumber the plain ones.  Line break
     * characters (CR and LF) only reset the line length and are not
     * counted on either side of that ratio.
     *
     * The stream is read to EOF but is not closed; it remains owned by
     * the caller.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return "7bit", "quoted-printable" or "base64".
     *
     * @throws IOException if reading from the stream fails.
     */
    public static String getTextTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int span = 0;            // span of characters without a line break.
        boolean containsLongLines = false;
        // long counters so that very large streams cannot overflow and skew the ratio.
        long asciiChars = 0;
        long nonAsciiChars = 0;

        int ch;
        while ((ch = in.read()) != -1) {
            // either line break character resets the line length counter.  We don't
            // really need to validate the line break form here.
            if (ch == '\n' || ch == '\r') {
                span = 0;
                continue;
            }

            // once a long line has been seen the span is irrelevant, so stop counting
            // (which also keeps the counter from ever overflowing).
            if (!containsLongLines && ++span > MAX_LINE_LENGTH) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return chooseTextEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }


    /**
     * Examine a string of text and make a judgement on what encoding
     * type should be used for the text.  Ideally we want "7bit", but
     * quoted-printable is chosen when the text has over-long lines or a
     * minority of characters needing encoding, and base64 when the
     * characters needing encoding outnumber the plain ones.  In this
     * variant CR and LF characters are counted as plain ASCII characters
     * in that ratio.
     *
     * @param content     A string for the content we're examining.
     *
     * @return "7bit", "quoted-printable" or "base64".
     */
    public static String getTextTransferEncoding(String content) {

        int span = 0;            // span of characters without a line break.
        boolean containsLongLines = false;
        long asciiChars = 0;
        long nonAsciiChars = 0;

        for (int i = 0; i < content.length(); i++) {
            int ch = content.charAt(i);

            // track line length so that over-long lines are not reported as "7bit".
            if (ch == '\n' || ch == '\r') {
                span = 0;
            }
            else if (!containsLongLines && ++span > MAX_LINE_LENGTH) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return chooseTextEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }


    /**
     * Determine whether content can be transferred as-is using "7bit", or
     * must be base64 encoded.  For "7bit" to be returned, all characters
     * must be valid 7-bit ASCII, all line breaks must be properly formed
     * (JUST '\r\n' sequences; a lone '\r' or '\n' is malformed), and no
     * line may exceed 998 characters, not counting its CRLF terminator.
     * Anything else forces "base64".
     *
     * The stream is read only as far as needed to decide and is not
     * closed; it remains owned by the caller.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return "7bit" or "base64".
     *
     * @throws IOException if reading from the stream fails.
     */
    public static String getBinaryTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int previousChar = 0;
        int span = 0;            // span of characters without a line break.

        while (true) {
            int ch = in.read();
            if (ch == -1) {
                // only valid text so far; a dangling '\r' at EOF is still a
                // malformed line break though.
                return previousChar == '\r' ? "base64" : "7bit";
            }

            if (ch == '\n') {
                // a newline is only valid if the previous char was the '\r'
                if (previousChar != '\r') {
                    return "base64";
                }
                // hit a line end, reset our line length counter
                span = 0;
            }
            else {
                // a '\r' that was not followed by '\n' is a malformed line break.
                if (previousChar == '\r') {
                    return "base64";
                }
                // a '\r' is (potentially) part of the line terminator, so it neither
                // counts toward the line length nor needs the ascii test.
                if (ch != '\r') {
                    // non-ascii character, we have to transfer this in binary.
                    if (!isAscii(ch)) {
                        return "base64";
                    }
                    // the text has long lines, we can't transfer this as unencoded text.
                    if (++span > MAX_LINE_LENGTH) {
                        return "base64";
                    }
                }
            }
            previousChar = ch;
        }
    }

    /**
     * Pick the transfer encoding for text given its character statistics.
     *
     * @param asciiChars        Number of characters usable as-is.
     * @param nonAsciiChars     Number of characters that require encoding.
     * @param containsLongLines true if some line exceeded the line length limit.
     *
     * @return "7bit", "quoted-printable" or "base64".
     */
    private static String chooseTextEncoding(long asciiChars, long nonAsciiChars, boolean containsLongLines) {
        if (nonAsciiChars == 0) {
            // only valid chars; Q-P is only slightly longer than the raw text but
            // handles folding of over-long lines.
            return containsLongLines ? "quoted-printable" : "7bit";
        }
        // mostly characters requiring encoding?  Base64 is our best bet, otherwise
        // Q-P encoding will use fewer bytes than the full Base64.
        return nonAsciiChars > asciiChars ? "base64" : "quoted-printable";
    }
}