ASCIIUtil xref

View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *  http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.geronimo.mail.util;
21  
22  import java.io.BufferedInputStream;
23  import java.io.InputStream;
24  import java.io.IOException;
25  
/**
 * Set of utility methods for handling common encoding-related
 * manipulations, mainly choosing a MIME Content-Transfer-Encoding
 * ("7bit", "quoted-printable" or "base64") for a piece of content.
 */
public class ASCIIUtil {

    /**
     * Longest line, not counting the terminating CRLF, that may be
     * transferred without a content transfer encoding.
     */
    private static final int MAX_LINE_LENGTH = 998;

    private static final String ENCODING_7BIT = "7bit";
    private static final String ENCODING_QUOTED_PRINTABLE = "quoted-printable";
    private static final String ENCODING_BASE64 = "base64";

    /**
     * Test to see if this string contains only US-ASCII (i.e., 7-bit
     * ASCII) characters, as judged by {@link #isAscii(int)}.
     *
     * @param s      The test string.
     *
     * @return true if this is a valid 7-bit ASCII encoding, false if it
     *         contains any non-US ASCII characters.
     */
    public static boolean isAscii(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!isAscii(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Test to see if a given character can be considered "valid" ASCII.
     * The accepted characters are the printable range 32 through 126 plus
     * the CR, LF and tab characters.  Everything else is rejected: the
     * other control characters below 32, DEL (127) and any value above 127.
     *
     * @param ch     The test character.
     *
     * @return true if this character meets the "ascii-ness" criteria, false
     *         otherwise.
     */
    public static boolean isAscii(int ch) {
        // these are explicitly considered valid.
        if (ch == '\r' || ch == '\n' || ch == '\t') {
            return true;
        }

        // anything else must fall inside the printable range.
        return ch >= 32 && ch < 127;
    }


    /**
     * Examine a stream of text and make a judgement on what encoding
     * type should be used for the text.  Ideally, we want to use 7bit
     * encoding, but we may need to use either quoted-printable
     * or base64.  The choice is made on the ratio of 7-bit characters to
     * non-7bit ones; line break characters are not counted in that ratio.
     * The stream is read to its end and is not closed.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return "7bit", "quoted-printable" or "base64".
     *
     * @exception IOException if reading from the stream fails.
     */
    public static String getTextTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int span = 0;            // span of characters without a line break.
        boolean containsLongLines = false;
        // long counters so that very large streams cannot overflow the ratio.
        long asciiChars = 0;
        long nonAsciiChars = 0;

        int ch;
        while ((ch = in.read()) != -1) {
            // we found a linebreak.  Reset the line length counter on either one.  We don't
            // really need to validate here.
            if (ch == '\n' || ch == '\r') {
                span = 0;
                continue;
            }

            // once a long line has been seen the answer cannot change, so stop counting
            // (this also keeps span from ever wrapping around).
            if (!containsLongLines && ++span > MAX_LINE_LENGTH) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return chooseTextEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }


    /**
     * Examine a string of text and make a judgement on what encoding
     * type should be used for the text.  Ideally, we want to use 7bit
     * encoding, but we may need to use either quoted-printable
     * or base64.  The choice is made on the ratio of 7-bit characters to
     * non-7bit ones (CR and LF count as 7-bit characters here).  Text that is
     * pure ASCII but contains a line longer than 998 characters is reported
     * as quoted-printable, matching the stream based variant.
     *
     * @param content     A string for the content we're examining.
     *
     * @return "7bit", "quoted-printable" or "base64".
     */
    public static String getTextTransferEncoding(String content) {

        int span = 0;            // span of characters without a line break.
        boolean containsLongLines = false;
        int asciiChars = 0;
        int nonAsciiChars = 0;

        for (int i = 0; i < content.length(); i++) {
            int ch = content.charAt(i);

            if (ch == '\n' || ch == '\r') {
                // hit a line end, reset our line length counter
                span = 0;
            }
            else if (!containsLongLines && ++span > MAX_LINE_LENGTH) {
                // the text has long lines, we can't transfer this as unencoded text.
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            }
            else {
                nonAsciiChars++;
            }
        }

        return chooseTextEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }


    /**
     * Determine if the content looks like it might be
     * valid ascii text, and thus transferable as 7bit code.  In
     * order for this to be true, all characters must be valid
     * 7-bit ASCII code AND all line breaks must be properly formed
     * (JUST '\r\n' sequences; a lone '\r' or a lone '\n' forces Base64).
     * 7-bit transfers also have a line limit of 1000 bytes (998 + the CRLF),
     * so any line of more than 998 characters before its CRLF will also
     * force Base64 encoding.  Reading stops at the first violation; the
     * stream is not closed.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return "7bit" if the whole stream qualifies, otherwise "base64".
     *
     * @exception IOException if reading from the stream fails.
     */
    public static String getBinaryTransferEncoding(InputStream content) throws IOException {

        // for efficiency, we'll read in blocks.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        int previousChar = 0;
        int span = 0;            // span of characters without a line break.

        while (true) {
            int ch = in.read();
            if (ch == -1) {
                // a dangling '\r' at the very end is a malformed line break; otherwise
                // we've only found valid text, so we can transfer this as 7-bit ascii.
                return previousChar == '\r' ? ENCODING_BASE64 : ENCODING_7BIT;
            }

            if (ch == '\n') {
                // a newline is only valid if the previous char was the '\r'
                if (previousChar != '\r') {
                    return ENCODING_BASE64;
                }
                // hit a line end, reset our line length counter
                span = 0;
            }
            else {
                // a '\r' must be immediately followed by '\n'.
                if (previousChar == '\r') {
                    return ENCODING_BASE64;
                }

                // the '\r' is part of the line terminator, so it neither counts toward
                // the line length nor needs validating; its '\n' is checked next time.
                if (ch != '\r') {
                    // non-ascii character, we have to transfer this in binary.
                    if (!isAscii(ch)) {
                        return ENCODING_BASE64;
                    }
                    // the text has long lines, we can't transfer this as unencoded text.
                    if (++span > MAX_LINE_LENGTH) {
                        return ENCODING_BASE64;
                    }
                }
            }
            previousChar = ch;
        }
    }

    /**
     * Pick the transfer encoding for text from the statistics gathered while
     * scanning it.
     *
     * @param asciiChars        number of characters accepted by isAscii.
     * @param nonAsciiChars     number of characters rejected by isAscii.
     * @param containsLongLines true if some line exceeded the 998 character limit.
     *
     * @return "7bit", "quoted-printable" or "base64".
     */
    private static String chooseTextEncoding(long asciiChars, long nonAsciiChars, boolean containsLongLines) {
        if (nonAsciiChars == 0) {
            // only valid chars here.  Long lines need Q-P, which will be only slightly
            // longer but handles folding the longer lines; otherwise 7bit is ideal.
            return containsLongLines ? ENCODING_QUOTED_PRINTABLE : ENCODING_7BIT;
        }

        // mostly characters requiring encoding?  Base64 is our best bet; otherwise
        // Q-P encoding will use fewer bytes than the full Base64.
        return nonAsciiChars > asciiChars ? ENCODING_BASE64 : ENCODING_QUOTED_PRINTABLE;
    }
}