HeaderTokenizer xref

View Javadoc

1   /**
2    *
3    * Copyright 2003-2006 The Apache Software Foundation
4    *
5    *  Licensed under the Apache License, Version 2.0 (the "License");
6    *  you may not use this file except in compliance with the License.
7    *  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  
18  package javax.mail.internet;
19  
20  /**
21   * @version $Rev: 421852 $ $Date: 2006-07-14 03:02:19 -0700 (Fri, 14 Jul 2006) $
22   */
23  public class HeaderTokenizer {
24      public static class Token {
25          // Constant values from J2SE 1.4 API Docs (Constant values)
26          public static final int ATOM = -1;
27          public static final int COMMENT = -3;
28          public static final int EOF = -4;
29          public static final int QUOTEDSTRING = -2;
30          private int _type;
31          private String _value;
32  
33          public Token(int type, String value) {
34              _type = type;
35              _value = value;
36          }
37  
38          public int getType() {
39              return _type;
40          }
41  
42          public String getValue() {
43              return _value;
44          }
45      }
46  
47      private static final Token EOF = new Token(Token.EOF, null);
48      // characters not allowed in MIME
49      public static final String MIME = "()<>@,;:\\\"\t []/?=";
50      // charaters not allowed in RFC822
51      public static final String RFC822 = "()<>@,;:\\\"\t .[]";
52      private static final String WHITE = " \t\n\r";
53      private String _delimiters;
54      private String _header;
55      private boolean _skip;
56      private int pos;
57  
58      public HeaderTokenizer(String header) {
59          this(header, RFC822);
60      }
61  
62      public HeaderTokenizer(String header, String delimiters) {
63          this(header, delimiters, true);
64      }
65  
66      public HeaderTokenizer(String header,
67                             String delimiters,
68                             boolean skipComments) {
69          _skip = skipComments;
70          _header = header;
71          _delimiters = delimiters;
72      }
73  
74      public String getRemainder() {
75          return _header.substring(pos);
76      }
77  
78      public Token next() throws ParseException {
79          return readToken();
80      }
81  
82      public Token peek() throws ParseException {
83          int start = pos;
84          try {
85              return readToken();
86          } finally {
87              pos = start;
88          }
89      }
90  
91      /**
92       * Read an ATOM token from the parsed header.
93       *
94       * @return A token containing the value of the atom token.
95       */
96      private Token readAtomicToken() {
97          // skip to next delimiter
98          int start = pos;
99          while (++pos < _header.length()) {
100             // break on the first non-atom character.
101             char ch = _header.charAt(pos);
102             if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) {
103                 break;
104             }
105         }
106 
107         return new Token(Token.ATOM, _header.substring(start, pos));
108     }
109 
110     /**
111      * Read the next token from the header.
112      *
113      * @return The next token from the header.  White space is skipped, and comment
114      *         tokens are also skipped if indicated.
115      * @exception ParseException
116      */
117     private Token readToken() throws ParseException {
118         if (pos >= _header.length()) {
119             return EOF;
120         } else {
121             char c = _header.charAt(pos);
122             // comment token...read and skip over this
123             if (c == '(') {
124                 Token comment = readComment();
125                 if (_skip) {
126                     return readToken();
127                 } else {
128                     return comment;
129                 }
130                 // quoted literal
131             } else if (c == '\"') {
132                 return readQuotedString();
133             // white space, eat this and find a real token.
134             } else if (WHITE.indexOf(c) != -1) {
135                 eatWhiteSpace();
136                 return readToken();
137             // either a CTL or special.  These characters have a self-defining token type.
138             } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) {
139                 pos++;
140                 return new Token((int)c, String.valueOf(c));
141             } else {
142                 // start of an atom, parse it off.
143                 return readAtomicToken();
144             }
145         }
146     }
147 
148     /**
149      * Extract a substring from the header string and apply any
150      * escaping/folding rules to the string.
151      *
152      * @param start  The starting offset in the header.
153      * @param end    The header end offset + 1.
154      *
155      * @return The processed string value.
156      * @exception ParseException
157      */
158     private String getEscapedValue(int start, int end) throws ParseException {
159         StringBuffer value = new StringBuffer();
160 
161         for (int i = start; i < end; i++) {
162             char ch = _header.charAt(i);
163             // is this an escape character?
164             if (ch == '\\') {
165                 i++;
166                 if (i == end) {
167                     throw new ParseException("Invalid escape character");
168                 }
169                 value.append(_header.charAt(i));
170             }
171             // line breaks are ignored, except for naked '\n' characters, which are consider
172             // parts of linear whitespace.
173             else if (ch == '\r') {
174                 // see if this is a CRLF sequence, and skip the second if it is.
175                 if (i < end - 1 && _header.charAt(i + 1) == '\n') {
176                     i++;
177                 }
178             }
179             else {
180                 // just append the ch value.
181                 value.append(ch);
182             }
183         }
184         return value.toString();
185     }
186 
187     /**
188      * Read a comment from the header, applying nesting and escape
189      * rules to the content.
190      *
191      * @return A comment token with the token value.
192      * @exception ParseException
193      */
194     private Token readComment() throws ParseException {
195         int start = pos + 1;
196         int nesting = 1;
197 
198         boolean requiresEscaping = false;
199 
200         // skip to end of comment/string
201         while (++pos < _header.length()) {
202             char ch = _header.charAt(pos);
203             if (ch == ')') {
204                 nesting--;
205                 if (nesting == 0) {
206                     break;
207                 }
208             }
209             else if (ch == '(') {
210                 nesting++;
211             }
212             else if (ch == '\\') {
213                 pos++;
214                 requiresEscaping = true;
215             }
216             // we need to process line breaks also
217             else if (ch == '\r') {
218                 requiresEscaping = true;
219             }
220         }
221 
222         if (nesting != 0) {
223             throw new ParseException("Unbalanced comments");
224         }
225 
226         String value;
227         if (requiresEscaping) {
228             value = getEscapedValue(start, pos);
229         }
230         else {
231             value = _header.substring(start, pos++);
232         }
233         return new Token(Token.COMMENT, value);
234     }
235 
236     /**
237      * Parse out a quoted string from the header, applying escaping
238      * rules to the value.
239      *
240      * @return The QUOTEDSTRING token with the value.
241      * @exception ParseException
242      */
243     private Token readQuotedString() throws ParseException {
244         int start = pos+1;
245         boolean requiresEscaping = false;
246 
247         // skip to end of comment/string
248         while (++pos < _header.length()) {
249             char ch = _header.charAt(pos);
250             if (ch == '"') {
251                 String value;
252                 if (requiresEscaping) {
253                     value = getEscapedValue(start, pos);
254                 }
255                 else {
256                     value = _header.substring(start, pos++);
257                 }
258                 return new Token(Token.QUOTEDSTRING, value);
259             }
260             else if (ch == '\\') {
261                 pos++;
262                 requiresEscaping = true;
263             }
264             // we need to process line breaks also
265             else if (ch == '\r') {
266                 requiresEscaping = true;
267             }
268         }
269 
270         throw new ParseException("Missing '\"'");
271     }
272 
273     /**
274      * Skip white space in the token string.
275      */
276     private void eatWhiteSpace() {
277         // skip to end of whitespace
278         while (++pos < _header.length()
279                 && WHITE.indexOf(_header.charAt(pos)) != -1)
280             ;
281     }
282 }