HeaderTokenizer xref

View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *  http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package javax.mail.internet;
21  
22  /**
23   * @version $Rev: 729233 $ $Date: 2008-12-24 00:08:45 -0500 (Wed, 24 Dec 2008) $
24   */
25  public class HeaderTokenizer {
26      public static class Token {
27          // Constant values from J2SE 1.4 API Docs (Constant values)
28          public static final int ATOM = -1;
29          public static final int COMMENT = -3;
30          public static final int EOF = -4;
31          public static final int QUOTEDSTRING = -2;
32          private int _type;
33          private String _value;
34  
35          public Token(int type, String value) {
36              _type = type;
37              _value = value;
38          }
39  
40          public int getType() {
41              return _type;
42          }
43  
44          public String getValue() {
45              return _value;
46          }
47      }
48  
49      private static final Token EOF = new Token(Token.EOF, null);
50      // characters not allowed in MIME
51      public static final String MIME = "()<>@,;:\\\"\t []/?=";
52      // charaters not allowed in RFC822
53      public static final String RFC822 = "()<>@,;:\\\"\t .[]";
54      private static final String WHITE = " \t\n\r";
55      private String _delimiters;
56      private String _header;
57      private boolean _skip;
58      private int pos;
59  
60      public HeaderTokenizer(String header) {
61          this(header, RFC822);
62      }
63  
64      public HeaderTokenizer(String header, String delimiters) {
65          this(header, delimiters, true);
66      }
67  
68      public HeaderTokenizer(String header,
69                             String delimiters,
70                             boolean skipComments) {
71          _skip = skipComments;
72          _header = header;
73          _delimiters = delimiters;
74      }
75  
76      public String getRemainder() {
77          return _header.substring(pos);
78      }
79  
80      public Token next() throws ParseException {
81          return readToken();
82      }
83  
84      public Token peek() throws ParseException {
85          int start = pos;
86          try {
87              return readToken();
88          } finally {
89              pos = start;
90          }
91      }
92  
93      /**
94       * Read an ATOM token from the parsed header.
95       *
96       * @return A token containing the value of the atom token.
97       */
98      private Token readAtomicToken() {
99          // skip to next delimiter
100         int start = pos;
101         while (++pos < _header.length()) {
102             // break on the first non-atom character.
103             char ch = _header.charAt(pos);
104             if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) {
105                 break;
106             }
107         }
108 
109         return new Token(Token.ATOM, _header.substring(start, pos));
110     }
111 
112     /**
113      * Read the next token from the header.
114      *
115      * @return The next token from the header.  White space is skipped, and comment
116      *         tokens are also skipped if indicated.
117      * @exception ParseException
118      */
119     private Token readToken() throws ParseException {
120         if (pos >= _header.length()) {
121             return EOF;
122         } else {
123             char c = _header.charAt(pos);
124             // comment token...read and skip over this
125             if (c == '(') {
126                 Token comment = readComment();
127                 if (_skip) {
128                     return readToken();
129                 } else {
130                     return comment;
131                 }
132                 // quoted literal
133             } else if (c == '\"') {
134                 return readQuotedString();
135             // white space, eat this and find a real token.
136             } else if (WHITE.indexOf(c) != -1) {
137                 eatWhiteSpace();
138                 return readToken();
139             // either a CTL or special.  These characters have a self-defining token type.
140             } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) {
141                 pos++;
142                 return new Token((int)c, String.valueOf(c));
143             } else {
144                 // start of an atom, parse it off.
145                 return readAtomicToken();
146             }
147         }
148     }
149 
150     /**
151      * Extract a substring from the header string and apply any
152      * escaping/folding rules to the string.
153      *
154      * @param start  The starting offset in the header.
155      * @param end    The header end offset + 1.
156      *
157      * @return The processed string value.
158      * @exception ParseException
159      */
160     private String getEscapedValue(int start, int end) throws ParseException {
161         StringBuffer value = new StringBuffer();
162 
163         for (int i = start; i < end; i++) {
164             char ch = _header.charAt(i);
165             // is this an escape character?
166             if (ch == '\\') {
167                 i++;
168                 if (i == end) {
169                     throw new ParseException("Invalid escape character");
170                 }
171                 value.append(_header.charAt(i));
172             }
173             // line breaks are ignored, except for naked '\n' characters, which are consider
174             // parts of linear whitespace.
175             else if (ch == '\r') {
176                 // see if this is a CRLF sequence, and skip the second if it is.
177                 if (i < end - 1 && _header.charAt(i + 1) == '\n') {
178                     i++;
179                 }
180             }
181             else {
182                 // just append the ch value.
183                 value.append(ch);
184             }
185         }
186         return value.toString();
187     }
188 
189     /**
190      * Read a comment from the header, applying nesting and escape
191      * rules to the content.
192      *
193      * @return A comment token with the token value.
194      * @exception ParseException
195      */
196     private Token readComment() throws ParseException {
197         int start = pos + 1;
198         int nesting = 1;
199 
200         boolean requiresEscaping = false;
201 
202         // skip to end of comment/string
203         while (++pos < _header.length()) {
204             char ch = _header.charAt(pos);
205             if (ch == ')') {
206                 nesting--;
207                 if (nesting == 0) {
208                     break;
209                 }
210             }
211             else if (ch == '(') {
212                 nesting++;
213             }
214             else if (ch == '\\') {
215                 pos++;
216                 requiresEscaping = true;
217             }
218             // we need to process line breaks also
219             else if (ch == '\r') {
220                 requiresEscaping = true;
221             }
222         }
223 
224         if (nesting != 0) {
225             throw new ParseException("Unbalanced comments");
226         }
227 
228         String value;
229         if (requiresEscaping) {
230             value = getEscapedValue(start, pos);
231         }
232         else {
233             value = _header.substring(start, pos++);
234         }
235         return new Token(Token.COMMENT, value);
236     }
237 
238     /**
239      * Parse out a quoted string from the header, applying escaping
240      * rules to the value.
241      *
242      * @return The QUOTEDSTRING token with the value.
243      * @exception ParseException
244      */
245     private Token readQuotedString() throws ParseException {
246         int start = pos+1;
247         boolean requiresEscaping = false;
248 
249         // skip to end of comment/string
250         while (++pos < _header.length()) {
251             char ch = _header.charAt(pos);
252             if (ch == '"') {
253                 String value;
254                 if (requiresEscaping) {
255                     value = getEscapedValue(start, pos++);
256                 }
257                 else {
258                     value = _header.substring(start, pos++);
259                 }
260                 return new Token(Token.QUOTEDSTRING, value);
261             }
262             else if (ch == '\\') {
263                 pos++;
264                 requiresEscaping = true;
265             }
266             // we need to process line breaks also
267             else if (ch == '\r') {
268                 requiresEscaping = true;
269             }
270         }
271 
272         throw new ParseException("Missing '\"'");
273     }
274 
275     /**
276      * Skip white space in the token string.
277      */
278     private void eatWhiteSpace() {
279         // skip to end of whitespace
280         while (++pos < _header.length()
281                 && WHITE.indexOf(_header.charAt(pos)) != -1)
282             ;
283     }
284 }