1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 package javax.mail.internet;
21
22 /**
23 * @version $Rev: 467553 $ $Date: 2006-10-25 00:01:51 -0400 (Wed, 25 Oct 2006) $
24 */
25 public class HeaderTokenizer {
26 public static class Token {
27 // Constant values from J2SE 1.4 API Docs (Constant values)
28 public static final int ATOM = -1;
29 public static final int COMMENT = -3;
30 public static final int EOF = -4;
31 public static final int QUOTEDSTRING = -2;
32 private int _type;
33 private String _value;
34
35 public Token(int type, String value) {
36 _type = type;
37 _value = value;
38 }
39
40 public int getType() {
41 return _type;
42 }
43
44 public String getValue() {
45 return _value;
46 }
47 }
48
49 private static final Token EOF = new Token(Token.EOF, null);
50 // characters not allowed in MIME
51 public static final String MIME = "()<>@,;:\\\"\t []/?=";
52 // charaters not allowed in RFC822
53 public static final String RFC822 = "()<>@,;:\\\"\t .[]";
54 private static final String WHITE = " \t\n\r";
55 private String _delimiters;
56 private String _header;
57 private boolean _skip;
58 private int pos;
59
60 public HeaderTokenizer(String header) {
61 this(header, RFC822);
62 }
63
64 public HeaderTokenizer(String header, String delimiters) {
65 this(header, delimiters, true);
66 }
67
68 public HeaderTokenizer(String header,
69 String delimiters,
70 boolean skipComments) {
71 _skip = skipComments;
72 _header = header;
73 _delimiters = delimiters;
74 }
75
76 public String getRemainder() {
77 return _header.substring(pos);
78 }
79
80 public Token next() throws ParseException {
81 return readToken();
82 }
83
84 public Token peek() throws ParseException {
85 int start = pos;
86 try {
87 return readToken();
88 } finally {
89 pos = start;
90 }
91 }
92
93 /**
94 * Read an ATOM token from the parsed header.
95 *
96 * @return A token containing the value of the atom token.
97 */
98 private Token readAtomicToken() {
99 // skip to next delimiter
100 int start = pos;
101 while (++pos < _header.length()) {
102 // break on the first non-atom character.
103 char ch = _header.charAt(pos);
104 if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) {
105 break;
106 }
107 }
108
109 return new Token(Token.ATOM, _header.substring(start, pos));
110 }
111
112 /**
113 * Read the next token from the header.
114 *
115 * @return The next token from the header. White space is skipped, and comment
116 * tokens are also skipped if indicated.
117 * @exception ParseException
118 */
119 private Token readToken() throws ParseException {
120 if (pos >= _header.length()) {
121 return EOF;
122 } else {
123 char c = _header.charAt(pos);
124 // comment token...read and skip over this
125 if (c == '(') {
126 Token comment = readComment();
127 if (_skip) {
128 return readToken();
129 } else {
130 return comment;
131 }
132 // quoted literal
133 } else if (c == '\"') {
134 return readQuotedString();
135 // white space, eat this and find a real token.
136 } else if (WHITE.indexOf(c) != -1) {
137 eatWhiteSpace();
138 return readToken();
139 // either a CTL or special. These characters have a self-defining token type.
140 } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) {
141 pos++;
142 return new Token((int)c, String.valueOf(c));
143 } else {
144 // start of an atom, parse it off.
145 return readAtomicToken();
146 }
147 }
148 }
149
150 /**
151 * Extract a substring from the header string and apply any
152 * escaping/folding rules to the string.
153 *
154 * @param start The starting offset in the header.
155 * @param end The header end offset + 1.
156 *
157 * @return The processed string value.
158 * @exception ParseException
159 */
160 private String getEscapedValue(int start, int end) throws ParseException {
161 StringBuffer value = new StringBuffer();
162
163 for (int i = start; i < end; i++) {
164 char ch = _header.charAt(i);
165 // is this an escape character?
166 if (ch == '\\') {
167 i++;
168 if (i == end) {
169 throw new ParseException("Invalid escape character");
170 }
171 value.append(_header.charAt(i));
172 }
173 // line breaks are ignored, except for naked '\n' characters, which are consider
174 // parts of linear whitespace.
175 else if (ch == '\r') {
176 // see if this is a CRLF sequence, and skip the second if it is.
177 if (i < end - 1 && _header.charAt(i + 1) == '\n') {
178 i++;
179 }
180 }
181 else {
182 // just append the ch value.
183 value.append(ch);
184 }
185 }
186 return value.toString();
187 }
188
189 /**
190 * Read a comment from the header, applying nesting and escape
191 * rules to the content.
192 *
193 * @return A comment token with the token value.
194 * @exception ParseException
195 */
196 private Token readComment() throws ParseException {
197 int start = pos + 1;
198 int nesting = 1;
199
200 boolean requiresEscaping = false;
201
202 // skip to end of comment/string
203 while (++pos < _header.length()) {
204 char ch = _header.charAt(pos);
205 if (ch == ')') {
206 nesting--;
207 if (nesting == 0) {
208 break;
209 }
210 }
211 else if (ch == '(') {
212 nesting++;
213 }
214 else if (ch == '\\') {
215 pos++;
216 requiresEscaping = true;
217 }
218 // we need to process line breaks also
219 else if (ch == '\r') {
220 requiresEscaping = true;
221 }
222 }
223
224 if (nesting != 0) {
225 throw new ParseException("Unbalanced comments");
226 }
227
228 String value;
229 if (requiresEscaping) {
230 value = getEscapedValue(start, pos);
231 }
232 else {
233 value = _header.substring(start, pos++);
234 }
235 return new Token(Token.COMMENT, value);
236 }
237
238 /**
239 * Parse out a quoted string from the header, applying escaping
240 * rules to the value.
241 *
242 * @return The QUOTEDSTRING token with the value.
243 * @exception ParseException
244 */
245 private Token readQuotedString() throws ParseException {
246 int start = pos+1;
247 boolean requiresEscaping = false;
248
249 // skip to end of comment/string
250 while (++pos < _header.length()) {
251 char ch = _header.charAt(pos);
252 if (ch == '"') {
253 String value;
254 if (requiresEscaping) {
255 value = getEscapedValue(start, pos);
256 }
257 else {
258 value = _header.substring(start, pos++);
259 }
260 return new Token(Token.QUOTEDSTRING, value);
261 }
262 else if (ch == '\\') {
263 pos++;
264 requiresEscaping = true;
265 }
266 // we need to process line breaks also
267 else if (ch == '\r') {
268 requiresEscaping = true;
269 }
270 }
271
272 throw new ParseException("Missing '\"'");
273 }
274
275 /**
276 * Skip white space in the token string.
277 */
278 private void eatWhiteSpace() {
279 // skip to end of whitespace
280 while (++pos < _header.length()
281 && WHITE.indexOf(_header.charAt(pos)) != -1)
282 ;
283 }
284 }