001 /**
002 *
003 * Copyright 2003-2006 The Apache Software Foundation
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package javax.mail.internet;
019
020 /**
021 * @version $Rev: 421852 $ $Date: 2006-07-14 03:02:19 -0700 (Fri, 14 Jul 2006) $
022 */
023 public class HeaderTokenizer {
024 public static class Token {
025 // Constant values from J2SE 1.4 API Docs (Constant values)
026 public static final int ATOM = -1;
027 public static final int COMMENT = -3;
028 public static final int EOF = -4;
029 public static final int QUOTEDSTRING = -2;
030 private int _type;
031 private String _value;
032
033 public Token(int type, String value) {
034 _type = type;
035 _value = value;
036 }
037
038 public int getType() {
039 return _type;
040 }
041
042 public String getValue() {
043 return _value;
044 }
045 }
046
047 private static final Token EOF = new Token(Token.EOF, null);
048 // characters not allowed in MIME
049 public static final String MIME = "()<>@,;:\\\"\t []/?=";
050 // charaters not allowed in RFC822
051 public static final String RFC822 = "()<>@,;:\\\"\t .[]";
052 private static final String WHITE = " \t\n\r";
053 private String _delimiters;
054 private String _header;
055 private boolean _skip;
056 private int pos;
057
058 public HeaderTokenizer(String header) {
059 this(header, RFC822);
060 }
061
062 public HeaderTokenizer(String header, String delimiters) {
063 this(header, delimiters, true);
064 }
065
066 public HeaderTokenizer(String header,
067 String delimiters,
068 boolean skipComments) {
069 _skip = skipComments;
070 _header = header;
071 _delimiters = delimiters;
072 }
073
074 public String getRemainder() {
075 return _header.substring(pos);
076 }
077
078 public Token next() throws ParseException {
079 return readToken();
080 }
081
082 public Token peek() throws ParseException {
083 int start = pos;
084 try {
085 return readToken();
086 } finally {
087 pos = start;
088 }
089 }
090
091 /**
092 * Read an ATOM token from the parsed header.
093 *
094 * @return A token containing the value of the atom token.
095 */
096 private Token readAtomicToken() {
097 // skip to next delimiter
098 int start = pos;
099 while (++pos < _header.length()) {
100 // break on the first non-atom character.
101 char ch = _header.charAt(pos);
102 if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) {
103 break;
104 }
105 }
106
107 return new Token(Token.ATOM, _header.substring(start, pos));
108 }
109
110 /**
111 * Read the next token from the header.
112 *
113 * @return The next token from the header. White space is skipped, and comment
114 * tokens are also skipped if indicated.
115 * @exception ParseException
116 */
117 private Token readToken() throws ParseException {
118 if (pos >= _header.length()) {
119 return EOF;
120 } else {
121 char c = _header.charAt(pos);
122 // comment token...read and skip over this
123 if (c == '(') {
124 Token comment = readComment();
125 if (_skip) {
126 return readToken();
127 } else {
128 return comment;
129 }
130 // quoted literal
131 } else if (c == '\"') {
132 return readQuotedString();
133 // white space, eat this and find a real token.
134 } else if (WHITE.indexOf(c) != -1) {
135 eatWhiteSpace();
136 return readToken();
137 // either a CTL or special. These characters have a self-defining token type.
138 } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) {
139 pos++;
140 return new Token((int)c, String.valueOf(c));
141 } else {
142 // start of an atom, parse it off.
143 return readAtomicToken();
144 }
145 }
146 }
147
148 /**
149 * Extract a substring from the header string and apply any
150 * escaping/folding rules to the string.
151 *
152 * @param start The starting offset in the header.
153 * @param end The header end offset + 1.
154 *
155 * @return The processed string value.
156 * @exception ParseException
157 */
158 private String getEscapedValue(int start, int end) throws ParseException {
159 StringBuffer value = new StringBuffer();
160
161 for (int i = start; i < end; i++) {
162 char ch = _header.charAt(i);
163 // is this an escape character?
164 if (ch == '\\') {
165 i++;
166 if (i == end) {
167 throw new ParseException("Invalid escape character");
168 }
169 value.append(_header.charAt(i));
170 }
171 // line breaks are ignored, except for naked '\n' characters, which are consider
172 // parts of linear whitespace.
173 else if (ch == '\r') {
174 // see if this is a CRLF sequence, and skip the second if it is.
175 if (i < end - 1 && _header.charAt(i + 1) == '\n') {
176 i++;
177 }
178 }
179 else {
180 // just append the ch value.
181 value.append(ch);
182 }
183 }
184 return value.toString();
185 }
186
187 /**
188 * Read a comment from the header, applying nesting and escape
189 * rules to the content.
190 *
191 * @return A comment token with the token value.
192 * @exception ParseException
193 */
194 private Token readComment() throws ParseException {
195 int start = pos + 1;
196 int nesting = 1;
197
198 boolean requiresEscaping = false;
199
200 // skip to end of comment/string
201 while (++pos < _header.length()) {
202 char ch = _header.charAt(pos);
203 if (ch == ')') {
204 nesting--;
205 if (nesting == 0) {
206 break;
207 }
208 }
209 else if (ch == '(') {
210 nesting++;
211 }
212 else if (ch == '\\') {
213 pos++;
214 requiresEscaping = true;
215 }
216 // we need to process line breaks also
217 else if (ch == '\r') {
218 requiresEscaping = true;
219 }
220 }
221
222 if (nesting != 0) {
223 throw new ParseException("Unbalanced comments");
224 }
225
226 String value;
227 if (requiresEscaping) {
228 value = getEscapedValue(start, pos);
229 }
230 else {
231 value = _header.substring(start, pos++);
232 }
233 return new Token(Token.COMMENT, value);
234 }
235
236 /**
237 * Parse out a quoted string from the header, applying escaping
238 * rules to the value.
239 *
240 * @return The QUOTEDSTRING token with the value.
241 * @exception ParseException
242 */
243 private Token readQuotedString() throws ParseException {
244 int start = pos+1;
245 boolean requiresEscaping = false;
246
247 // skip to end of comment/string
248 while (++pos < _header.length()) {
249 char ch = _header.charAt(pos);
250 if (ch == '"') {
251 String value;
252 if (requiresEscaping) {
253 value = getEscapedValue(start, pos);
254 }
255 else {
256 value = _header.substring(start, pos++);
257 }
258 return new Token(Token.QUOTEDSTRING, value);
259 }
260 else if (ch == '\\') {
261 pos++;
262 requiresEscaping = true;
263 }
264 // we need to process line breaks also
265 else if (ch == '\r') {
266 requiresEscaping = true;
267 }
268 }
269
270 throw new ParseException("Missing '\"'");
271 }
272
273 /**
274 * Skip white space in the token string.
275 */
276 private void eatWhiteSpace() {
277 // skip to end of whitespace
278 while (++pos < _header.length()
279 && WHITE.indexOf(_header.charAt(pos)) != -1)
280 ;
281 }
282 }