/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
019
020 package javax.mail.internet;
021
022 /**
023 * @version $Rev: 729233 $ $Date: 2008-12-24 00:08:45 -0500 (Wed, 24 Dec 2008) $
024 */
025 public class HeaderTokenizer {
026 public static class Token {
027 // Constant values from J2SE 1.4 API Docs (Constant values)
028 public static final int ATOM = -1;
029 public static final int COMMENT = -3;
030 public static final int EOF = -4;
031 public static final int QUOTEDSTRING = -2;
032 private int _type;
033 private String _value;
034
035 public Token(int type, String value) {
036 _type = type;
037 _value = value;
038 }
039
040 public int getType() {
041 return _type;
042 }
043
044 public String getValue() {
045 return _value;
046 }
047 }
048
049 private static final Token EOF = new Token(Token.EOF, null);
050 // characters not allowed in MIME
051 public static final String MIME = "()<>@,;:\\\"\t []/?=";
052 // charaters not allowed in RFC822
053 public static final String RFC822 = "()<>@,;:\\\"\t .[]";
054 private static final String WHITE = " \t\n\r";
055 private String _delimiters;
056 private String _header;
057 private boolean _skip;
058 private int pos;
059
060 public HeaderTokenizer(String header) {
061 this(header, RFC822);
062 }
063
064 public HeaderTokenizer(String header, String delimiters) {
065 this(header, delimiters, true);
066 }
067
068 public HeaderTokenizer(String header,
069 String delimiters,
070 boolean skipComments) {
071 _skip = skipComments;
072 _header = header;
073 _delimiters = delimiters;
074 }
075
076 public String getRemainder() {
077 return _header.substring(pos);
078 }
079
080 public Token next() throws ParseException {
081 return readToken();
082 }
083
084 public Token peek() throws ParseException {
085 int start = pos;
086 try {
087 return readToken();
088 } finally {
089 pos = start;
090 }
091 }
092
093 /**
094 * Read an ATOM token from the parsed header.
095 *
096 * @return A token containing the value of the atom token.
097 */
098 private Token readAtomicToken() {
099 // skip to next delimiter
100 int start = pos;
101 while (++pos < _header.length()) {
102 // break on the first non-atom character.
103 char ch = _header.charAt(pos);
104 if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) {
105 break;
106 }
107 }
108
109 return new Token(Token.ATOM, _header.substring(start, pos));
110 }
111
112 /**
113 * Read the next token from the header.
114 *
115 * @return The next token from the header. White space is skipped, and comment
116 * tokens are also skipped if indicated.
117 * @exception ParseException
118 */
119 private Token readToken() throws ParseException {
120 if (pos >= _header.length()) {
121 return EOF;
122 } else {
123 char c = _header.charAt(pos);
124 // comment token...read and skip over this
125 if (c == '(') {
126 Token comment = readComment();
127 if (_skip) {
128 return readToken();
129 } else {
130 return comment;
131 }
132 // quoted literal
133 } else if (c == '\"') {
134 return readQuotedString();
135 // white space, eat this and find a real token.
136 } else if (WHITE.indexOf(c) != -1) {
137 eatWhiteSpace();
138 return readToken();
139 // either a CTL or special. These characters have a self-defining token type.
140 } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) {
141 pos++;
142 return new Token((int)c, String.valueOf(c));
143 } else {
144 // start of an atom, parse it off.
145 return readAtomicToken();
146 }
147 }
148 }
149
150 /**
151 * Extract a substring from the header string and apply any
152 * escaping/folding rules to the string.
153 *
154 * @param start The starting offset in the header.
155 * @param end The header end offset + 1.
156 *
157 * @return The processed string value.
158 * @exception ParseException
159 */
160 private String getEscapedValue(int start, int end) throws ParseException {
161 StringBuffer value = new StringBuffer();
162
163 for (int i = start; i < end; i++) {
164 char ch = _header.charAt(i);
165 // is this an escape character?
166 if (ch == '\\') {
167 i++;
168 if (i == end) {
169 throw new ParseException("Invalid escape character");
170 }
171 value.append(_header.charAt(i));
172 }
173 // line breaks are ignored, except for naked '\n' characters, which are consider
174 // parts of linear whitespace.
175 else if (ch == '\r') {
176 // see if this is a CRLF sequence, and skip the second if it is.
177 if (i < end - 1 && _header.charAt(i + 1) == '\n') {
178 i++;
179 }
180 }
181 else {
182 // just append the ch value.
183 value.append(ch);
184 }
185 }
186 return value.toString();
187 }
188
189 /**
190 * Read a comment from the header, applying nesting and escape
191 * rules to the content.
192 *
193 * @return A comment token with the token value.
194 * @exception ParseException
195 */
196 private Token readComment() throws ParseException {
197 int start = pos + 1;
198 int nesting = 1;
199
200 boolean requiresEscaping = false;
201
202 // skip to end of comment/string
203 while (++pos < _header.length()) {
204 char ch = _header.charAt(pos);
205 if (ch == ')') {
206 nesting--;
207 if (nesting == 0) {
208 break;
209 }
210 }
211 else if (ch == '(') {
212 nesting++;
213 }
214 else if (ch == '\\') {
215 pos++;
216 requiresEscaping = true;
217 }
218 // we need to process line breaks also
219 else if (ch == '\r') {
220 requiresEscaping = true;
221 }
222 }
223
224 if (nesting != 0) {
225 throw new ParseException("Unbalanced comments");
226 }
227
228 String value;
229 if (requiresEscaping) {
230 value = getEscapedValue(start, pos);
231 }
232 else {
233 value = _header.substring(start, pos++);
234 }
235 return new Token(Token.COMMENT, value);
236 }
237
238 /**
239 * Parse out a quoted string from the header, applying escaping
240 * rules to the value.
241 *
242 * @return The QUOTEDSTRING token with the value.
243 * @exception ParseException
244 */
245 private Token readQuotedString() throws ParseException {
246 int start = pos+1;
247 boolean requiresEscaping = false;
248
249 // skip to end of comment/string
250 while (++pos < _header.length()) {
251 char ch = _header.charAt(pos);
252 if (ch == '"') {
253 String value;
254 if (requiresEscaping) {
255 value = getEscapedValue(start, pos++);
256 }
257 else {
258 value = _header.substring(start, pos++);
259 }
260 return new Token(Token.QUOTEDSTRING, value);
261 }
262 else if (ch == '\\') {
263 pos++;
264 requiresEscaping = true;
265 }
266 // we need to process line breaks also
267 else if (ch == '\r') {
268 requiresEscaping = true;
269 }
270 }
271
272 throw new ParseException("Missing '\"'");
273 }
274
275 /**
276 * Skip white space in the token string.
277 */
278 private void eatWhiteSpace() {
279 // skip to end of whitespace
280 while (++pos < _header.length()
281 && WHITE.indexOf(_header.charAt(pos)) != -1)
282 ;
283 }
284 }