001 /** 002 * 003 * Copyright 2003-2006 The Apache Software Foundation 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package javax.mail.internet; 019 020 /** 021 * @version $Rev: 421852 $ $Date: 2006-07-14 03:02:19 -0700 (Fri, 14 Jul 2006) $ 022 */ 023 public class HeaderTokenizer { 024 public static class Token { 025 // Constant values from J2SE 1.4 API Docs (Constant values) 026 public static final int ATOM = -1; 027 public static final int COMMENT = -3; 028 public static final int EOF = -4; 029 public static final int QUOTEDSTRING = -2; 030 private int _type; 031 private String _value; 032 033 public Token(int type, String value) { 034 _type = type; 035 _value = value; 036 } 037 038 public int getType() { 039 return _type; 040 } 041 042 public String getValue() { 043 return _value; 044 } 045 } 046 047 private static final Token EOF = new Token(Token.EOF, null); 048 // characters not allowed in MIME 049 public static final String MIME = "()<>@,;:\\\"\t []/?="; 050 // charaters not allowed in RFC822 051 public static final String RFC822 = "()<>@,;:\\\"\t .[]"; 052 private static final String WHITE = " \t\n\r"; 053 private String _delimiters; 054 private String _header; 055 private boolean _skip; 056 private int pos; 057 058 public HeaderTokenizer(String header) { 059 this(header, RFC822); 060 } 061 062 public HeaderTokenizer(String header, String delimiters) { 063 this(header, delimiters, true); 064 } 065 066 public HeaderTokenizer(String header, 067 String delimiters, 068 boolean skipComments) { 069 _skip = skipComments; 070 _header = header; 071 _delimiters = delimiters; 072 } 073 074 public String getRemainder() { 075 return _header.substring(pos); 076 } 077 078 public Token next() throws ParseException { 079 return readToken(); 080 } 081 082 public Token peek() throws ParseException { 083 int start = pos; 084 try { 085 return readToken(); 086 } finally { 087 pos = start; 088 } 089 } 090 091 /** 092 * Read an ATOM token from the parsed header. 093 * 094 * @return A token containing the value of the atom token. 095 */ 096 private Token readAtomicToken() { 097 // skip to next delimiter 098 int start = pos; 099 while (++pos < _header.length()) { 100 // break on the first non-atom character. 101 char ch = _header.charAt(pos); 102 if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) { 103 break; 104 } 105 } 106 107 return new Token(Token.ATOM, _header.substring(start, pos)); 108 } 109 110 /** 111 * Read the next token from the header. 112 * 113 * @return The next token from the header. White space is skipped, and comment 114 * tokens are also skipped if indicated. 115 * @exception ParseException 116 */ 117 private Token readToken() throws ParseException { 118 if (pos >= _header.length()) { 119 return EOF; 120 } else { 121 char c = _header.charAt(pos); 122 // comment token...read and skip over this 123 if (c == '(') { 124 Token comment = readComment(); 125 if (_skip) { 126 return readToken(); 127 } else { 128 return comment; 129 } 130 // quoted literal 131 } else if (c == '\"') { 132 return readQuotedString(); 133 // white space, eat this and find a real token. 134 } else if (WHITE.indexOf(c) != -1) { 135 eatWhiteSpace(); 136 return readToken(); 137 // either a CTL or special. These characters have a self-defining token type. 138 } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) { 139 pos++; 140 return new Token((int)c, String.valueOf(c)); 141 } else { 142 // start of an atom, parse it off. 143 return readAtomicToken(); 144 } 145 } 146 } 147 148 /** 149 * Extract a substring from the header string and apply any 150 * escaping/folding rules to the string. 151 * 152 * @param start The starting offset in the header. 153 * @param end The header end offset + 1. 154 * 155 * @return The processed string value. 156 * @exception ParseException 157 */ 158 private String getEscapedValue(int start, int end) throws ParseException { 159 StringBuffer value = new StringBuffer(); 160 161 for (int i = start; i < end; i++) { 162 char ch = _header.charAt(i); 163 // is this an escape character? 164 if (ch == '\\') { 165 i++; 166 if (i == end) { 167 throw new ParseException("Invalid escape character"); 168 } 169 value.append(_header.charAt(i)); 170 } 171 // line breaks are ignored, except for naked '\n' characters, which are consider 172 // parts of linear whitespace. 173 else if (ch == '\r') { 174 // see if this is a CRLF sequence, and skip the second if it is. 175 if (i < end - 1 && _header.charAt(i + 1) == '\n') { 176 i++; 177 } 178 } 179 else { 180 // just append the ch value. 181 value.append(ch); 182 } 183 } 184 return value.toString(); 185 } 186 187 /** 188 * Read a comment from the header, applying nesting and escape 189 * rules to the content. 190 * 191 * @return A comment token with the token value. 192 * @exception ParseException 193 */ 194 private Token readComment() throws ParseException { 195 int start = pos + 1; 196 int nesting = 1; 197 198 boolean requiresEscaping = false; 199 200 // skip to end of comment/string 201 while (++pos < _header.length()) { 202 char ch = _header.charAt(pos); 203 if (ch == ')') { 204 nesting--; 205 if (nesting == 0) { 206 break; 207 } 208 } 209 else if (ch == '(') { 210 nesting++; 211 } 212 else if (ch == '\\') { 213 pos++; 214 requiresEscaping = true; 215 } 216 // we need to process line breaks also 217 else if (ch == '\r') { 218 requiresEscaping = true; 219 } 220 } 221 222 if (nesting != 0) { 223 throw new ParseException("Unbalanced comments"); 224 } 225 226 String value; 227 if (requiresEscaping) { 228 value = getEscapedValue(start, pos); 229 } 230 else { 231 value = _header.substring(start, pos++); 232 } 233 return new Token(Token.COMMENT, value); 234 } 235 236 /** 237 * Parse out a quoted string from the header, applying escaping 238 * rules to the value. 239 * 240 * @return The QUOTEDSTRING token with the value. 241 * @exception ParseException 242 */ 243 private Token readQuotedString() throws ParseException { 244 int start = pos+1; 245 boolean requiresEscaping = false; 246 247 // skip to end of comment/string 248 while (++pos < _header.length()) { 249 char ch = _header.charAt(pos); 250 if (ch == '"') { 251 String value; 252 if (requiresEscaping) { 253 value = getEscapedValue(start, pos); 254 } 255 else { 256 value = _header.substring(start, pos++); 257 } 258 return new Token(Token.QUOTEDSTRING, value); 259 } 260 else if (ch == '\\') { 261 pos++; 262 requiresEscaping = true; 263 } 264 // we need to process line breaks also 265 else if (ch == '\r') { 266 requiresEscaping = true; 267 } 268 } 269 270 throw new ParseException("Missing '\"'"); 271 } 272 273 /** 274 * Skip white space in the token string. 275 */ 276 private void eatWhiteSpace() { 277 // skip to end of whitespace 278 while (++pos < _header.length() 279 && WHITE.indexOf(_header.charAt(pos)) != -1) 280 ; 281 } 282 }