001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019 020 package javax.mail.internet; 021 022 /** 023 * @version $Rev: 729233 $ $Date: 2008-12-24 00:08:45 -0500 (Wed, 24 Dec 2008) $ 024 */ 025 public class HeaderTokenizer { 026 public static class Token { 027 // Constant values from J2SE 1.4 API Docs (Constant values) 028 public static final int ATOM = -1; 029 public static final int COMMENT = -3; 030 public static final int EOF = -4; 031 public static final int QUOTEDSTRING = -2; 032 private int _type; 033 private String _value; 034 035 public Token(int type, String value) { 036 _type = type; 037 _value = value; 038 } 039 040 public int getType() { 041 return _type; 042 } 043 044 public String getValue() { 045 return _value; 046 } 047 } 048 049 private static final Token EOF = new Token(Token.EOF, null); 050 // characters not allowed in MIME 051 public static final String MIME = "()<>@,;:\\\"\t []/?="; 052 // charaters not allowed in RFC822 053 public static final String RFC822 = "()<>@,;:\\\"\t .[]"; 054 private static final String WHITE = " \t\n\r"; 055 private String _delimiters; 056 private String _header; 057 private boolean _skip; 058 private int pos; 059 060 public HeaderTokenizer(String header) { 061 this(header, RFC822); 062 } 063 064 public HeaderTokenizer(String header, String delimiters) { 065 this(header, delimiters, true); 066 } 067 068 public HeaderTokenizer(String header, 069 String delimiters, 070 boolean skipComments) { 071 _skip = skipComments; 072 _header = header; 073 _delimiters = delimiters; 074 } 075 076 public String getRemainder() { 077 return _header.substring(pos); 078 } 079 080 public Token next() throws ParseException { 081 return readToken(); 082 } 083 084 public Token peek() throws ParseException { 085 int start = pos; 086 try { 087 return readToken(); 088 } finally { 089 pos = start; 090 } 091 } 092 093 /** 094 * Read an ATOM token from the parsed header. 095 * 096 * @return A token containing the value of the atom token. 097 */ 098 private Token readAtomicToken() { 099 // skip to next delimiter 100 int start = pos; 101 while (++pos < _header.length()) { 102 // break on the first non-atom character. 103 char ch = _header.charAt(pos); 104 if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) { 105 break; 106 } 107 } 108 109 return new Token(Token.ATOM, _header.substring(start, pos)); 110 } 111 112 /** 113 * Read the next token from the header. 114 * 115 * @return The next token from the header. White space is skipped, and comment 116 * tokens are also skipped if indicated. 117 * @exception ParseException 118 */ 119 private Token readToken() throws ParseException { 120 if (pos >= _header.length()) { 121 return EOF; 122 } else { 123 char c = _header.charAt(pos); 124 // comment token...read and skip over this 125 if (c == '(') { 126 Token comment = readComment(); 127 if (_skip) { 128 return readToken(); 129 } else { 130 return comment; 131 } 132 // quoted literal 133 } else if (c == '\"') { 134 return readQuotedString(); 135 // white space, eat this and find a real token. 136 } else if (WHITE.indexOf(c) != -1) { 137 eatWhiteSpace(); 138 return readToken(); 139 // either a CTL or special. These characters have a self-defining token type. 140 } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) { 141 pos++; 142 return new Token((int)c, String.valueOf(c)); 143 } else { 144 // start of an atom, parse it off. 145 return readAtomicToken(); 146 } 147 } 148 } 149 150 /** 151 * Extract a substring from the header string and apply any 152 * escaping/folding rules to the string. 153 * 154 * @param start The starting offset in the header. 155 * @param end The header end offset + 1. 156 * 157 * @return The processed string value. 158 * @exception ParseException 159 */ 160 private String getEscapedValue(int start, int end) throws ParseException { 161 StringBuffer value = new StringBuffer(); 162 163 for (int i = start; i < end; i++) { 164 char ch = _header.charAt(i); 165 // is this an escape character? 166 if (ch == '\\') { 167 i++; 168 if (i == end) { 169 throw new ParseException("Invalid escape character"); 170 } 171 value.append(_header.charAt(i)); 172 } 173 // line breaks are ignored, except for naked '\n' characters, which are consider 174 // parts of linear whitespace. 175 else if (ch == '\r') { 176 // see if this is a CRLF sequence, and skip the second if it is. 177 if (i < end - 1 && _header.charAt(i + 1) == '\n') { 178 i++; 179 } 180 } 181 else { 182 // just append the ch value. 183 value.append(ch); 184 } 185 } 186 return value.toString(); 187 } 188 189 /** 190 * Read a comment from the header, applying nesting and escape 191 * rules to the content. 192 * 193 * @return A comment token with the token value. 194 * @exception ParseException 195 */ 196 private Token readComment() throws ParseException { 197 int start = pos + 1; 198 int nesting = 1; 199 200 boolean requiresEscaping = false; 201 202 // skip to end of comment/string 203 while (++pos < _header.length()) { 204 char ch = _header.charAt(pos); 205 if (ch == ')') { 206 nesting--; 207 if (nesting == 0) { 208 break; 209 } 210 } 211 else if (ch == '(') { 212 nesting++; 213 } 214 else if (ch == '\\') { 215 pos++; 216 requiresEscaping = true; 217 } 218 // we need to process line breaks also 219 else if (ch == '\r') { 220 requiresEscaping = true; 221 } 222 } 223 224 if (nesting != 0) { 225 throw new ParseException("Unbalanced comments"); 226 } 227 228 String value; 229 if (requiresEscaping) { 230 value = getEscapedValue(start, pos); 231 } 232 else { 233 value = _header.substring(start, pos++); 234 } 235 return new Token(Token.COMMENT, value); 236 } 237 238 /** 239 * Parse out a quoted string from the header, applying escaping 240 * rules to the value. 241 * 242 * @return The QUOTEDSTRING token with the value. 243 * @exception ParseException 244 */ 245 private Token readQuotedString() throws ParseException { 246 int start = pos+1; 247 boolean requiresEscaping = false; 248 249 // skip to end of comment/string 250 while (++pos < _header.length()) { 251 char ch = _header.charAt(pos); 252 if (ch == '"') { 253 String value; 254 if (requiresEscaping) { 255 value = getEscapedValue(start, pos++); 256 } 257 else { 258 value = _header.substring(start, pos++); 259 } 260 return new Token(Token.QUOTEDSTRING, value); 261 } 262 else if (ch == '\\') { 263 pos++; 264 requiresEscaping = true; 265 } 266 // we need to process line breaks also 267 else if (ch == '\r') { 268 requiresEscaping = true; 269 } 270 } 271 272 throw new ParseException("Missing '\"'"); 273 } 274 275 /** 276 * Skip white space in the token string. 277 */ 278 private void eatWhiteSpace() { 279 // skip to end of whitespace 280 while (++pos < _header.length() 281 && WHITE.indexOf(_header.charAt(pos)) != -1) 282 ; 283 } 284 }