001    /**
002     *
003     * Copyright 2003-2004 The Apache Software Foundation
004     *
005     *  Licensed under the Apache License, Version 2.0 (the "License");
006     *  you may not use this file except in compliance with the License.
007     *  You may obtain a copy of the License at
008     *
009     *     http://www.apache.org/licenses/LICENSE-2.0
010     *
011     *  Unless required by applicable law or agreed to in writing, software
012     *  distributed under the License is distributed on an "AS IS" BASIS,
013     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     *  See the License for the specific language governing permissions and
015     *  limitations under the License.
016     */
017    
018    package javax.mail.internet;
019    
020    /**
021     * @version $Rev: 381393 $ $Date: 2006-02-27 09:38:03 -0800 (Mon, 27 Feb 2006) $
022     */
023    public class HeaderTokenizer {
024        public static class Token {
025            // Constant values from J2SE 1.4 API Docs (Constant values)
026            public static final int ATOM = -1;
027            public static final int COMMENT = -3;
028            public static final int EOF = -4;
029            public static final int QUOTEDSTRING = -2;
030            private int _type;
031            private String _value;
032    
033            public Token(int type, String value) {
034                _type = type;
035                _value = value;
036            }
037    
038            public int getType() {
039                return _type;
040            }
041    
042            public String getValue() {
043                return _value;
044            }
045        }
046    
047        private static final Token EOF = new Token(Token.EOF, null);
048        // characters not allowed in MIME
049        public static final String MIME = "()<>@,;:\\\"\t []/?=";
050        // charaters not allowed in RFC822
051        public static final String RFC822 = "()<>@,;:\\\"\t .[]";
052        private static final String WHITE = " \t\n\r";
053        private String _delimiters;
054        private String _header;
055        private boolean _skip;
056        private int pos;
057    
058        public HeaderTokenizer(String header) {
059            this(header, RFC822);
060        }
061    
062        public HeaderTokenizer(String header, String delimiters) {
063            this(header, delimiters, true);
064        }
065    
066        public HeaderTokenizer(String header,
067                               String delimiters,
068                               boolean skipComments) {
069            _skip = skipComments;
070            _header = header;
071            _delimiters = delimiters;
072        }
073    
074        public String getRemainder() {
075            return _header.substring(pos);
076        }
077    
078        public Token next() throws ParseException {
079            return readToken();
080        }
081    
082        public Token peek() throws ParseException {
083            int start = pos;
084            try {
085                return readToken();
086            } finally {
087                pos = start;
088            }
089        }
090    
091        /**
092         * Read an ATOM token from the parsed header.
093         *
094         * @return A token containing the value of the atom token.
095         */
096        private Token readAtomicToken() {
097            // skip to next delimiter
098            int start = pos;
099            while (++pos < _header.length()) {
100                // break on the first non-atom character.
101                char ch = _header.charAt(pos);
102                if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) {
103                    break;
104                }
105            }
106    
107            return new Token(Token.ATOM, _header.substring(start, pos));
108        }
109    
110        /**
111         * Read the next token from the header.
112         *
113         * @return The next token from the header.  White space is skipped, and comment
114         *         tokens are also skipped if indicated.
115         * @exception ParseException
116         */
117        private Token readToken() throws ParseException {
118            if (pos >= _header.length()) {
119                return EOF;
120            } else {
121                char c = _header.charAt(pos);
122                // comment token...read and skip over this
123                if (c == '(') {
124                    Token comment = readComment();
125                    if (_skip) {
126                        return readToken();
127                    } else {
128                        return comment;
129                    }
130                    // quoted literal
131                } else if (c == '\"') {
132                    return readQuotedString();
133                // white space, eat this and find a real token.
134                } else if (WHITE.indexOf(c) != -1) {
135                    eatWhiteSpace();
136                    return readToken();
137                // either a CTL or special.  These characters have a self-defining token type.
138                } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) {
139                    pos++;
140                    return new Token((int)c, String.valueOf(c));
141                } else {
142                    // start of an atom, parse it off.
143                    return readAtomicToken();
144                }
145            }
146        }
147    
148        /**
149         * Extract a substring from the header string and apply any
150         * escaping/folding rules to the string.
151         *
152         * @param start  The starting offset in the header.
153         * @param end    The header end offset + 1.
154         *
155         * @return The processed string value.
156         * @exception ParseException
157         */
158        private String getEscapedValue(int start, int end) throws ParseException {
159            StringBuffer value = new StringBuffer();
160    
161            for (int i = start; i < end; i++) {
162                char ch = _header.charAt(i);
163                // is this an escape character?
164                if (ch == '\\') {
165                    i++;
166                    if (i == end) {
167                        throw new ParseException("Invalid escape character");
168                    }
169                    value.append(_header.charAt(i));
170                }
171                // line breaks are ignored, except for naked '\n' characters, which are consider
172                // parts of linear whitespace.
173                else if (ch == '\r') {
174                    // see if this is a CRLF sequence, and skip the second if it is.
175                    if (i < end - 1 && _header.charAt(i + 1) == '\n') {
176                        i++;
177                    }
178                }
179                else {
180                    // just append the ch value.
181                    value.append(ch);
182                }
183            }
184            return value.toString();
185        }
186    
187        /**
188         * Read a comment from the header, applying nesting and escape
189         * rules to the content.
190         *
191         * @return A comment token with the token value.
192         * @exception ParseException
193         */
194        private Token readComment() throws ParseException {
195            int start = pos + 1;
196            int nesting = 1;
197    
198            boolean requiresEscaping = false;
199    
200            // skip to end of comment/string
201            while (++pos < _header.length()) {
202                char ch = _header.charAt(pos);
203                if (ch == ')') {
204                    nesting--;
205                    if (nesting == 0) {
206                        break;
207                    }
208                }
209                else if (ch == '(') {
210                    nesting++;
211                }
212                else if (ch == '\\') {
213                    pos++;
214                    requiresEscaping = true;
215                }
216                // we need to process line breaks also
217                else if (ch == '\r') {
218                    requiresEscaping = true;
219                }
220            }
221    
222            if (nesting != 0) {
223                throw new ParseException("Unbalanced comments");
224            }
225    
226            String value;
227            if (requiresEscaping) {
228                value = getEscapedValue(start, pos);
229            }
230            else {
231                value = _header.substring(start, pos++);
232            }
233            return new Token(Token.COMMENT, value);
234        }
235    
236        /**
237         * Parse out a quoted string from the header, applying escaping
238         * rules to the value.
239         *
240         * @return The QUOTEDSTRING token with the value.
241         * @exception ParseException
242         */
243        private Token readQuotedString() throws ParseException {
244            int start = pos+1;
245            boolean requiresEscaping = false;
246    
247            // skip to end of comment/string
248            while (++pos < _header.length()) {
249                char ch = _header.charAt(pos);
250                if (ch == '"') {
251                    String value;
252                    if (requiresEscaping) {
253                        value = getEscapedValue(start, pos);
254                    }
255                    else {
256                        value = _header.substring(start, pos++);
257                    }
258                    return new Token(Token.QUOTEDSTRING, value);
259                }
260                else if (ch == '\\') {
261                    pos++;
262                    requiresEscaping = true;
263                }
264                // we need to process line breaks also
265                else if (ch == '\r') {
266                    requiresEscaping = true;
267                }
268            }
269    
270            throw new ParseException("Missing '\"'");
271        }
272    
273        /**
274         * Skip white space in the token string.
275         */
276        private void eatWhiteSpace() {
277            // skip to end of whitespace
278            while (++pos < _header.length()
279                    && WHITE.indexOf(_header.charAt(pos)) != -1)
280                ;
281        }
282    }