001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *  http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing,
013     * software distributed under the License is distributed on an
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015     * KIND, either express or implied.  See the License for the
016     * specific language governing permissions and limitations
017     * under the License.
018     */
019    
020    package javax.mail.internet;
021    
022    /**
023     * @version $Rev: 729233 $ $Date: 2008-12-24 00:08:45 -0500 (Wed, 24 Dec 2008) $
024     */
025    public class HeaderTokenizer {
026        public static class Token {
027            // Constant values from J2SE 1.4 API Docs (Constant values)
028            public static final int ATOM = -1;
029            public static final int COMMENT = -3;
030            public static final int EOF = -4;
031            public static final int QUOTEDSTRING = -2;
032            private int _type;
033            private String _value;
034    
035            public Token(int type, String value) {
036                _type = type;
037                _value = value;
038            }
039    
040            public int getType() {
041                return _type;
042            }
043    
044            public String getValue() {
045                return _value;
046            }
047        }
048    
049        private static final Token EOF = new Token(Token.EOF, null);
050        // characters not allowed in MIME
051        public static final String MIME = "()<>@,;:\\\"\t []/?=";
052        // charaters not allowed in RFC822
053        public static final String RFC822 = "()<>@,;:\\\"\t .[]";
054        private static final String WHITE = " \t\n\r";
055        private String _delimiters;
056        private String _header;
057        private boolean _skip;
058        private int pos;
059    
060        public HeaderTokenizer(String header) {
061            this(header, RFC822);
062        }
063    
064        public HeaderTokenizer(String header, String delimiters) {
065            this(header, delimiters, true);
066        }
067    
068        public HeaderTokenizer(String header,
069                               String delimiters,
070                               boolean skipComments) {
071            _skip = skipComments;
072            _header = header;
073            _delimiters = delimiters;
074        }
075    
076        public String getRemainder() {
077            return _header.substring(pos);
078        }
079    
080        public Token next() throws ParseException {
081            return readToken();
082        }
083    
084        public Token peek() throws ParseException {
085            int start = pos;
086            try {
087                return readToken();
088            } finally {
089                pos = start;
090            }
091        }
092    
093        /**
094         * Read an ATOM token from the parsed header.
095         *
096         * @return A token containing the value of the atom token.
097         */
098        private Token readAtomicToken() {
099            // skip to next delimiter
100            int start = pos;
101            while (++pos < _header.length()) {
102                // break on the first non-atom character.
103                char ch = _header.charAt(pos);
104                if (_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127) {
105                    break;
106                }
107            }
108    
109            return new Token(Token.ATOM, _header.substring(start, pos));
110        }
111    
112        /**
113         * Read the next token from the header.
114         *
115         * @return The next token from the header.  White space is skipped, and comment
116         *         tokens are also skipped if indicated.
117         * @exception ParseException
118         */
119        private Token readToken() throws ParseException {
120            if (pos >= _header.length()) {
121                return EOF;
122            } else {
123                char c = _header.charAt(pos);
124                // comment token...read and skip over this
125                if (c == '(') {
126                    Token comment = readComment();
127                    if (_skip) {
128                        return readToken();
129                    } else {
130                        return comment;
131                    }
132                    // quoted literal
133                } else if (c == '\"') {
134                    return readQuotedString();
135                // white space, eat this and find a real token.
136                } else if (WHITE.indexOf(c) != -1) {
137                    eatWhiteSpace();
138                    return readToken();
139                // either a CTL or special.  These characters have a self-defining token type.
140                } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) {
141                    pos++;
142                    return new Token((int)c, String.valueOf(c));
143                } else {
144                    // start of an atom, parse it off.
145                    return readAtomicToken();
146                }
147            }
148        }
149    
150        /**
151         * Extract a substring from the header string and apply any
152         * escaping/folding rules to the string.
153         *
154         * @param start  The starting offset in the header.
155         * @param end    The header end offset + 1.
156         *
157         * @return The processed string value.
158         * @exception ParseException
159         */
160        private String getEscapedValue(int start, int end) throws ParseException {
161            StringBuffer value = new StringBuffer();
162    
163            for (int i = start; i < end; i++) {
164                char ch = _header.charAt(i);
165                // is this an escape character?
166                if (ch == '\\') {
167                    i++;
168                    if (i == end) {
169                        throw new ParseException("Invalid escape character");
170                    }
171                    value.append(_header.charAt(i));
172                }
173                // line breaks are ignored, except for naked '\n' characters, which are consider
174                // parts of linear whitespace.
175                else if (ch == '\r') {
176                    // see if this is a CRLF sequence, and skip the second if it is.
177                    if (i < end - 1 && _header.charAt(i + 1) == '\n') {
178                        i++;
179                    }
180                }
181                else {
182                    // just append the ch value.
183                    value.append(ch);
184                }
185            }
186            return value.toString();
187        }
188    
189        /**
190         * Read a comment from the header, applying nesting and escape
191         * rules to the content.
192         *
193         * @return A comment token with the token value.
194         * @exception ParseException
195         */
196        private Token readComment() throws ParseException {
197            int start = pos + 1;
198            int nesting = 1;
199    
200            boolean requiresEscaping = false;
201    
202            // skip to end of comment/string
203            while (++pos < _header.length()) {
204                char ch = _header.charAt(pos);
205                if (ch == ')') {
206                    nesting--;
207                    if (nesting == 0) {
208                        break;
209                    }
210                }
211                else if (ch == '(') {
212                    nesting++;
213                }
214                else if (ch == '\\') {
215                    pos++;
216                    requiresEscaping = true;
217                }
218                // we need to process line breaks also
219                else if (ch == '\r') {
220                    requiresEscaping = true;
221                }
222            }
223    
224            if (nesting != 0) {
225                throw new ParseException("Unbalanced comments");
226            }
227    
228            String value;
229            if (requiresEscaping) {
230                value = getEscapedValue(start, pos);
231            }
232            else {
233                value = _header.substring(start, pos++);
234            }
235            return new Token(Token.COMMENT, value);
236        }
237    
238        /**
239         * Parse out a quoted string from the header, applying escaping
240         * rules to the value.
241         *
242         * @return The QUOTEDSTRING token with the value.
243         * @exception ParseException
244         */
245        private Token readQuotedString() throws ParseException {
246            int start = pos+1;
247            boolean requiresEscaping = false;
248    
249            // skip to end of comment/string
250            while (++pos < _header.length()) {
251                char ch = _header.charAt(pos);
252                if (ch == '"') {
253                    String value;
254                    if (requiresEscaping) {
255                        value = getEscapedValue(start, pos++);
256                    }
257                    else {
258                        value = _header.substring(start, pos++);
259                    }
260                    return new Token(Token.QUOTEDSTRING, value);
261                }
262                else if (ch == '\\') {
263                    pos++;
264                    requiresEscaping = true;
265                }
266                // we need to process line breaks also
267                else if (ch == '\r') {
268                    requiresEscaping = true;
269                }
270            }
271    
272            throw new ParseException("Missing '\"'");
273        }
274    
275        /**
276         * Skip white space in the token string.
277         */
278        private void eatWhiteSpace() {
279            // skip to end of whitespace
280            while (++pos < _header.length()
281                    && WHITE.indexOf(_header.charAt(pos)) != -1)
282                ;
283        }
284    }