AddressParser xref

View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *  http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package javax.mail.internet;
21  
22  import java.io.UnsupportedEncodingException;
23  import java.lang.reflect.Array;
24  import java.util.ArrayList;
25  import java.util.List;
26  
27  class AddressParser {
28  
29      // validation strictness levels; the values increase from most lenient to most conformant.
30      public static final int NONSTRICT = 0;
31      public static final int PARSE_HEADER = 1;
32      public static final int STRICT = 2;
33  
34      // the kinds of mailbox an address can turn out to be.
35      protected static final int UNKNOWN = 0;
36      protected static final int ROUTE_ADDR = 1;
37      protected static final int GROUP_ADDR = 2;
38      protected static final int SIMPLE_ADDR = 3;
39  
40      // token type codes, each defined as a character value.
41      protected static final int END_OF_TOKENS = '\0';
42      protected static final int PERIOD = '.';
43      protected static final int LEFT_ANGLE = '<';
44      protected static final int RIGHT_ANGLE = '>';
45      protected static final int COMMA = ',';
46      protected static final int AT_SIGN = '@';
47      protected static final int SEMICOLON = ';';
48      protected static final int COLON = ':';
49      protected static final int QUOTED_LITERAL = '"';
50      protected static final int DOMAIN_LITERAL = '[';
51      protected static final int COMMENT = '(';
52      protected static final int ATOM = 'A';
53      protected static final int WHITESPACE = ' ';
54  
55  
56      // the address string being parsed.
57      private String addresses;
58      // current scan position within the string.
59      private int position;
60      // scan end marker (set to the string length when tokenizing starts).
61      private int end;
62      // the requested validation strictness level.
63      private int validationLevel;
64  
65      public AddressParser(String addresses, int validation) {
66          this.validationLevel = validation;   // expected to be NONSTRICT, PARSE_HEADER or STRICT
67          this.addresses = addresses;          // the raw address text that will be tokenized
68      }
69  
70  
71      /**
72       * Tokenize the address string and parse it as a comma delimited address list.
73       *
74       * @return An array of every address parsed; empty list elements contribute nothing.
75       * @exception AddressException
76       *                   Thrown for any validation errors.
77       */
78      public InternetAddress[] parseAddressList() throws AddressException
79      {
80          // break the address string up into tokens first.
81          TokenStream stream = tokenizeAddress();
82  
83          // accumulator for everything parsed out of the list.
84          ArrayList parsed = new ArrayList();
85  
86          // the token that terminated the most recently parsed element.
87          AddressToken terminator;
88          do {
89              // each element comes back as a list:  empty for a null element, and possibly
90              // holding several entries when blank delimited addresses were split apart.
91              parsed.addAll(parseSingleAddress(stream, false));
92              // the terminator should be either a "," (another element follows) or the
93              // end-of-stream marker, which finishes the list.
94              terminator = stream.nextToken();
95          } while (terminator.type != END_OF_TOKENS);
96  
97          // hand the accumulated addresses back as a typed array.  The zero length
98          // array only supplies the runtime component type to toArray().
99          InternetAddress[] arrayType = new InternetAddress[0];
100         return (InternetAddress [])parsed.toArray(arrayType);
101     }
102 
103 
104     /**
105      * Parse the address string as exactly one internet address; lists are rejected.
106      *
107      * @return The single address parsed from the string.
108      * @exception AddressException Thrown if there is no address, more than one, or trailing tokens.
109      */
110     public InternetAddress parseAddress() throws AddressException
111     {
112         // break the address string up into tokens first.
113         TokenStream stream = tokenizeAddress();
114 
115         // parse the leading address.  The result is a list because a null address comes
116         // back empty, and a run of blank delimited tokens can come back as several entries.
117         List parsed = parseSingleAddress(stream, false);
118         int count = parsed.size();
119 
120         // nothing at all is an error...
121         if (count == 0) {
122             throw new AddressException("Null address", addresses, 0);
123         }
124         // ...and so is anything more than a single address.
125         if (count > 1) {
126             throw new AddressException("Illegal Address", addresses, 0);
127         }
128 
129         // whatever follows the address must be the end of the token stream.
130         AddressToken trailing = stream.nextToken();
131         if (END_OF_TOKENS != trailing.type) {
132             illegalAddress("Illegal Address", trailing);
133         }
134         return (InternetAddress)parsed.get(0);
135     }
136 
137 
138     /**
139      * Validate the address string as a single internet address.  The string
140      * must hold exactly one address (not a list), and that address must not
141      * carry any personal information.
142      *
143      * @exception AddressException Thrown if the string fails these checks or cannot be parsed.
144      */
145     public void validateAddress() throws AddressException
146     {
147         // break the address string up into tokens first.
148         TokenStream stream = tokenizeAddress();
149 
150         // parse the leading address.  The result is a list because a null address comes
151         // back empty, and a run of blank delimited tokens can come back as several entries.
152         List parsed = parseSingleAddress(stream, false);
153         int count = parsed.size();
154 
155         // nothing at all is an error...
156         if (count == 0) {
157             throw new AddressException("Null address", addresses, 0);
158         }
159         // ...and so is anything more than a single address.
160         if (count > 1) {
161             throw new AddressException("Illegal Address", addresses, 0);
162         }
163 
164         // the parser has already separated personal data from the address proper, so
165         // any personal value present means this was not a bare address.
166         InternetAddress candidate = (InternetAddress)parsed.get(0);
167         if (candidate.personal != null) {
168             throw new AddressException("Illegal Address", addresses, 0);
169         }
170         // whatever follows the address must be the end of the token stream.
171         AddressToken trailing = stream.nextToken();
172         if (END_OF_TOKENS != trailing.type) {
173             illegalAddress("Illegal Address", trailing);
174         }
175     }
176 
177 
178     /**
179      * Extract the set of address from a group Internet specification.
180      *
181      * @return An array containing all of the non-null addresses in the list.
182      * @exception AddressException
183      */
184     public InternetAddress[] extractGroupList() throws AddressException
185     {
186         // get the address as a set of tokens we can process.
187         TokenStream tokens = tokenizeAddress();
188 
189         // get an array list accumulator.
190         ArrayList addresses = new ArrayList();
191 
192         AddressToken token = tokens.nextToken();
193 
194         // scan forward to the ':' that starts the group list.  If we don't find one,
195         // this is an exception.
196         while (token.type != COLON) {
197             if (token.type == END_OF_TOKENS) {
198                 illegalAddress("Missing ':'", token);
199             }
200             token = tokens.nextToken();
201         }
202 
203         // we process sections of the token stream until we run out of tokens.
204         while (true) {
205             // parse off a single address.  Address lists can have null elements,
206             // so this might return a null value.  The null value does not get added
207             // to the address accumulator.
208             addresses.addAll(parseSingleAddress(tokens, true));
209             // This token should be either a "," delimiter or a group terminator.  If we're
210             // at the end, this is an error.
211             token = tokens.nextToken();
212             if (token.type == SEMICOLON) {
213                 break;
214             }
215             else if (token.type == END_OF_TOKENS) {
216                 illegalAddress("Missing ';'", token);
217             }
218         }
219 
220         return (InternetAddress [])addresses.toArray(new InternetAddress[0]);
221     }
222 
223 
224     /**
225      * Parse out a single address from a string from a string
226      * of address tokens, returning an InternetAddress object that
227      * represents the address.
228      *
229      * @param tokens The token source for this address.
230      *
231      * @return A parsed out and constructed InternetAddress object for
232      *         the next address.  Returns null if this is an "empty"
233      *         address in a list.
234      * @exception AddressException
235      */
236     private List parseSingleAddress(TokenStream tokens, boolean inGroup) throws AddressException
237     {
238         List parsedAddresses = new ArrayList();
239 
240         // index markers for personal information
241         AddressToken personalStart = null;
242         AddressToken personalEnd = null;
243 
244         // and similar bits for the address information.
245         AddressToken addressStart = null;
246         AddressToken addressEnd = null;
247 
248         // there is a fall-back set of rules allowed that will parse the address as a set of blank delimited
249         // tokens.  However, we do NOT allow this if we encounter any tokens that fall outside of these
250         // rules.  For example, comment fields and quoted strings will disallow the very lenient rule set.
251         boolean nonStrictRules = true;
252 
253         // we don't know the type of address yet
254         int addressType = UNKNOWN;
255 
256         // the parsing goes in two stages.  Stage one runs through the tokens locating the bounds
257         // of the address we're working on, resolving the personal information, and also validating
258         // some of the larger scale syntax features of an address (matched delimiters for routes and
259         // groups, invalid nesting checks, etc.).
260 
261         // get the next token from the queue and save this.  We're going to scan ahead a bit to
262         // figure out what type of address we're looking at, then reset to do the actually parsing
263         // once we've figured out a form.
264         AddressToken first = tokens.nextToken();
265         // push it back on before starting processing.
266         tokens.pushToken(first);
267 
268         // scan ahead for a trigger token that tells us what we've got.
269         while (addressType == UNKNOWN) {
270 
271             AddressToken token = tokens.nextToken();
272             switch (token.type) {
273                 // skip these for now...after we've processed everything and found that this is a simple
274                 // address form, then we'll check for a leading comment token in the first position and use
275                 // if as personal information.
276                 case COMMENT:
277                     // comments do, however, denote that this must be parsed according to RFC822 rules.
278                     nonStrictRules = false;
279                     break;
280 
281                 // a semi-colon when processing a group is an address terminator.  we need to
282                 // process this like a comma then
283                 case SEMICOLON:
284                     if (inGroup) {
285                         // we need to push the terminator back on for the caller to see.
286                         tokens.pushToken(token);
287                         // if we've not tagged any tokens as being the address beginning, so this must be a
288                         // null address.
289                         if (addressStart == null) {
290                             // just return the empty list from this.
291                             return parsedAddresses;
292                         }
293                         // the end token is the back part.
294                         addressEnd = tokens.previousToken(token);
295                         // without a '<' for a route addr, we can't distinguish address tokens from personal data.
296                         // We'll use a leading comment, if there is one.
297                         personalStart = null;
298                         // this is just a simple form.
299                         addressType = SIMPLE_ADDR;
300                         break;
301                     }
302 
303                 // NOTE:  The above falls through if this is not a group.
304 
305                 // any of these tokens are a real token that can be the start of an address.  Many of
306                 // them are not valid as first tokens in this context, but we flag them later if validation
307                 // has been requested.  For now, we just mark these as the potential address start.
308                 case DOMAIN_LITERAL:
309                 case QUOTED_LITERAL:
310                     // this set of tokens require fuller RFC822 parsing, so turn off the flag.
311                     nonStrictRules = false;
312 
313                 case ATOM:
314                 case AT_SIGN:
315                 case PERIOD:
316                     // if we're not determined the start of the address yet, then check to see if we
317                     // need to consider this the personal start.
318                     if (addressStart == null) {
319                         if (personalStart == null) {
320                             personalStart = token;
321                         }
322                         // This is the first real token of the address, which at this point can
323                         // be either the personal info or the first token of the address.  If we hit
324                         // an address terminator without encountering either a route trigger or group
325                         // trigger, then this is the real address.
326                         addressStart = token;
327                     }
328                     break;
329 
330                 // a LEFT_ANGLE indicates we have a full RFC822 mailbox form.  The leading phrase
331                 // is the personal info.  The address is inside the brackets.
332                 case LEFT_ANGLE:
333                     // a route address automatically switches off the blank-delimited token mode.
334                     nonStrictRules = false;
335                     // this is a route address
336                     addressType = ROUTE_ADDR;
337                     // the address is placed in the InternetAddress object without the route
338                     // brackets, so our start is one past this.
339                     addressStart = tokens.nextRealToken();
340                     // push this back on the queue so the scanner picks it up properly.
341                     tokens.pushToken(addressStart);
342                     // make sure we flag the end of the personal section too.
343                     if (personalStart != null) {
344                         personalEnd = tokens.previousToken(token);
345                     }
346                     // scan the rest of a route address.
347                     addressEnd = scanRouteAddress(tokens, false);
348                     break;
349 
350                 // a COLON indicates this is a group specifier...parse the group.
351                 case COLON:
352                     // Colons would not be valid in simple lists, so turn it off.
353                     nonStrictRules = false;
354                     // if we're scanning a group, we shouldn't encounter a ":".  This is a
355                     // recursion error if found.
356                     if (inGroup) {
357                         illegalAddress("Nested group element", token);
358                     }
359                     addressType = GROUP_ADDR;
360                     // groups don't have any personal sections.
361                     personalStart = null;
362                     // our real start was back at the beginning
363                     addressStart = first;
364                     addressEnd = scanGroupAddress(tokens);
365                     break;
366 
367                 // a semi colon can the same as a comma if we're processing a group.
368 
369 
370                 // reached the end of string...this might be a null address, or one of the very simple name
371                 // forms used for non-strict RFC822 versions.  Reset, and try that form
372                 case END_OF_TOKENS:
373                     // if we're scanning a group, we shouldn't encounter an end token.  This is an
374                     // error if found.
375                     if (inGroup) {
376                         illegalAddress("Missing ';'", token);
377                     }
378 
379                     // NOTE:  fall through from above.
380 
381                 // this is either a terminator for an address list or a a group terminator.
382                 case COMMA:
383                     // we need to push the terminator back on for the caller to see.
384                     tokens.pushToken(token);
385                     // if we've not tagged any tokens as being the address beginning, so this must be a
386                     // null address.
387                     if (addressStart == null) {
388                         // just return the empty list from this.
389                         return parsedAddresses;
390                     }
391                     // the end token is the back part.
392                     addressEnd = tokens.previousToken(token);
393                     // without a '<' for a route addr, we can't distinguish address tokens from personal data.
394                     // We'll use a leading comment, if there is one.
395                     personalStart = null;
396                     // this is just a simple form.
397                     addressType = SIMPLE_ADDR;
398                     break;
399 
400                 // right angle tokens are pushed, because parsing of the bracketing is not necessarily simple.
401                 // we need to flag these here.
402                 case RIGHT_ANGLE:
403                     illegalAddress("Unexpected '>'", token);
404 
405             }
406         }
407 
408         String personal = null;
409 
410         // if we have personal data, then convert it to a string value.
411         if (personalStart != null) {
412             TokenStream personalTokens = tokens.section(personalStart, personalEnd);
413             personal = personalToString(personalTokens);
414         }
415         // if we have a simple address, then check the first token to see if it's a comment.  For simple addresses,
416         // we'll accept the first comment token as the personal information.
417         else {
418             if (addressType == SIMPLE_ADDR && first.type == COMMENT) {
419                 personal = first.value;
420             }
421         }
422 
423         TokenStream addressTokens = tokens.section(addressStart, addressEnd);
424 
425         // if this is one of the strictly RFC822 types, then we always validate the address.  If this is a
426         // a simple address, then we only validate if strict parsing rules are in effect or we've been asked
427         // to validate.
428         if (validationLevel != PARSE_HEADER) {
429             switch (addressType) {
430                 case GROUP_ADDR:
431                     validateGroup(addressTokens);
432                     break;
433 
434                 case ROUTE_ADDR:
435                     validateRouteAddr(addressTokens, false);
436                     break;
437 
438                 case SIMPLE_ADDR:
439                     // this is a conditional validation
440                     validateSimpleAddress(addressTokens);
441                     break;
442             }
443         }
444 
445         // more complex addresses and addresses containing tokens other than just simple addresses
446         // need proper handling.
447         if (validationLevel != NONSTRICT || addressType != SIMPLE_ADDR || !nonStrictRules) {
448             // we might have traversed this already when we validated, so reset the
449             // position before using this again.
450             addressTokens.reset();
451             String address = addressToString(addressTokens);
452 
453             // get the parsed out sections as string values.
454             InternetAddress result = new InternetAddress();
455             result.setAddress(address);
456             try {
457                 result.setPersonal(personal);
458             } catch (UnsupportedEncodingException e) {
459             }
460             // even though we have a single address, we return this as an array.  Simple addresses
461             // can be produce an array of items, so we need to return everything.
462             parsedAddresses.add(result);
463             return parsedAddresses;
464         }
465         else {
466             addressTokens.reset();
467 
468             TokenStream nextAddress = addressTokens.getBlankDelimitedToken();
469             while (nextAddress != null) {
470                 String address = addressToString(nextAddress);
471                 // get the parsed out sections as string values.
472                 InternetAddress result = new InternetAddress();
473                 result.setAddress(address);
474                 parsedAddresses.add(result);
475                 nextAddress = addressTokens.getBlankDelimitedToken();
476             }
477             return parsedAddresses;
478         }
479     }
480 
481 
482     /**
483      * Scan the token stream, parsing off a route addr spec.  The callers have already
484      * consumed the opening '<'.  This does some basic syntax validation, but does not
485      * validate the address information.  The token after '>' is pushed back on the stream.
486      *
487      * @param tokens  The stream of tokens.
488      * @param inGroup true if scanning a group member, where the terminator must be ',' or ';'.
489      *
490      * @return The last token of the route address (the one preceding the terminating '>').
491      * @exception AddressException Thrown for tokens that are illegal in this context.
492      */
493     private AddressToken scanRouteAddress(TokenStream tokens, boolean inGroup) throws AddressException {
494         // get the first token and ensure we have something between the "<" and ">".
495         AddressToken token = tokens.nextRealToken();
496         // "previous" (declared below) tracks the last token processed before the current one.  Once
497         // the right angle bracket is encountered, it is the actual end of the address.
498 
499         AddressToken previous = null;
500 
501         // if this route-addr has route information, the first token after the '<' must be a '@'.
502         // this determines if/where a colon or comma can appear.
503         boolean inRoute = token.type == AT_SIGN;
504 
505         // now scan until we reach the terminator.  The only validation is done on illegal characters.
506         while (true) {
507             switch (token.type) {
508                 // The following tokens are all valid between the brackets, so just skip over them.
509                 case ATOM:
510                 case QUOTED_LITERAL:
511                 case DOMAIN_LITERAL:
512                 case PERIOD:
513                 case AT_SIGN:
514                     break;
515 
516                 case COLON:
517                     // if not processing route information, this is illegal.
518                     if (!inRoute) {
519                         illegalAddress("Unexpected ':'", token);
520                     }
521                     // this is the end of the route information, the rules now change.
522                     inRoute = false;
523                     break;
524 
525                 case COMMA:
526                     // if not processing route information, this is illegal.
527                     if (!inRoute) {
528                         illegalAddress("Unexpected ','", token);
529                     }
530                     break;
531 
532                 case RIGHT_ANGLE:
533                     // if previous is null, we've had a route address which is "<>".  That's illegal.
534                     if (previous == null) {
535                         illegalAddress("Illegal address", token);
536                     }
537                     // step to the next token..this had better be either a comma for another address or
538                     // the very end of the address list .
539                     token = tokens.nextRealToken();
540                     // if we're scanning part of a group, then the allowed terminators are either ',' or ';'.
541                     if (inGroup) {
542                         if (token.type != COMMA && token.type != SEMICOLON) {
543                             illegalAddress("Illegal address", token);
544                         }
545                     }
546                     // a normal address should have either a ',' for a list or the end.
547                     else {
548                         if (token.type != COMMA && token.type != END_OF_TOKENS) {
549                             illegalAddress("Illegal address", token);
550                         }
551                     }
552                     // we need to push the termination token back on.
553                     tokens.pushToken(token);
554                     // return the previous token as the updated position.
555                     return previous;
556 
557                 case END_OF_TOKENS:
558                     illegalAddress("Missing '>'", token);   // no break: illegalAddress is expected to throw -- TODO confirm
559 
560                 // now for the illegal ones in this context.
561                 case SEMICOLON:
562                     illegalAddress("Unexpected ';'", token);   // no break: see note on END_OF_TOKENS
563 
564                 case LEFT_ANGLE:
565                     illegalAddress("Unexpected '<'", token);
566             }
567             // remember the previous token.
568             previous = token;
569             token = tokens.nextRealToken();
570         }
571     }
572 
573 
574     /**
575      * Scan the token stream, parsing off a group address.  The caller has
576      * already consumed the ':' that opens the member list.  This does some
577      * basic syntax validation, but does not validate any of the address
578      * information.  The token following the ';' is pushed back on the stream.
579      *
580      * @param tokens The stream of tokens.
581      * @return The last token of the group address (the terminating ';').
582      * @exception AddressException Thrown for tokens that are illegal in a group.
583      */
584     private AddressToken scanGroupAddress(TokenStream tokens) throws AddressException {
585         // A group does not require that there be anything between the ':' and ';'.  This is
586         // just a group with an empty list.
587         AddressToken token = tokens.nextRealToken();
588 
589         // now scan until we reach the terminator.  The only validation is done on illegal characters.
590         while (true) {
591             switch (token.type) {
592                 // The following tokens are all valid in group addresses, so just skip over them.
593                 case ATOM:
594                 case QUOTED_LITERAL:
595                 case DOMAIN_LITERAL:
596                 case PERIOD:
597                 case AT_SIGN:
598                 case COMMA:
599                     break;
600 
601                 case COLON:
602                     illegalAddress("Nested group", token);   // no break: illegalAddress is expected to throw -- TODO confirm
603 
604                 // route address within a group specifier....we need to at least verify the bracket nesting
605                 // and higher level syntax of the route.
606                 case LEFT_ANGLE:
607                     scanRouteAddress(tokens, true);
608                     break;
609 
610                 // the only allowed terminator is the ';'
611                 case END_OF_TOKENS:
612                     illegalAddress("Missing ';'", token);   // no break: see note on COLON
613 
614                 // the ';' terminates the group.
615                 case SEMICOLON:
616                     // verify there's nothing illegal after this.
617                     AddressToken next = tokens.nextRealToken();
618                     if (next.type != COMMA && next.type != END_OF_TOKENS) {
619                         illegalAddress("Illegal address", next);   // report the offending token, not the ';'
620                     }
621                     // don't forget to put this back on...our caller will need it.
622                     tokens.pushToken(next);
623                     return token;
624 
625                 case RIGHT_ANGLE:
626                     illegalAddress("Unexpected '>'", token);
627             }
628             token = tokens.nextRealToken();
629         }
630     }
631 
632 
633     /**
634      * Parse the provided internet address into a set of tokens.  This
635      * phase only does a syntax check on the tokens.  The interpretation
636      * of the tokens is the next phase.
637      *
638      * @exception AddressException
639      */
640     private TokenStream tokenizeAddress() throws AddressException {
641 
642         // get a list for the set of tokens
643         TokenStream tokens = new TokenStream();
644 
645         end = addresses.length();    // our parsing end marker
646 
647         // now scan along the string looking for the special characters in an internet address.
648         while (moreCharacters()) {
649             char ch = currentChar();
650 
651             switch (ch) {
652                 // start of a comment bit...ignore everything until we hit a closing paren.
653                 case '(':
654                     scanComment(tokens);
655                     break;
656                 // a closing paren found outside of normal processing.
657                 case ')':
658                     syntaxError("Unexpected ')'", position);
659 
660 
661                 // start of a quoted string
662                 case '"':
663                     scanQuotedLiteral(tokens);
664                     break;
665                 // domain literal
666                 case '[':
667                     scanDomainLiteral(tokens);
668                     break;
669 
670                 // a naked closing bracket...not valid except as part of a domain literal.
671                 case ']':
672                     syntaxError("Unexpected ']'", position);
673 
674                 // special character delimiters
675                 case '<':
676                     tokens.addToken(new AddressToken(LEFT_ANGLE, position));
677                     nextChar();
678                     break;
679 
680                 // a naked closing bracket...not valid without a starting one, but
681                 // we need to handle this in context.
682                 case '>':
683                     tokens.addToken(new AddressToken(RIGHT_ANGLE, position));
684                     nextChar();
685                     break;
686                 case ':':
687                     tokens.addToken(new AddressToken(COLON, position));
688                     nextChar();
689                     break;
690                 case ',':
691                     tokens.addToken(new AddressToken(COMMA, position));
692                     nextChar();
693                     break;
694                 case '.':
695                     tokens.addToken(new AddressToken(PERIOD, position));
696                     nextChar();
697                     break;
698                 case ';':
699                     tokens.addToken(new AddressToken(SEMICOLON, position));
700                     nextChar();
701                     break;
702                 case '@':
703                     tokens.addToken(new AddressToken(AT_SIGN, position));
704                     nextChar();
705                     break;
706 
707                 // white space characters.  These are mostly token delimiters, but there are some relaxed
708                 // situations where they get processed, so we need to add a white space token for the first
709                 // one we encounter in a span.
710                 case ' ':
711                 case '\t':
712                 case '\r':
713                 case '\n':
714                     // add a single white space token
715                     tokens.addToken(new AddressToken(WHITESPACE, position));
716 
717                     nextChar();
718                     // step over any space characters, leaving us positioned either at the end
719                     // or the first
720                     while (moreCharacters()) {
721                         char nextChar = currentChar();
722                         if (nextChar == ' ' || nextChar == '\t' || nextChar == '\r' || nextChar == '\n') {
723                             nextChar();
724                         }
725                         else {
726                             break;
727                         }
728                     }
729                     break;
730 
731                 // potentially an atom...if it starts with an allowed atom character, we
732                 // parse out the token, otherwise this is invalid.
733                 default:
734                     if (ch < 040 || ch >= 0177) {
735                         syntaxError("Illegal character in address", position);
736                     }
737 
738                     scanAtom(tokens);
739                     break;
740             }
741         }
742 
743         // for this end marker, give an end position.
744         tokens.addToken(new AddressToken(END_OF_TOKENS, addresses.length()));
745         return tokens;
746     }
747 
748 
749     /**
750      * Step to the next character position while parsing.
751      */
752     private void nextChar() {
753         position++;
754     }
755 
756 
757     /**
758      * Retrieve the character at the current parsing position.
759      *
760      * @return The current character.
761      */
762     private char currentChar() {
763         return addresses.charAt(position);
764     }
765 
766     /**
767      * Test if there are more characters left to parse.
768      *
769      * @return True if we've hit the last character, false otherwise.
770      */
771     private boolean moreCharacters() {
772         return position < end;
773     }
774 
775 
776     /**
777      * Parse a quoted string as specified by the RFC822 specification.
778      *
779      * @param tokens The TokenStream where the parsed out token is added.
780      */
781     private void scanQuotedLiteral(TokenStream tokens) throws AddressException {
782         StringBuffer value = new StringBuffer();
783 
784         // save the start position for the token.
785         int startPosition = position;
786         // step over the quote delimiter.
787         nextChar();
788 
789         while (moreCharacters()) {
790             char ch = currentChar();
791 
792             // is this an escape char?
793             if (ch == '\\') {
794                 // step past this, and grab the following character
795                 nextChar();
796                 if (!moreCharacters()) {
797                     syntaxError("Missing '\"'", position);
798                 }
799                 value.append(currentChar());
800             }
801             // end of the string?
802             else if (ch == '"') {
803                 // return the constructed string.
804                 tokens.addToken(new AddressToken(value.toString(), QUOTED_LITERAL, position));
805                 // step over the close delimiter for the benefit of the next token.
806                 nextChar();
807                 return;
808             }
809             // the RFC822 spec disallows CR characters.
810             else if (ch == '\r') {
811                 syntaxError("Illegal line end in literal", position);
812             }
813             else
814             {
815                 value.append(ch);
816             }
817             nextChar();
818         }
819         // missing delimiter
820         syntaxError("Missing '\"'", position);
821     }
822 
823 
824     /**
825      * Parse a domain literal as specified by the RFC822 specification.
826      *
827      * @param tokens The TokenStream where the parsed out token is added.
828      */
829     private void scanDomainLiteral(TokenStream tokens) throws AddressException {
830         StringBuffer value = new StringBuffer();
831 
832         int startPosition = position;
833         // step over the quote delimiter.
834         nextChar();
835 
836         while (moreCharacters()) {
837             char ch = currentChar();
838 
839             // is this an escape char?
840             if (ch == '\\') {
841                 // because domain literals don't get extra escaping, we render them
842                 // with the escaped characters intact.  Therefore, append the '\' escape
843                 // first, then append the escaped character without examination.
844                 value.append(currentChar());
845                 // step past this, and grab the following character
846                 nextChar();
847                 if (!moreCharacters()) {
848                     syntaxError("Missing '\"'", position);
849                 }
850                 value.append(currentChar());
851             }
852             // end of the string?
853             else if (ch == ']') {
854                 // return the constructed string.
855                 tokens.addToken(new AddressToken(value.toString(), DOMAIN_LITERAL, startPosition));
856                 // step over the close delimiter for the benefit of the next token.
857                 nextChar();
858                 return;
859             }
860             // the RFC822 spec says no nesting
861             else if (ch == '[') {
862                 syntaxError("Unexpected '['", position);
863             }
864             // carriage returns are similarly illegal.
865             else if (ch == '\r') {
866                 syntaxError("Illegal line end in domain literal", position);
867             }
868             else
869             {
870                 value.append(ch);
871             }
872             nextChar();
873         }
874         // missing delimiter
875         syntaxError("Missing ']'", position);
876     }
877 
878     /**
879      * Scan an atom in an internet address, using the RFC822 rules
880      * for atom delimiters.
881      *
882      * @param tokens The TokenStream where the parsed out token is added.
883      */
884     private void scanAtom(TokenStream tokens) throws AddressException {
885         int start = position;
886         nextChar();
887         while (moreCharacters()) {
888 
889             char ch = currentChar();
890             if (isAtom(ch)) {
891                 nextChar();
892             }
893             else {
894                 break;
895             }
896         }
897 
898         // return the scanned part of the string.
899         tokens.addToken(new AddressToken(addresses.substring(start, position), ATOM, start));
900     }
901 
902 
903     /**
904      * Parse an internet address comment field as specified by
905      * RFC822.  Includes support for quoted characters and nesting.
906      *
907      * @param tokens The TokenStream where the parsed out token is added.
908      */
909     private void scanComment(TokenStream tokens) throws AddressException {
910         StringBuffer value = new StringBuffer();
911 
912         int startPosition = position;
913         // step past the start character
914         nextChar();
915 
916         // we're at the top nesting level on the comment.
917         int nest = 1;
918 
919         // scan while we have more characters.
920         while (moreCharacters()) {
921             char ch = currentChar();
922             // escape character?
923             if (ch == '\\') {
924                 // step over this...if escaped, we must have at least one more character
925                 // in the string.
926                 nextChar();
927                 if (!moreCharacters()) {
928                     syntaxError("Missing ')'", position);
929                 }
930                 value.append(currentChar());
931             }
932             // nested comment?
933             else if (ch == '(') {
934                 // step the nesting level...we treat the comment as a single unit, with the delimiters
935                 // for the nested comments embedded in the middle
936                 nest++;
937                 value.append(ch);
938             }
939             // is this the comment close?
940             else if (ch == ')') {
941                 // reduce the nesting level.  If we still have more to process, add the delimiter character
942                 // and keep going.
943                 nest--;
944                 if (nest > 0) {
945                     value.append(ch);
946                 }
947                 else {
948                     // step past this and return.  The outermost comment delimiter is not included in
949                     // the string value, since this is frequently used as personal data on the
950                     // InternetAddress objects.
951                     nextChar();
952                     tokens.addToken(new AddressToken(value.toString(), COMMENT, startPosition));
953                     return;
954                 }
955             }
956             else if (ch == '\r') {
957                 syntaxError("Illegal line end in comment", position);
958             }
959             else {
960                 value.append(ch);
961             }
962             // step to the next character.
963             nextChar();
964         }
965         // ran out of data before seeing the closing bit, not good
966         syntaxError("Missing ')'", position);
967     }
968 
969 
970     /**
971      * Validate the syntax of an RFC822 group internet address specification.
972      *
973      * @param tokens The stream of tokens for the address.
974      *
975      * @exception AddressException
976      */
977     private void validateGroup(TokenStream tokens) throws AddressException {
978         // we know already this is an address in the form "phrase:group;".  Now we need to validate the
979         // elements.
980 
981         int phraseCount = 0;
982 
983         AddressToken token = tokens.nextRealToken();
984         // now scan to the semi color, ensuring we have only word or comment tokens.
985         while (token.type != COLON) {
986             // only these tokens are allowed here.
987             if (token.type != ATOM && token.type != QUOTED_LITERAL) {
988                 invalidToken(token);
989             }
990             phraseCount++;
991             token = tokens.nextRealToken();
992         }
993 
994 
995         // RFC822 groups require a leading phrase in group specifiers.
996         if (phraseCount == 0) {
997             illegalAddress("Missing group identifier phrase", token);
998         }
999 
1000         // now we do the remainder of the parsing using the initial phrase list as the sink...the entire
1001         // address will be converted to a string later.
1002 
1003         // ok, we only know this has been valid up to the ":", now we have some real checks to perform.
1004         while (true) {
1005             // go scan off a mailbox.  if everything goes according to plan, we should be positioned at either
1006             // a comma or a semicolon.
1007             validateGroupMailbox(tokens);
1008 
1009             token = tokens.nextRealToken();
1010 
1011             // we're at the end of the group.  Make sure this is truely the end.
1012             if (token.type == SEMICOLON) {
1013                 token = tokens.nextRealToken();
1014                 if (token.type != END_OF_TOKENS) {
1015                     illegalAddress("Illegal group address", token);
1016                 }
1017                 return;
1018             }
1019 
1020             // if not a semicolon, this better be a comma.
1021             else if (token.type != COMMA) {
1022                 illegalAddress("Illegal group address", token);
1023             }
1024         }
1025     }
1026 
1027 
1028     /**
1029      * Validate the syntax of single mailbox within a group address.
1030      *
1031      * @param tokens The stream of tokens representing the address.
1032      *
1033      * @exception AddressException
1034      */
1035     private void validateGroupMailbox(TokenStream tokens) throws AddressException {
1036         AddressToken first = tokens.nextRealToken();
1037         // is this just a null address in the list?  then push the terminator back and return.
1038         if (first.type == COMMA || first.type == SEMICOLON) {
1039             tokens.pushToken(first);
1040             return;
1041         }
1042 
1043         // now we need to scan ahead to see if we can determine the type.
1044         AddressToken token = first;
1045 
1046 
1047         // we need to scan forward to figure out what sort of address this is.
1048         while (first != null) {
1049             switch (token.type) {
1050                 // until we know the context, these are all just ignored.
1051                 case QUOTED_LITERAL:
1052                 case ATOM:
1053                     break;
1054 
1055                 // a LEFT_ANGLE indicates we have a full RFC822 mailbox form.  The leading phrase
1056                 // is the personal info.  The address is inside the brackets.
1057                 case LEFT_ANGLE:
1058                     tokens.pushToken(first);
1059                     validatePhrase(tokens, false);
1060                     validateRouteAddr(tokens, true);
1061                     return;
1062 
1063                 // we've hit a period as the first non-word token.  This should be part of a local-part
1064                 // of an address.
1065                 case PERIOD:
1066                 // we've hit an "@" as the first non-word token.  This is probably a simple address in
1067                 // the form "user@domain".
1068                 case AT_SIGN:
1069                     tokens.pushToken(first);
1070                     validateAddressSpec(tokens);
1071                     return;
1072 
1073                 // reached the end of string...this might be a null address, or one of the very simple name
1074                 // forms used for non-strict RFC822 versions.  Reset, and try that form
1075                 case COMMA:
1076                 // this is the end of the group...handle it like a comma for now.
1077                 case SEMICOLON:
1078                     tokens.pushToken(first);
1079                     validateAddressSpec(tokens);
1080                     return;
1081 
1082                 case END_OF_TOKENS:
1083                     illegalAddress("Missing ';'", token);
1084 
1085             }
1086             token = tokens.nextRealToken();
1087         }
1088     }
1089 
1090 
1091     /**
1092      * Utility method for throwing an AddressException caused by an
1093      * unexpected primitive token.
1094      *
1095      * @param token  The token causing the problem (must not be a value type token).
1096      *
1097      * @exception AddressException
1098      */
1099     private void invalidToken(AddressToken token) throws AddressException {
1100         illegalAddress("Unexpected '" + token.type + "'", token);
1101     }
1102 
1103 
1104     /**
1105      * Raise an error about illegal syntax.
1106      *
1107      * @param message  The message used in the thrown exception.
1108      * @param position The parsing position within the string.
1109      *
1110      * @exception AddressException
1111      */
1112     private void syntaxError(String message, int position) throws AddressException
1113     {
1114         throw new AddressException(message, addresses, position);
1115     }
1116 
1117 
1118     /**
1119      * Throw an exception based on the position of an invalid token.
1120      *
1121      * @param message The exception message.
1122      * @param token   The token causing the error.  This tokens position is used
1123      *                in the exception information.
1124      */
1125     private void illegalAddress(String message, AddressToken token) throws AddressException {
1126         throw new AddressException(message, addresses, token.position);
1127     }
1128 
1129 
1130     /**
1131      * Validate that a required phrase exists.
1132      *
1133      * @param tokens   The set of tokens to validate. positioned at the phrase start.
1134      * @param required A flag indicating whether the phrase is optional or required.
1135      *
1136      * @exception AddressException
1137      */
1138     private void validatePhrase(TokenStream tokens, boolean required) throws AddressException {
1139         // we need to have at least one WORD token in the phrase...everything is optional
1140         // after that.
1141         AddressToken token = tokens.nextRealToken();
1142         if (token.type != ATOM && token.type != QUOTED_LITERAL) {
1143             if (required) {
1144                 illegalAddress("Missing group phrase", token);
1145             }
1146         }
1147 
1148         // now scan forward to the end of the phrase
1149         token = tokens.nextRealToken();
1150         while (token.type == ATOM || token.type == QUOTED_LITERAL) {
1151             token = tokens.nextRealToken();
1152         }
1153     }
1154 
1155 
1156     /**
1157      * validate a routeaddr specification
1158      *
1159      * @param tokens  The tokens representing the address portion (personal information
1160      *                already removed).
1161      * @param ingroup true indicates we're validating a route address inside a
1162      *                group list.  false indicates we're validating a standalone
1163      *                address.
1164      *
1165      * @exception AddressException
1166      */
1167     private void validateRouteAddr(TokenStream tokens, boolean ingroup) throws AddressException {
1168         // get the next real token.
1169         AddressToken token = tokens.nextRealToken();
1170         // if this is an at sign, then we have a list of domains to parse.
1171         if (token.type == AT_SIGN) {
1172             // push the marker token back in for the route parser, and step past that part.
1173             tokens.pushToken(token);
1174             validateRoute(tokens);
1175         }
1176         else {
1177             // we need to push this back on to validate the local part.
1178             tokens.pushToken(token);
1179         }
1180 
1181         // now we expect to see an address spec.
1182         validateAddressSpec(tokens);
1183 
1184         token = tokens.nextRealToken();
1185         if (ingroup) {
1186             // if we're validating within a group specification, the angle brackets are still there (and
1187             // required).
1188             if (token.type != RIGHT_ANGLE) {
1189                 illegalAddress("Missing '>'", token);
1190             }
1191         }
1192         else {
1193             // the angle brackets were removed to make this an address, so we should be done.  Make sure we
1194             // have a terminator here.
1195             if (token.type != END_OF_TOKENS) {
1196                 illegalAddress("Illegal Address", token);
1197             }
1198         }
1199     }
1200 
1201 
1202 
1203     /**
1204      * Validate a simple address in the form "user@domain".
1205      *
1206      * @param tokens The stream of tokens representing the address.
1207      */
1208     private void validateSimpleAddress(TokenStream tokens) throws AddressException {
1209 
1210         // the validation routines occur after addresses have been split into
1211         // personal and address forms.  Therefore, our validation begins directly
1212         // with the first token.
1213         validateAddressSpec(tokens);
1214 
1215         // get the next token and see if there is something here...anything but the terminator is an error
1216         AddressToken token = tokens.nextRealToken();
1217         if (token.type != END_OF_TOKENS) {
1218             illegalAddress("Illegal Address", token);
1219         }
1220     }
1221 
1222     /**
1223      * Validate the addr-spec portion of an address.  RFC822 requires
1224      * this be of the form "local-part@domain".  However, javamail also
1225      * allows simple address of the form "local-part".  We only require
1226      * the domain if an '@' is encountered.
1227      *
1228      * @param tokens
1229      */
1230     private void validateAddressSpec(TokenStream tokens) throws AddressException {
1231         // all addresses, even the simple ones, must have at least a local part.
1232         validateLocalPart(tokens);
1233 
1234         // now see if we have a domain portion to look at.
1235         AddressToken token = tokens.nextRealToken();
1236         if (token.type == AT_SIGN) {
1237             validateDomain(tokens);
1238         }
1239         else {
1240             // put this back for termination
1241             tokens.pushToken(token);
1242         }
1243 
1244     }
1245 
1246 
1247     /**
1248      * Validate the route portion of a route-addr.  This is a list
1249      * of domain values in the form 1#("@" domain) ":".
1250      *
1251      * @param tokens The token stream holding the address information.
1252      */
1253     private void validateRoute(TokenStream tokens) throws AddressException {
1254         while (true) {
1255             AddressToken token = tokens.nextRealToken();
1256             // if this is the first part of the list, go parse off a domain
1257             if (token.type == AT_SIGN) {
1258                 validateDomain(tokens);
1259             }
1260             // another element in the list?  Go around again
1261             else if (token.type == COMMA) {
1262                 continue;
1263             }
1264             // the list is terminated by a colon...stop this part of the validation once we hit one.
1265             else if (token.type == COLON) {
1266                 return;
1267             }
1268             // the list is terminated by a colon.  If this isn't one of those, we have an error.
1269             else {
1270                 illegalAddress("Missing ':'", token);
1271             }
1272         }
1273     }
1274 
1275 
1276     /**
1277      * Parse the local part of an address spec.  The local part
1278      * is a series of "words" separated by ".".
1279      */
1280     private void validateLocalPart(TokenStream tokens) throws AddressException {
1281         while (true) {
1282             // get the token.
1283             AddressToken token = tokens.nextRealToken();
1284 
1285             // this must be either an atom or a literal.
1286             if (token.type != ATOM && token.type != QUOTED_LITERAL) {
1287                 illegalAddress("Invalid local part", token);
1288             }
1289 
1290             // get the next token (white space and comments ignored)
1291             token = tokens.nextRealToken();
1292             // if this is a period, we continue parsing
1293             if (token.type != PERIOD) {
1294                 tokens.pushToken(token);
1295                 // return the token
1296                 return;
1297             }
1298         }
1299     }
1300 
1301 
1302 
1303     /**
1304      * Parse a domain name of the form sub-domain *("." sub-domain).
1305      * a sub-domain is either an atom or a domain-literal.
1306      */
1307     private void validateDomain(TokenStream tokens) throws AddressException {
1308         while (true) {
1309             // get the token.
1310             AddressToken token = tokens.nextRealToken();
1311 
1312             // this must be either an atom or a domain literal.
1313             if (token.type != ATOM && token.type != DOMAIN_LITERAL) {
1314                 illegalAddress("Invalid domain", token);
1315             }
1316 
1317             // get the next token (white space is ignored)
1318             token = tokens.nextRealToken();
1319             // if this is a period, we continue parsing
1320             if (token.type != PERIOD) {
1321                 // return the token
1322                 tokens.pushToken(token);
1323                 return;
1324             }
1325         }
1326     }
1327 
1328     /**
1329      * Convert a list of word tokens into a phrase string.  The
1330      * rules for this are a little hard to puzzle out, but there
1331      * is a logic to it.  If the list is empty, the phrase is
1332      * just a null value.
1333      *
1334      * If we have a phrase, then the quoted strings need to
1335      * handled appropriately.  In multi-token phrases, the
1336      * quoted literals are concatenated with the quotes intact,
1337      * regardless of content.  Thus a phrase that comes in like this:
1338      *
1339      * "Geronimo" Apache
1340      *
1341      * gets converted back to the same string.
1342      *
1343      * If there is just a single token in the phrase, AND the token
1344      * is a quoted string AND the string does not contain embedded
1345      * special characters ("\.,@<>()[]:;), then the phrase
1346      * is expressed as an atom.  Thus the literal
1347      *
1348      *    "Geronimo"
1349      *
1350      * becomes
1351      *
1352      *    Geronimo
1353      *
1354      * but
1355      *
1356      *    "(Geronimo)"
1357      *
1358      * remains
1359      *
1360      *    "(Geronimo)"
1361      *
1362      * Note that we're generating a canonical form of the phrase,
1363      * which removes comments and reduces linear whitespace down
1364      * to a single separator token.
1365      *
1366      * @param phrase An array list of phrase tokens (which may be empty).
1367      */
1368     private String personalToString(TokenStream tokens) {
1369 
1370         // no tokens in the stream?  This is a null value.
1371         AddressToken token = tokens.nextToken();
1372 
1373         if (token.type == END_OF_TOKENS) {
1374             return null;
1375         }
1376 
1377         AddressToken next = tokens.nextToken();
1378 
1379         // single element phrases get special treatment.
1380         if (next.type == END_OF_TOKENS) {
1381             // this can be used directly...if it contains special characters, quoting will be
1382             // performed when it's converted to a string value.
1383             return token.value;
1384         }
1385 
1386         // reset to the beginning
1387         tokens.pushToken(token);
1388 
1389         // have at least two tokens,
1390         StringBuffer buffer = new StringBuffer();
1391 
1392         // get the first token.  After the first, we add these as blank delimited values.
1393         token = tokens.nextToken();
1394         addTokenValue(token, buffer);
1395 
1396         token = tokens.nextToken();
1397         while (token.type != END_OF_TOKENS) {
1398             // add a blank separator
1399             buffer.append(' ');
1400             // now add the next tokens value
1401             addTokenValue(token, buffer);
1402             token = tokens.nextToken();
1403         }
1404         // and return the canonicalized value
1405         return buffer.toString();
1406     }
1407 
1408 
1409     /**
1410      * take a canonicalized set of address tokens and reformat it back into a string value,
1411      * inserting whitespace where appropriate.
1412      *
1413      * @param tokens The set of tokens representing the address.
1414      *
1415      * @return The string value of the tokens.
1416      */
1417     private String addressToString(TokenStream tokens) {
1418         StringBuffer buffer = new StringBuffer();
1419 
1420         // this flag controls whether we insert a blank delimiter between tokens as
1421         // we advance through the list.  Blanks are only inserted between consequtive value tokens.
1422         // Initially, this is false, then we flip it to true whenever we add a value token, and
1423         // back to false for any special character token.
1424         boolean spaceRequired = false;
1425 
1426         // we use nextToken rather than nextRealToken(), since we need to process the comments also.
1427         AddressToken token = tokens.nextToken();
1428 
1429         // now add each of the tokens
1430         while (token.type != END_OF_TOKENS) {
1431             switch (token.type) {
1432                 // the word tokens are the only ones where we need to worry about adding
1433                 // whitespace delimiters.
1434                 case ATOM:
1435                 case QUOTED_LITERAL:
1436                     // was the last token also a word?  Insert a blank first.
1437                     if (spaceRequired) {
1438                         buffer.append(' ');
1439                     }
1440                     addTokenValue(token, buffer);
1441                     // let the next iteration know we just added a word to the list.
1442                     spaceRequired = true;
1443                     break;
1444 
1445                 // these special characters are just added in.  The constants for the character types
1446                 // were carefully selected to be the character value in question.  This allows us to
1447                 // just append the value.
1448                 case LEFT_ANGLE:
1449                 case RIGHT_ANGLE:
1450                 case COMMA:
1451                 case COLON:
1452                 case AT_SIGN:
1453                 case SEMICOLON:
1454                 case PERIOD:
1455                     buffer.append((char)token.type);
1456                     // no spaces around specials
1457                     spaceRequired = false;
1458                     break;
1459 
1460                 // Domain literals self delimiting...we can just append them and turn off the space flag.
1461                 case DOMAIN_LITERAL:
1462                     addTokenValue(token, buffer);
1463                     spaceRequired = false;
1464                     break;
1465 
1466                 // Comments are also self delimitin.
1467                 case COMMENT:
1468                     addTokenValue(token, buffer);
1469                     spaceRequired = false;
1470                     break;
1471             }
1472             token = tokens.nextToken();
1473         }
1474         return buffer.toString();
1475     }
1476 
1477 
1478     /**
1479      * Append a value token on to a string buffer used to create
1480      * the canonicalized string value.
1481      *
1482      * @param token  The token we're adding.
1483      * @param buffer The target string buffer.
1484      */
1485     private void addTokenValue(AddressToken token, StringBuffer buffer) {
1486         // atom values can be added directly.
1487         if (token.type == ATOM) {
1488             buffer.append(token.value);
1489         }
1490         // a literal value?  Add this as a quoted string
1491         else if (token.type == QUOTED_LITERAL) {
1492             buffer.append(formatQuotedString(token.value));
1493         }
1494         // could be a domain literal of the form "[value]"
1495         else if (token.type == DOMAIN_LITERAL) {
1496             buffer.append('[');
1497             buffer.append(token.value);
1498             buffer.append(']');
1499         }
1500         // comments also have values
1501         else if (token.type == COMMENT) {
1502             buffer.append('(');
1503             buffer.append(token.value);
1504             buffer.append(')');
1505         }
1506     }
1507 
1508 
1509 
    // Character classification table for the 7-bit character range.  The table
    // is indexed directly by character code (0x00 - 0x7f, 16 entries per row) and
    // each entry is a bit mask built from the FLG_* values declared below.  The
    // lookup methods check for characters above 0x7f before indexing the table.
    //
    // NOTE(review): entries 0x08, 0x0a and 0x0d carry FLG_CONTROL | FLG_SPACE (0x06),
    // while 0x09 (horizontal tab) carries only FLG_CONTROL.  It is unclear whether
    // flagging 0x08 rather than 0x09 as a space is intentional -- confirm.
    private static final byte[] CHARMAP = {
        // 0x00 - 0x0f
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  0x06, 0x02, 0x06, 0x02, 0x02, 0x06, 0x02, 0x02,
        // 0x10 - 0x1f
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        // 0x20 - 0x2f:  blank, double quote, parentheses, comma and period are flagged
        0x04, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,  0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00,
        // 0x30 - 0x3f:  colon, semicolon and the angle brackets are flagged
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00,

        // 0x40 - 0x4f:  the at sign is flagged
        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        // 0x50 - 0x5f:  the square brackets and the backslash are flagged
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00,
        // 0x60 - 0x6f
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        // 0x70 - 0x7f:  only DEL (0x7f) is flagged
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
    };

    // bit set in CHARMAP for the delimiter characters: double quote, parentheses, comma,
    // period, colon, semicolon, angle brackets, at sign, square brackets and backslash.
    private static final byte FLG_SPECIAL = 1;
    // bit set in CHARMAP for the codes 0x00 - 0x1f and for 0x7f.
    private static final byte FLG_CONTROL = 2;
    // bit set in CHARMAP for the blank (0x20) and for the codes 0x08, 0x0a and 0x0d.
    private static final byte FLG_SPACE = 4;
1525 
1526     private static boolean isSpace(char ch) {
1527         if (ch > '\u007f') {
1528             return false;
1529         } else {
1530             return (CHARMAP[ch] & FLG_SPACE) != 0;
1531         }
1532     }
1533 
1534     /**
1535      * Quick test to see if a character is an allowed atom character
1536      * or not.
1537      *
1538      * @param ch     The test character.
1539      *
1540      * @return true if this character is allowed in atoms, false for any
1541      *         control characters, special characters, or blanks.
1542      */
1543     public static boolean isAtom(char ch) {
1544         if (ch > '\u007f') {
1545             return false;
1546         }
1547         else if (ch == ' ') {
1548             return false;
1549         }
1550         else {
1551             return (CHARMAP[ch] & (FLG_SPECIAL | FLG_CONTROL)) == 0;
1552         }
1553     }
1554 
1555     /**
1556      * Tests one string to determine if it contains any of the
1557      * characters in a supplied test string.
1558      *
1559      * @param s      The string we're testing.
1560      * @param chars  The set of characters we're testing against.
1561      *
1562      * @return true if any of the characters is found, false otherwise.
1563      */
1564     public static boolean containsCharacters(String s, String chars)
1565     {
1566         for (int i = 0; i < s.length(); i++) {
1567             if (chars.indexOf(s.charAt(i)) >= 0) {
1568                 return true;
1569             }
1570         }
1571         return false;
1572     }
1573 
1574 
1575     /**
1576      * Tests if a string contains any non-special characters that
1577      * would require encoding the value as a quoted string rather
1578      * than a simple atom value.
1579      *
1580      * @param s      The test string.
1581      *
1582      * @return True if the string contains only blanks or allowed atom
1583      *         characters.
1584      */
1585     public static boolean containsSpecials(String s)
1586     {
1587         for (int i = 0; i < s.length(); i++) {
1588             char ch = s.charAt(i);
1589             // must be either a blank or an allowed atom char.
1590             if (ch == ' ' || isAtom(ch)) {
1591                 continue;
1592             }
1593             else {
1594                 return true;
1595             }
1596         }
1597         return false;
1598     }
1599 
1600 
1601     /**
1602      * Tests if a string contains any non-special characters that
1603      * would require encoding the value as a quoted string rather
1604      * than a simple atom value.
1605      *
1606      * @param s      The test string.
1607      *
1608      * @return True if the string contains only blanks or allowed atom
1609      *         characters.
1610      */
1611     public static boolean isAtom(String s)
1612     {
1613         for (int i = 0; i < s.length(); i++) {
1614             char ch = s.charAt(i);
1615             // must be an allowed atom character
1616             if (!isAtom(ch)) {
1617                 return false;
1618             }
1619         }
1620         return true;
1621     }
1622 
1623     /**
1624      * Apply RFC822 quoting rules to a literal string value.  This
1625      * will search the string to see if there are any characters that
1626      * require special escaping, and apply the escapes.  If the
1627      * string is just a string of blank-delimited atoms, the string
1628      * value is returned without quotes.
1629      *
1630      * @param s      The source string.
1631      *
1632      * @return A version of the string as a valid RFC822 quoted literal.
1633      */
1634     public static String quoteString(String s) {
1635 
1636         // only backslash and double quote require escaping.  If the string does not
1637         // contain any of these, then we can just slap on some quotes and go.
1638         if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) {
1639             // if the string is an atom (or a series of blank-delimited atoms), we can just return it directly.
1640             if (!containsSpecials(s)) {
1641                 return s;
1642             }
1643             StringBuffer buffer = new StringBuffer(s.length() + 2);
1644             buffer.append('"');
1645             buffer.append(s);
1646             buffer.append('"');
1647             return buffer.toString();
1648         }
1649 
1650         // get a buffer sufficiently large for the string, two quote characters, and a "reasonable"
1651         // number of escaped values.
1652         StringBuffer buffer = new StringBuffer(s.length() + 10);
1653         buffer.append('"');
1654 
1655         // now check all of the characters.
1656         for (int i = 0; i < s.length(); i++) {
1657             char ch = s.charAt(i);
1658             // character requiring escaping?
1659             if (ch == '\\' || ch == '"') {
1660                 // add an extra backslash
1661                 buffer.append('\\');
1662             }
1663             // and add on the character
1664             buffer.append(ch);
1665         }
1666         buffer.append('"');
1667         return buffer.toString();
1668     }
1669 
1670     /**
1671      * Apply RFC822 quoting rules to a literal string value.  This
1672      * will search the string to see if there are any characters that
1673      * require special escaping, and apply the escapes.  The returned
1674      * value is enclosed in quotes.
1675      *
1676      * @param s      The source string.
1677      *
1678      * @return A version of the string as a valid RFC822 quoted literal.
1679      */
1680     public static String formatQuotedString(String s) {
1681         // only backslash and double quote require escaping.  If the string does not
1682         // contain any of these, then we can just slap on some quotes and go.
1683         if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) {
1684             StringBuffer buffer = new StringBuffer(s.length() + 2);
1685             buffer.append('"');
1686             buffer.append(s);
1687             buffer.append('"');
1688             return buffer.toString();
1689         }
1690 
1691         // get a buffer sufficiently large for the string, two quote characters, and a "reasonable"
1692         // number of escaped values.
1693         StringBuffer buffer = new StringBuffer(s.length() + 10);
1694         buffer.append('"');
1695 
1696         // now check all of the characters.
1697         for (int i = 0; i < s.length(); i++) {
1698             char ch = s.charAt(i);
1699             // character requiring escaping?
1700             if (ch == '\\' || ch == '"') {
1701                 // add an extra backslash
1702                 buffer.append('\\');
1703             }
1704             // and add on the character
1705             buffer.append(ch);
1706         }
1707         buffer.append('"');
1708         return buffer.toString();
1709     }
1710 
1711     public class TokenStream {
1712         // the set of tokens in the parsed address list, as determined by RFC822 syntax rules.
1713         private List tokens;
1714 
1715         // the current token position
1716         int currentToken = 0;
1717 
1718 
1719         /**
1720          * Default constructor for a TokenStream.  This creates an
1721          * empty TokenStream for purposes of tokenizing an address.
1722          * It is the creator's responsibility to terminate the stream
1723          * with a terminator token.
1724          */
1725         public TokenStream() {
1726             tokens = new ArrayList();
1727         }
1728 
1729 
1730         /**
1731          * Construct a TokenStream from a list of tokens.  A terminator
1732          * token is added to the end.
1733          *
1734          * @param tokens An existing token list.
1735          */
1736         public TokenStream(List tokens) {
1737             this.tokens = tokens;
1738             tokens.add(new AddressToken(END_OF_TOKENS, -1));
1739         }
1740 
1741         /**
1742          * Add an address token to the token list.
1743          *
1744          * @param t      The new token to add to the list.
1745          */
1746         public void addToken(AddressToken token) {
1747             tokens.add(token);
1748         }
1749 
1750         /**
1751          * Get the next token at the cursor position, advancing the
1752          * position accordingly.
1753          *
1754          * @return The token at the current token position.
1755          */
1756         public AddressToken nextToken() {
1757             AddressToken token = (AddressToken)tokens.get(currentToken++);
1758             // we skip over white space tokens when operating in this mode, so
1759             // check the token and iterate until we get a non-white space.
1760             while (token.type == WHITESPACE) {
1761                 token = (AddressToken)tokens.get(currentToken++);
1762             }
1763             return token;
1764         }
1765 
1766 
1767         /**
1768          * Get the next token at the cursor position, without advancing the
1769          * position.
1770          *
1771          * @return The token at the current token position.
1772          */
1773         public AddressToken currentToken() {
1774             // return the current token and step the cursor
1775             return (AddressToken)tokens.get(currentToken);
1776         }
1777 
1778 
1779         /**
1780          * Get the next non-comment token from the string.  Comments are ignored, except as personal information
1781          * for very simple address specifications.
1782          *
1783          * @return A token guaranteed not to be a whitespace token.
1784          */
1785         public AddressToken nextRealToken()
1786         {
1787             AddressToken token = nextToken();
1788             if (token.type == COMMENT) {
1789                 token = nextToken();
1790             }
1791             return token;
1792         }
1793 
1794         /**
1795          * Push a token back on to the queue, making the index of this
1796          * token the current cursor position.
1797          *
1798          * @param token  The token to push.
1799          */
1800         public void pushToken(AddressToken token) {
1801             // just reset the cursor to the token's index position.
1802             currentToken = tokenIndex(token);
1803         }
1804 
1805         /**
1806          * Get the next token after a given token, without advancing the
1807          * token position.
1808          *
1809          * @param token  The token we're retrieving a token relative to.
1810          *
1811          * @return The next token in the list.
1812          */
1813         public AddressToken nextToken(AddressToken token) {
1814             return (AddressToken)tokens.get(tokenIndex(token) + 1);
1815         }
1816 
1817 
1818         /**
1819          * Return the token prior to a given token.
1820          *
1821          * @param token  The token used for the index.
1822          *
1823          * @return The token prior to the index token in the list.
1824          */
1825         public AddressToken previousToken(AddressToken token) {
1826             return (AddressToken)tokens.get(tokenIndex(token) - 1);
1827         }
1828 
1829 
1830         /**
1831          * Retrieve a token at a given index position.
1832          *
1833          * @param index  The target index.
1834          */
1835         public AddressToken getToken(int index)
1836         {
1837             return (AddressToken)tokens.get(index);
1838         }
1839 
1840 
1841         /**
1842          * Retrieve the index of a particular token in the stream.
1843          *
1844          * @param token  The target token.
1845          *
1846          * @return The index of the token within the stream.  Returns -1 if this
1847          *         token is somehow not in the stream.
1848          */
1849         public int tokenIndex(AddressToken token) {
1850             return tokens.indexOf(token);
1851         }
1852 
1853 
1854         /**
1855          * Extract a new TokenStream running from the start token to the
1856          * token preceeding the end token.
1857          *
1858          * @param start  The starting token of the section.
1859          * @param end    The last token (+1) for the target section.
1860          *
1861          * @return A new TokenStream object for processing this section of tokens.
1862          */
1863         public TokenStream section(AddressToken start, AddressToken end) {
1864             int startIndex = tokenIndex(start);
1865             int endIndex = tokenIndex(end);
1866 
1867             // List.subList() returns a list backed by the original list.  Since we need to add a
1868             // terminator token to this list when we take the sublist, we need to manually copy the
1869             // references so we don't end up munging the original list.
1870             ArrayList list = new ArrayList(endIndex - startIndex + 2);
1871 
1872             for (int i = startIndex; i <= endIndex; i++) {
1873                 list.add(tokens.get(i));
1874             }
1875             return new TokenStream(list);
1876         }
1877 
1878 
1879         /**
1880          * Reset the token position back to the beginning of the
1881          * stream.
1882          */
1883         public void reset() {
1884             currentToken = 0;
1885         }
1886 
1887         /**
1888          * Scan forward looking for a non-blank token.
1889          *
1890          * @return The first non-blank token in the stream.
1891          */
1892         public AddressToken getNonBlank()
1893         {
1894             AddressToken token = currentToken();
1895             while (token.type == WHITESPACE) {
1896                 currentToken++;
1897                 token = currentToken();
1898             }
1899             return token;
1900         }
1901 
1902 
1903         /**
1904          * Extract a blank delimited token from a TokenStream.  A blank
1905          * delimited token is the set of tokens up to the next real whitespace
1906          * token (comments not included).
1907          *
1908          * @return A TokenStream object with the new set of tokens.
1909          */
1910         public TokenStream getBlankDelimitedToken()
1911         {
1912             // get the next non-whitespace token.
1913             AddressToken first = getNonBlank();
1914             // if this is the end, we return null.
1915             if (first.type == END_OF_TOKENS) {
1916                 return null;
1917             }
1918 
1919             AddressToken last = first;
1920 
1921             // the methods for retrieving tokens skip over whitespace, so we're going to process this
1922             // by index.
1923             currentToken++;
1924 
1925             AddressToken token = currentToken();
1926             while (true) {
1927                 // if this is our marker, then pluck out the section and return it.
1928                 if (token.type == END_OF_TOKENS || token.type == WHITESPACE) {
1929                     return section(first, last);
1930                 }
1931                 last = token;
1932                 currentToken++;
1933                 // we accept any and all tokens here.
1934                 token = currentToken();
1935             }
1936         }
1937 
1938         /**
1939          * Return the index of the current cursor position.
1940          *
1941          * @return The integer index of the current token.
1942          */
1943         public int currentIndex() {
1944             return currentToken;
1945         }
1946 
1947         public void dumpTokens()
1948         {
1949             System.out.println(">>>>>>>>> Start dumping TokenStream tokens");
1950             for (int i = 0; i < tokens.size(); i++) {
1951                 System.out.println("-------- Token: " + tokens.get(i));
1952             }
1953 
1954             System.out.println("++++++++ cursor position=" + currentToken);
1955             System.out.println(">>>>>>>>> End dumping TokenStream tokens");
1956         }
1957     }
1958 
1959 
1960     /**
1961      * Simple utility class for representing address tokens.
1962      */
1963     public class AddressToken {
1964 
1965         // the token type
1966         int type;
1967 
1968         // string value of the token (can be null)
1969         String value;
1970 
1971         // position of the token within the address string.
1972         int position;
1973 
1974         AddressToken(int type, int position)
1975         {
1976             this.type = type;
1977             this.value = null;
1978             this.position = position;
1979         }
1980 
1981         AddressToken(String value, int type, int position)
1982         {
1983             this.type = type;
1984             this.value = value;
1985             this.position = position;
1986         }
1987 
1988         public String toString()
1989         {
1990             if (type == END_OF_TOKENS) {
1991                 return "AddressToken:  type=END_OF_TOKENS";
1992             }
1993             if (value == null) {
1994                 return "AddressToken:  type=" + (char)type;
1995             }
1996             else {
1997                 return "AddressToken:  type=" + (char)type + " value=" + value;
1998             }
1999         }
2000     }
2001 }
2002