View Javadoc

1   /**
2    *
3    * Copyright 2003-2006 The Apache Software Foundation
4    *
5    *  Licensed under the Apache License, Version 2.0 (the "License");
6    *  you may not use this file except in compliance with the License.
7    *  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  
18  package javax.mail.internet;
19  
20  import java.io.UnsupportedEncodingException;
21  import java.lang.reflect.Array;
22  import java.util.ArrayList;
23  import java.util.List;
24  
25  class AddressParser {
26  
27      // the validation strictness levels, from most lenient to most conformant.
28      static public final int NONSTRICT = 0;       // the only level at which parseSingleAddress() uses its blank-delimited fallback
29      static public final int PARSE_HEADER = 1;    // at this level parseSingleAddress() skips the validate*() calls
30      static public final int STRICT = 2;
31  
32      // the address forms distinguished by parseSingleAddress().  UNKNOWN is the state before a form has been determined.
33      static protected final int UNKNOWN = 0;
34      static protected final int ROUTE_ADDR = 1;    // selected when a '<' token is seen
35      static protected final int GROUP_ADDR = 2;    // selected when a ':' token is seen
36      static protected final int SIMPLE_ADDR = 3;   // selected when a terminator is reached first
37  
38      // constants for token types.  Each code is a character literal:  the delimiter itself for punctuation tokens, and a stand-in character for the rest.
39      static protected final int END_OF_TOKENS = '\0';
40      static protected final int PERIOD = '.';
41      static protected final int LEFT_ANGLE = '<';
42      static protected final int RIGHT_ANGLE = '>';
43      static protected final int COMMA = ',';
44      static protected final int AT_SIGN = '@';
45      static protected final int SEMICOLON = ';';
46      static protected final int COLON = ':';
47      static protected final int QUOTED_LITERAL = '"';
48      static protected final int DOMAIN_LITERAL = '[';
49      static protected final int COMMENT = '(';
50      static protected final int ATOM = 'A';
51      static protected final int WHITESPACE = ' ';
52  
53  
54      // the string we're parsing
55      private String addresses;
56      // the current parsing position
57      private int    position;
58      // the end position of the string; tokenizeAddress() sets this to addresses.length()
59      private int    end;
60      // the strictness flag handed to the constructor (expected to be NONSTRICT, PARSE_HEADER or STRICT; not checked there)
61      private int validationLevel;
62  
63      public AddressParser(String addresses, int validation) {
64          this.validationLevel = validation;    // strictness level consulted later by parseSingleAddress()
65          this.addresses = addresses;           // raw text; it is only tokenized once a parse/validate method is called
66      }
67  
68  
69      /**
70       * Parse the address string as a comma-separated address list,
71       * collecting every address found in it.
72       *
73       * @return An array containing all of the non-null addresses in the list.
74       * @exception AddressException
75       *                   Thrown for any validation errors.
76       */
77      public InternetAddress[] parseAddressList() throws AddressException
78      {
79          // break the address string into a token stream first.
80          TokenStream stream = tokenizeAddress();
81  
82          // accumulates each address as it is parsed off.
83          ArrayList parsed = new ArrayList();
84  
85          // consume one list element per pass.  parseSingleAddress() hands back an
86          // empty list for a null element, so addAll() contributes nothing for those.
87          AddressToken delimiter;
88          do {
89              parsed.addAll(parseSingleAddress(stream, false));
90              // the element's terminator was pushed back for us to read:  anything other
91              // than the end-of-stream marker means another element follows.
92              delimiter = stream.nextToken();
93          } while (delimiter.type != END_OF_TOKENS);
94  
95          // copy the accumulated addresses into a correctly typed array.
96          InternetAddress[] result = new InternetAddress[parsed.size()];
97          parsed.toArray(result);
98          return result;
99      }
100 
101 
102     /**
103      * Parse a single internet address.  This must be a single address,
104      * not an address list.
105      *
106      * @exception AddressException
107      */
108     public InternetAddress parseAddress() throws AddressException
109     {
110         // get the address as a set of tokens we can process.
111         TokenStream tokens = tokenizeAddress();
112 
113         // parse off a single address.  Address lists can have null elements,
114         // so this might return a null value.  The null value does not get added
115         // to the address accumulator.
116         List addressList = parseSingleAddress(tokens, false);
117         // we must get exactly one address back from this.
118         if (addressList.isEmpty()) {
119             throw new AddressException("Null address", addresses, 0);
120         }
121         // this could be a simple list of blank delimited tokens.  Ensure we only got one back.
122         if (addressList.size() > 1) {
123             throw new AddressException("Illegal Address", addresses, 0);
124         }
125 
126         // This token must be a stream stream terminator, or we have an error.
127         AddressToken token = tokens.nextToken();
128         if (token.type != END_OF_TOKENS) {
129             illegalAddress("Illegal Address", token);
130         }
131 
132         return (InternetAddress)addressList.get(0);
133     }
134 
135 
136     /**
137      * Validate an internet address.  This must be a single address,
138      * not a list of addresses.  The address also must not contain
139      * and personal information to be valid.
140      *
141      * @exception AddressException
142      */
143     public void validateAddress() throws AddressException
144     {
145         // get the address as a set of tokens we can process.
146         TokenStream tokens = tokenizeAddress();
147 
148         // parse off a single address.  Address lists can have null elements,
149         // so this might return a null value.  The null value does not get added
150         // to the address accumulator.
151         List addressList = parseSingleAddress(tokens, false);
152         if (addressList.isEmpty()) {
153             throw new AddressException("Null address", addresses, 0);
154         }
155 
156         // this could be a simple list of blank delimited tokens.  Ensure we only got one back.
157         if (addressList.size() > 1) {
158             throw new AddressException("Illegal Address", addresses, 0);
159         }
160 
161         InternetAddress address = (InternetAddress)addressList.get(0);
162 
163         // validation occurs on an address that's already been split into personal and address
164         // data.
165         if (address.personal != null) {
166             throw new AddressException("Illegal Address", addresses, 0);
167         }
168         // This token must be a stream stream terminator, or we have an error.
169         AddressToken token = tokens.nextToken();
170         if (token.type != END_OF_TOKENS) {
171             illegalAddress("Illegal Address", token);
172         }
173     }
174 
175 
176     /**
177      * Extract the set of address from a group Internet specification.
178      *
179      * @return An array containing all of the non-null addresses in the list.
180      * @exception AddressException
181      */
182     public InternetAddress[] extractGroupList() throws AddressException
183     {
184         // get the address as a set of tokens we can process.
185         TokenStream tokens = tokenizeAddress();
186 
187         // get an array list accumulator.
188         ArrayList addresses = new ArrayList();
189 
190         AddressToken token = tokens.nextToken();
191 
192         // scan forward to the ':' that starts the group list.  If we don't find one,
193         // this is an exception.
194         while (token.type != COLON) {
195             if (token.type == END_OF_TOKENS) {
196                 illegalAddress("Missing ':'", token);
197             }
198             token = tokens.nextToken();
199         }
200 
201         // we process sections of the token stream until we run out of tokens.
202         while (true) {
203             // parse off a single address.  Address lists can have null elements,
204             // so this might return a null value.  The null value does not get added
205             // to the address accumulator.
206             addresses.addAll(parseSingleAddress(tokens, true));
207             // This token should be either a "," delimiter or a group terminator.  If we're
208             // at the end, this is an error.
209             token = tokens.nextToken();
210             if (token.type == SEMICOLON) {
211                 break;
212             }
213             else if (token.type == END_OF_TOKENS) {
214                 illegalAddress("Missing ';'", token);
215             }
216         }
217 
218         return (InternetAddress [])addresses.toArray(new InternetAddress[0]);
219     }
220 
221 
222     /**
223      * Parse out a single list element from a stream
224      * of address tokens, returning the InternetAddress object(s)
225      * that represent it.
226      *
227      * @param tokens The token source for this address.
228      * @param inGroup true when parsing a member of a group list, where a ';' also acts as a terminator.
229      * @return A list of the InternetAddress objects built for the next
230      *         address:  empty if this is an "empty" address in a list, and
231      *         possibly several entries for a NONSTRICT blank delimited address.
232      * @exception AddressException
233      */
234     private List parseSingleAddress(TokenStream tokens, boolean inGroup) throws AddressException
235     {
236         List parsedAddresses = new ArrayList();
237 
238         // index markers for personal information
239         AddressToken personalStart = null;
240         AddressToken personalEnd = null;
241 
242         // and similar bits for the address information.
243         AddressToken addressStart = null;
244         AddressToken addressEnd = null;
245 
246         // there is a fall-back set of rules allowed that will parse the address as a set of blank delimited
247         // tokens.  However, we do NOT allow this if we encounter any tokens that fall outside of these
248         // rules.  For example, comment fields and quoted strings will disallow the very lenient rule set.
249         boolean nonStrictRules = true;
250 
251         // we don't know the type of address yet
252         int addressType = UNKNOWN;
253 
254         // the parsing goes in two stages.  Stage one runs through the tokens locating the bounds
255         // of the address we're working on, resolving the personal information, and also validating
256         // some of the larger scale syntax features of an address (matched delimiters for routes and
257         // groups, invalid nesting checks, etc.).
258 
259         // get the next token from the queue and save this.  We're going to scan ahead a bit to
260         // figure out what type of address we're looking at, then reset to do the actual parsing
261         // once we've figured out a form.
262         AddressToken first = tokens.nextToken();
263         // push it back on before starting processing.
264         tokens.pushToken(first);
265 
266         // scan ahead for a trigger token that tells us what we've got.
267         while (addressType == UNKNOWN) {
268 
269             AddressToken token = tokens.nextToken();
270             switch (token.type) {
271                 // skip these for now...after we've processed everything and found that this is a simple
272                 // address form, then we'll check for a leading comment token in the first position and use
273                 // it as personal information.
274                 case COMMENT:
275                     // comments do, however, denote that this must be parsed according to RFC822 rules.
276                     nonStrictRules = false;
277                     break;
278 
279                 // a semi-colon when processing a group is an address terminator.  we need to
280                 // process this like a comma then
281                 case SEMICOLON:
282                     if (inGroup) {
283                         // we need to push the terminator back on for the caller to see.
284                         tokens.pushToken(token);
285                         // if we've not tagged any tokens as being the address beginning, then this must be a
286                         // null address.
287                         if (addressStart == null) {
288                             // just return the empty list from this.
289                             return parsedAddresses;
290                         }
291                         // the end token is the back part.
292                         addressEnd = tokens.previousToken(token);
293                         // without a '<' for a route addr, we can't distinguish address tokens from personal data.
294                         // We'll use a leading comment, if there is one.
295                         personalStart = null;
296                         // this is just a simple form.
297                         addressType = SIMPLE_ADDR;
298                         break;
299                     }
300 
301                 // NOTE:  The above falls through if this is not a group.
302 
303                 // any of these tokens are a real token that can be the start of an address.  Many of
304                 // them are not valid as first tokens in this context, but we flag them later if validation
305                 // has been requested.  For now, we just mark these as the potential address start.
306                 case DOMAIN_LITERAL:
307                 case QUOTED_LITERAL:
308                     // this set of tokens require fuller RFC822 parsing, so turn off the flag.
309                     nonStrictRules = false;
310                 // NOTE:  deliberately falls through to mark the potential address start.
311                 case ATOM:
312                 case AT_SIGN:
313                 case PERIOD:
314                     // if we've not determined the start of the address yet, then check to see if we
315                     // need to consider this the personal start.
316                     if (addressStart == null) {
317                         if (personalStart == null) {
318                             personalStart = token;
319                         }
320                         // This is the first real token of the address, which at this point can
321                         // be either the personal info or the first token of the address.  If we hit
322                         // an address terminator without encountering either a route trigger or group
323                         // trigger, then this is the real address.
324                         addressStart = token;
325                     }
326                     break;
327 
328                 // a LEFT_ANGLE indicates we have a full RFC822 mailbox form.  The leading phrase
329                 // is the personal info.  The address is inside the brackets.
330                 case LEFT_ANGLE:
331                     // a route address automatically switches off the blank-delimited token mode.
332                     nonStrictRules = false;
333                     // this is a route address
334                     addressType = ROUTE_ADDR;
335                     // the address is placed in the InternetAddress object without the route
336                     // brackets, so our start is one past this.
337                     addressStart = tokens.nextRealToken();
338                     // push this back on the queue so the scanner picks it up properly.
339                     tokens.pushToken(addressStart);
340                     // make sure we flag the end of the personal section too.
341                     if (personalStart != null) {
342                         personalEnd = tokens.previousToken(token);
343                     }
344                     // scan the rest of a route address.  The group flag is passed along so that a ';' is accepted after the '>' of a group's last member.
345                     addressEnd = scanRouteAddress(tokens, inGroup);
346                     break;
347 
348                 // a COLON indicates this is a group specifier...parse the group.
349                 case COLON:
350                     // Colons would not be valid in simple lists, so turn it off.
351                     nonStrictRules = false;
352                     // if we're scanning a group, we shouldn't encounter a ":".  This is a
353                     // recursion error if found.
354                     if (inGroup) {
355                         illegalAddress("Nested group element", token);
356                     }
357                     addressType = GROUP_ADDR;
358                     // groups don't have any personal sections.
359                     personalStart = null;
360                     // our real start was back at the beginning
361                     addressStart = first;
362                     addressEnd = scanGroupAddress(tokens);
363                     break;
364 
365                 // (a semi-colon that terminates a group member is handled by the SEMICOLON case above.)
366 
367 
368                 // reached the end of string...this might be a null address, or one of the very simple name
369                 // forms used for non-strict RFC822 versions.  Reset, and try that form
370                 case END_OF_TOKENS:
371                     // if we're scanning a group, we shouldn't encounter an end token.  This is an
372                     // error if found.
373                     if (inGroup) {
374                         illegalAddress("Missing ';'", token);
375                     }
376 
377                     // NOTE:  this falls through to the terminator handling below.
378 
379                 // this is either a terminator for an address list or a group terminator.
380                 case COMMA:
381                     // we need to push the terminator back on for the caller to see.
382                     tokens.pushToken(token);
383                     // if we've not tagged any tokens as being the address beginning, then this must be a
384                     // null address.
385                     if (addressStart == null) {
386                         // just return the empty list from this.
387                         return parsedAddresses;
388                     }
389                     // the end token is the back part.
390                     addressEnd = tokens.previousToken(token);
391                     // without a '<' for a route addr, we can't distinguish address tokens from personal data.
392                     // We'll use a leading comment, if there is one.
393                     personalStart = null;
394                     // this is just a simple form.
395                     addressType = SIMPLE_ADDR;
396                     break;
397 
398                 // right angle tokens are pushed, because parsing of the bracketing is not necessarily simple.
399                 // we need to flag these here.
400                 case RIGHT_ANGLE:
401                     illegalAddress("Unexpected '>'", token);
402 
403             }
404         }
405 
406         String personal = null;
407 
408         // if we have personal data, then convert it to a string value.
409         if (personalStart != null) {
410             TokenStream personalTokens = tokens.section(personalStart, personalEnd);
411             personal = personalToString(personalTokens);
412         }
413         // if we have a simple address, then check the first token to see if it's a comment.  For simple addresses,
414         // we'll accept the first comment token as the personal information.
415         else {
416             if (addressType == SIMPLE_ADDR && first.type == COMMENT) {
417                 personal = first.value;
418             }
419         }
420 
421         TokenStream addressTokens = tokens.section(addressStart, addressEnd);
422 
423         // validate the address tokens using the validator that matches the form we found.  The one
424         // exception is PARSE_HEADER mode, where none of the validators are called and the tokens
425         // are accepted as scanned.
426         if (validationLevel != PARSE_HEADER) {
427             switch (addressType) {
428                 case GROUP_ADDR:
429                     validateGroup(addressTokens);
430                     break;
431 
432                 case ROUTE_ADDR:
433                     validateRouteAddr(addressTokens, false);
434                     break;
435 
436                 case SIMPLE_ADDR:
437                     // NOTE(review): described as a conditional validation; presumably the condition lives inside validateSimpleAddress().
438                     validateSimpleAddress(addressTokens);
439                     break;
440             }
441         }
442 
443         // more complex addresses and addresses containing tokens other than just simple addresses
444         // need proper handling.
445         if (validationLevel != NONSTRICT || addressType != SIMPLE_ADDR || !nonStrictRules) {
446             // we might have traversed this already when we validated, so reset the
447             // position before using this again.
448             addressTokens.reset();
449             String address = addressToString(addressTokens);
450 
451             // get the parsed out sections as string values.
452             InternetAddress result = new InternetAddress();
453             result.setAddress(address);
454             try {
455                 result.setPersonal(personal);
456             } catch (UnsupportedEncodingException e) {    // NOTE(review): deliberately ignored; the address is still returned.  Confirm this best-effort is intended.
457             }
458             // even though we have a single address, we return this as a list.  Simple addresses
459             // can produce a list of items, so we need to return everything.
460             parsedAddresses.add(result);
461             return parsedAddresses;
462         }
463         else {
464             addressTokens.reset();
465 
466             TokenStream nextAddress = addressTokens.getBlankDelimitedToken();
467             while (nextAddress != null) {
468                 String address = addressToString(nextAddress);
469                 // get the parsed out sections as string values.
470                 InternetAddress result = new InternetAddress();
471                 result.setAddress(address);
472                 parsedAddresses.add(result);
473                 nextAddress = addressTokens.getBlankDelimitedToken();
474             }
475             return parsedAddresses;
476         }
477     }
478 
479 
480     /**
481      * Scan the token stream, parsing off a route addr spec.  This
482      * will do some basic syntax validation, but will not actually
483      * validate any of the address information.  Comments will be
484      * discarded.
485      *
486      * @param tokens The stream of tokens, read from just after the opening '<'.
487      * @param inGroup true if the address is a group member, which allows ';' as well as ',' after the '>'.
488      * @return The last token of the route address (the one preceding the
489      *         terminating '>').
490      */
491     private AddressToken scanRouteAddress(TokenStream tokens, boolean inGroup) throws AddressException {
492         // get the first token and ensure we have something between the "<" and ">".
493         AddressToken token = tokens.nextRealToken();
494         // the last processed non-whitespace token, which is the actual address end once the
495         // right angle bracket is encountered.
496 
497         AddressToken previous = null;
498 
499         // if this route-addr has route information, the first token after the '<' must be a '@'.
500         // this determines if/where a colon or comma can appear.
501         boolean inRoute = token.type == AT_SIGN;
502 
503         // now scan until we reach the terminator.  The only validation is done on illegal characters.
504         while (true) {
505             switch (token.type) {
506                 // The following tokens are all valid between the brackets, so just skip over them.
507                 case ATOM:
508                 case QUOTED_LITERAL:
509                 case DOMAIN_LITERAL:
510                 case PERIOD:
511                 case AT_SIGN:
512                     break;
513 
514                 case COLON:
515                     // if not processing route information, this is illegal.
516                     if (!inRoute) {
517                         illegalAddress("Unexpected ':'", token);
518                     }
519                     // this is the end of the route information, the rules now change.
520                     inRoute = false;
521                     break;
522 
523                 case COMMA:
524                     // if not processing route information, this is illegal.
525                     if (!inRoute) {
526                         illegalAddress("Unexpected ','", token);
527                     }
528                     break;
529 
530                 case RIGHT_ANGLE:
531                     // if previous is null, we've had a route address which is "<>".  That's illegal.
532                     if (previous == null) {
533                         illegalAddress("Illegal address", token);
534                     }
535                     // step to the next token..this had better be either a comma for another address or
536                     // the very end of the address list.
537                     token = tokens.nextRealToken();
538                     // if we're scanning part of a group, then the allowed terminators are either ',' or ';'.
539                     if (inGroup) {
540                         if (token.type != COMMA && token.type != SEMICOLON) {
541                             illegalAddress("Illegal address", token);
542                         }
543                     }
544                     // a normal address should have either a ',' for a list or the end.
545                     else {
546                         if (token.type != COMMA && token.type != END_OF_TOKENS) {
547                             illegalAddress("Illegal address", token);
548                         }
549                     }
550                     // we need to push the termination token back on so the caller sees it.
551                     tokens.pushToken(token);
552                     // return the previous token as the updated position.
553                     return previous;
554 
555                 case END_OF_TOKENS:
556                     illegalAddress("Missing '>'", token);
557                 // NOTE(review): no break here or below; this relies on illegalAddress() always throwing -- confirm.
558                 // now for the illegal ones in this context.
559                 case SEMICOLON:
560                     illegalAddress("Unexpected ';'", token);
561 
562                 case LEFT_ANGLE:
563                     illegalAddress("Unexpected '<'", token);
564             }
565             // remember the previous token.
566             previous = token;
567             token = tokens.nextRealToken();
568         }
569     }
570 
571 
572     /**
573      * Scan the token stream, parsing off a group address.  This
574      * will do some basic syntax validation, but will not actually
575      * validate any of the address information.  Comments will be
576      * ignored.
577      *
578      * @param tokens The stream of tokens, read from just after the group's ':'.
579      * @return The last token of the group address (the terminating ';').
580      * @exception AddressException for a nested group, a missing ';', a stray '>' or illegal trailing tokens.
581      */
582     private AddressToken scanGroupAddress(TokenStream tokens) throws AddressException {
583         // A group does not require that there be anything between the ':' and ';'.  This is
584         // just a group with an empty list.
585         AddressToken token = tokens.nextRealToken();
586 
587         // now scan until we reach the terminator.  The only validation is done on illegal characters.
588         while (true) {
589             switch (token.type) {
590                 // The following tokens are all valid in group addresses, so just skip over them.
591                 case ATOM:
592                 case QUOTED_LITERAL:
593                 case DOMAIN_LITERAL:
594                 case PERIOD:
595                 case AT_SIGN:
596                 case COMMA:
597                     break;
598 
599                 case COLON:
600                     illegalAddress("Nested group", token);
601 
602                 // route address within a group specifier....we need to at least verify the bracket nesting
603                 // and higher level syntax of the route.
604                 case LEFT_ANGLE:
605                     scanRouteAddress(tokens, true);
606                     break;
607 
608                 // the only allowed terminator is the ';'
609                 case END_OF_TOKENS:
610                     illegalAddress("Missing ';'", token);
611 
612                 // the ';' closes the group.
613                 case SEMICOLON:
614                     // verify there's nothing illegal after this.
615                     AddressToken next = tokens.nextRealToken();
616                     if (next.type != COMMA && next.type != END_OF_TOKENS) {
617                         illegalAddress("Illegal address", next);    // report the offending token, not the ';' before it.
618                     }
619                     // don't forget to put this back on...our caller will need it.
620                     tokens.pushToken(next);
621                     return token;
622 
623                 case RIGHT_ANGLE:
624                     illegalAddress("Unexpected '>'", token);
625             }
626             token = tokens.nextRealToken();
627         }
628     }
629 
630 
631     /**
632      * Parse the provided internet address into a set of tokens.  This
633      * phase only does a syntax check on the tokens.  The interpretation
634      * of the tokens is the next phase.
635      *
636      * @exception AddressException
637      */
638     private TokenStream tokenizeAddress() throws AddressException {
639 
640         // get a list for the set of tokens
641         TokenStream tokens = new TokenStream();
642 
643         end = addresses.length();    // our parsing end marker
644 
645         // now scan along the string looking for the special characters in an internet address.
646         while (moreCharacters()) {
647             char ch = currentChar();
648 
649             switch (ch) {
650                 // start of a comment bit...ignore everything until we hit a closing paren.
651                 case '(':
652                     scanComment(tokens);
653                     break;
654                 // a closing paren found outside of normal processing.
655                 case ')':
656                     syntaxError("Unexpected ')'", position);
657 
658 
659                 // start of a quoted string
660                 case '"':
661                     scanQuotedLiteral(tokens);
662                     break;
663                 // domain literal
664                 case '[':
665                     scanDomainLiteral(tokens);
666                     break;
667 
668                 // a naked closing bracket...not valid except as part of a domain literal.
669                 case ']':
670                     syntaxError("Unexpected ']'", position);
671 
672                 // special character delimiters
673                 case '<':
674                     tokens.addToken(new AddressToken(LEFT_ANGLE, position));
675                     nextChar();
676                     break;
677 
678                 // a naked closing bracket...not valid without a starting one, but
679                 // we need to handle this in context.
680                 case '>':
681                     tokens.addToken(new AddressToken(RIGHT_ANGLE, position));
682                     nextChar();
683                     break;
684                 case ':':
685                     tokens.addToken(new AddressToken(COLON, position));
686                     nextChar();
687                     break;
688                 case ',':
689                     tokens.addToken(new AddressToken(COMMA, position));
690                     nextChar();
691                     break;
692                 case '.':
693                     tokens.addToken(new AddressToken(PERIOD, position));
694                     nextChar();
695                     break;
696                 case ';':
697                     tokens.addToken(new AddressToken(SEMICOLON, position));
698                     nextChar();
699                     break;
700                 case '@':
701                     tokens.addToken(new AddressToken(AT_SIGN, position));
702                     nextChar();
703                     break;
704 
705                 // white space characters.  These are mostly token delimiters, but there are some relaxed
706                 // situations where they get processed, so we need to add a white space token for the first
707                 // one we encounter in a span.
708                 case ' ':
709                 case '\t':
710                 case '\r':
711                 case '\n':
712                     // add a single white space token
713                     tokens.addToken(new AddressToken(WHITESPACE, position));
714 
715                     nextChar();
716                     // step over any space characters, leaving us positioned either at the end
717                     // or the first
718                     while (moreCharacters()) {
719                         char nextChar = currentChar();
720                         if (nextChar == ' ' || nextChar == '\t' || nextChar == '\r' || nextChar == '\n') {
721                             nextChar();
722                         }
723                         else {
724                             break;
725                         }
726                     }
727                     break;
728 
729                 // potentially an atom...if it starts with an allowed atom character, we
730                 // parse out the token, otherwise this is invalid.
731                 default:
732                     if (ch < 040 || ch >= 0177) {
733                         syntaxError("Illegal character in address", position);
734                     }
735 
736                     scanAtom(tokens);
737                     break;
738             }
739         }
740 
741         // for this end marker, give an end position.
742         tokens.addToken(new AddressToken(END_OF_TOKENS, addresses.length()));
743         return tokens;
744     }
745 
746 
747     /**
748      * Step to the next character position while parsing.
749      */
750     private void nextChar() {
751         position++;
752     }
753 
754 
755     /**
756      * Retrieve the character at the current parsing position.
757      *
758      * @return The current character.
759      */
760     private char currentChar() {
761         return addresses.charAt(position);
762     }
763 
764     /**
765      * Test if there are more characters left to parse.
766      *
767      * @return True if there are still characters left to parse, false if we've reached the end.
768      */
769     private boolean moreCharacters() {
770         return position < end;
771     }
772 
773 
774     /**
775      * Parse a quoted string as specified by the RFC822 specification.
776      *
777      * @param tokens The TokenStream where the parsed out token is added.
778      */
779     private void scanQuotedLiteral(TokenStream tokens) throws AddressException {
780         StringBuffer value = new StringBuffer();
781 
782         // save the start position for the token.
783         int startPosition = position;
784         // step over the quote delimiter.
785         nextChar();
786 
787         while (moreCharacters()) {
788             char ch = currentChar();
789 
790             // is this an escape char?
791             if (ch == '\\') {
792                 // step past this, and grab the following character
793                 nextChar();
794                 if (!moreCharacters()) {
795                     syntaxError("Missing '\"'", position);
796                 }
797                 value.append(currentChar());
798             }
799             // end of the string?
800             else if (ch == '"') {
801                 // return the constructed string.
802                 tokens.addToken(new AddressToken(value.toString(), QUOTED_LITERAL, position));
803                 // step over the close delimiter for the benefit of the next token.
804                 nextChar();
805                 return;
806             }
807             // the RFC822 spec disallows CR characters.
808             else if (ch == '\r') {
809                 syntaxError("Illegal line end in literal", position);
810             }
811             else
812             {
813                 value.append(ch);
814             }
815             nextChar();
816         }
817         // missing delimiter
818         syntaxError("Missing '\"'", position);
819     }
820 
821 
822     /**
823      * Parse a domain literal as specified by the RFC822 specification.
824      *
825      * @param tokens The TokenStream where the parsed out token is added.
826      */
827     private void scanDomainLiteral(TokenStream tokens) throws AddressException {
828         StringBuffer value = new StringBuffer();
829 
830         int startPosition = position;
831         // step over the quote delimiter.
832         nextChar();
833 
834         while (moreCharacters()) {
835             char ch = currentChar();
836 
837             // is this an escape char?
838             if (ch == '\\') {
839                 // because domain literals don't get extra escaping, we render them
840                 // with the escaped characters intact.  Therefore, append the '\' escape
841                 // first, then append the escaped character without examination.
842                 value.append(currentChar());
843                 // step past this, and grab the following character
844                 nextChar();
845                 if (!moreCharacters()) {
846                     syntaxError("Missing '\"'", position);
847                 }
848                 value.append(currentChar());
849             }
850             // end of the string?
851             else if (ch == ']') {
852                 // return the constructed string.
853                 tokens.addToken(new AddressToken(value.toString(), DOMAIN_LITERAL, startPosition));
854                 // step over the close delimiter for the benefit of the next token.
855                 nextChar();
856                 return;
857             }
858             // the RFC822 spec says no nesting
859             else if (ch == '[') {
860                 syntaxError("Unexpected '['", position);
861             }
862             // carriage returns are similarly illegal.
863             else if (ch == '\r') {
864                 syntaxError("Illegal line end in domain literal", position);
865             }
866             else
867             {
868                 value.append(ch);
869             }
870             nextChar();
871         }
872         // missing delimiter
873         syntaxError("Missing ']'", position);
874     }
875 
876     /**
877      * Scan an atom in an internet address, using the RFC822 rules
878      * for atom delimiters.
879      *
880      * @param tokens The TokenStream where the parsed out token is added.
881      */
882     private void scanAtom(TokenStream tokens) throws AddressException {
883         int start = position;
884         nextChar();
885         while (moreCharacters()) {
886 
887             char ch = currentChar();
888             if (isAtom(ch)) {
889                 nextChar();
890             }
891             else {
892                 break;
893             }
894         }
895 
896         // return the scanned part of the string.
897         tokens.addToken(new AddressToken(addresses.substring(start, position), ATOM, start));
898     }
899 
900 
901     /**
902      * Parse an internet address comment field as specified by
903      * RFC822.  Includes support for quoted characters and nesting.
904      *
905      * @param tokens The TokenStream where the parsed out token is added.
906      */
907     private void scanComment(TokenStream tokens) throws AddressException {
908         StringBuffer value = new StringBuffer();
909 
910         int startPosition = position;
911         // step past the start character
912         nextChar();
913 
914         // we're at the top nesting level on the comment.
915         int nest = 1;
916 
917         // scan while we have more characters.
918         while (moreCharacters()) {
919             char ch = currentChar();
920             // escape character?
921             if (ch == '\\') {
922                 // step over this...if escaped, we must have at least one more character
923                 // in the string.
924                 nextChar();
925                 if (!moreCharacters()) {
926                     syntaxError("Missing ')'", position);
927                 }
928                 value.append(currentChar());
929             }
930             // nested comment?
931             else if (ch == '(') {
932                 // step the nesting level...we treat the comment as a single unit, with the delimiters
933                 // for the nested comments embedded in the middle
934                 nest++;
935                 value.append(ch);
936             }
937             // is this the comment close?
938             else if (ch == ')') {
939                 // reduce the nesting level.  If we still have more to process, add the delimiter character
940                 // and keep going.
941                 nest--;
942                 if (nest > 0) {
943                     value.append(ch);
944                 }
945                 else {
946                     // step past this and return.  The outermost comment delimiter is not included in
947                     // the string value, since this is frequently used as personal data on the
948                     // InternetAddress objects.
949                     nextChar();
950                     tokens.addToken(new AddressToken(value.toString(), COMMENT, startPosition));
951                     return;
952                 }
953             }
954             else if (ch == '\r') {
955                 syntaxError("Illegal line end in comment", position);
956             }
957             else {
958                 value.append(ch);
959             }
960             // step to the next character.
961             nextChar();
962         }
963         // ran out of data before seeing the closing bit, not good
964         syntaxError("Missing ')'", position);
965     }
966 
967 
968     /**
969      * Validate the syntax of an RFC822 group internet address specification.
970      *
971      * @param tokens The stream of tokens for the address.
972      *
973      * @exception AddressException
974      */
975     private void validateGroup(TokenStream tokens) throws AddressException {
976         // we know already this is an address in the form "phrase:group;".  Now we need to validate the
977         // elements.
978 
979         int phraseCount = 0;
980 
981         AddressToken token = tokens.nextRealToken();
982         // now scan to the semi color, ensuring we have only word or comment tokens.
983         while (token.type != COLON) {
984             // only these tokens are allowed here.
985             if (token.type != ATOM && token.type != QUOTED_LITERAL) {
986                 invalidToken(token);
987             }
988             phraseCount++;
989             token = tokens.nextRealToken();
990         }
991 
992 
993         // RFC822 groups require a leading phrase in group specifiers.
994         if (phraseCount == 0) {
995             illegalAddress("Missing group identifier phrase", token);
996         }
997 
998         // now we do the remainder of the parsing using the initial phrase list as the sink...the entire
999         // address will be converted to a string later.
1000 
1001         // ok, we only know this has been valid up to the ":", now we have some real checks to perform.
1002         while (true) {
1003             // go scan off a mailbox.  if everything goes according to plan, we should be positioned at either
1004             // a comma or a semicolon.
1005             validateGroupMailbox(tokens);
1006 
1007             token = tokens.nextRealToken();
1008 
1009             // we're at the end of the group.  Make sure this is truely the end.
1010             if (token.type == SEMICOLON) {
1011                 token = tokens.nextRealToken();
1012                 if (token.type != END_OF_TOKENS) {
1013                     illegalAddress("Illegal group address", token);
1014                 }
1015                 return;
1016             }
1017 
1018             // if not a semicolon, this better be a comma.
1019             else if (token.type != COMMA) {
1020                 illegalAddress("Illegal group address", token);
1021             }
1022         }
1023     }
1024 
1025 
1026     /**
1027      * Validate the syntax of single mailbox within a group address.
1028      *
1029      * @param tokens The stream of tokens representing the address.
1030      *
1031      * @exception AddressException
1032      */
1033     private void validateGroupMailbox(TokenStream tokens) throws AddressException {
1034         AddressToken first = tokens.nextRealToken();
1035         // is this just a null address in the list?  then push the terminator back and return.
1036         if (first.type == COMMA || first.type == SEMICOLON) {
1037             tokens.pushToken(first);
1038             return;
1039         }
1040 
1041         // now we need to scan ahead to see if we can determine the type.
1042         AddressToken token = first;
1043 
1044 
1045         // we need to scan forward to figure out what sort of address this is.
1046         while (first != null) {
1047             switch (token.type) {
1048                 // until we know the context, these are all just ignored.
1049                 case QUOTED_LITERAL:
1050                 case ATOM:
1051                     break;
1052 
1053                 // a LEFT_ANGLE indicates we have a full RFC822 mailbox form.  The leading phrase
1054                 // is the personal info.  The address is inside the brackets.
1055                 case LEFT_ANGLE:
1056                     tokens.pushToken(first);
1057                     validatePhrase(tokens, false);
1058                     validateRouteAddr(tokens, true);
1059                     return;
1060 
1061                 // we've hit a period as the first non-word token.  This should be part of a local-part
1062                 // of an address.
1063                 case PERIOD:
1064                 // we've hit an "@" as the first non-word token.  This is probably a simple address in
1065                 // the form "user@domain".
1066                 case AT_SIGN:
1067                     tokens.pushToken(first);
1068                     validateAddressSpec(tokens);
1069                     return;
1070 
1071                 // reached the end of string...this might be a null address, or one of the very simple name
1072                 // forms used for non-strict RFC822 versions.  Reset, and try that form
1073                 case COMMA:
1074                 // this is the end of the group...handle it like a comma for now.
1075                 case SEMICOLON:
1076                     tokens.pushToken(first);
1077                     validateAddressSpec(tokens);
1078                     return;
1079 
1080                 case END_OF_TOKENS:
1081                     illegalAddress("Missing ';'", token);
1082 
1083             }
1084             token = tokens.nextRealToken();
1085         }
1086     }
1087 
1088 
1089     /**
1090      * Utility method for throwing an AddressException caused by an
1091      * unexpected primitive token.
1092      *
1093      * @param token  The token causing the problem (must not be a value type token).
1094      *
1095      * @exception AddressException
1096      */
1097     private void invalidToken(AddressToken token) throws AddressException {
1098         illegalAddress("Unexpected '" + token.type + "'", token);
1099     }
1100 
1101 
1102     /**
1103      * Raise an error about illegal syntax.
1104      *
1105      * @param message  The message used in the thrown exception.
1106      * @param position The parsing position within the string.
1107      *
1108      * @exception AddressException
1109      */
1110     private void syntaxError(String message, int position) throws AddressException
1111     {
1112         throw new AddressException(message, addresses, position);
1113     }
1114 
1115 
1116     /**
1117      * Throw an exception based on the position of an invalid token.
1118      *
1119      * @param message The exception message.
1120      * @param token   The token causing the error.  This tokens position is used
1121      *                in the exception information.
1122      */
1123     private void illegalAddress(String message, AddressToken token) throws AddressException {
1124         throw new AddressException(message, addresses, token.position);
1125     }
1126 
1127 
1128     /**
1129      * Validate that a required phrase exists.
1130      *
1131      * @param tokens   The set of tokens to validate. positioned at the phrase start.
1132      * @param required A flag indicating whether the phrase is optional or required.
1133      *
1134      * @exception AddressException
1135      */
1136     private void validatePhrase(TokenStream tokens, boolean required) throws AddressException {
1137         // we need to have at least one WORD token in the phrase...everything is optional
1138         // after that.
1139         AddressToken token = tokens.nextRealToken();
1140         if (token.type != ATOM && token.type != QUOTED_LITERAL) {
1141             if (required) {
1142                 illegalAddress("Missing group phrase", token);
1143             }
1144         }
1145 
1146         // now scan forward to the end of the phrase
1147         token = tokens.nextRealToken();
1148         while (token.type == ATOM || token.type == QUOTED_LITERAL) {
1149             token = tokens.nextRealToken();
1150         }
1151     }
1152 
1153 
1154     /**
1155      * validate a routeaddr specification
1156      *
1157      * @param tokens  The tokens representing the address portion (personal information
1158      *                already removed).
1159      * @param ingroup true indicates we're validating a route address inside a
1160      *                group list.  false indicates we're validating a standalone
1161      *                address.
1162      *
1163      * @exception AddressException
1164      */
1165     private void validateRouteAddr(TokenStream tokens, boolean ingroup) throws AddressException {
1166         // get the next real token.
1167         AddressToken token = tokens.nextRealToken();
1168         // if this is an at sign, then we have a list of domains to parse.
1169         if (token.type == AT_SIGN) {
1170             // push the marker token back in for the route parser, and step past that part.
1171             tokens.pushToken(token);
1172             validateRoute(tokens);
1173         }
1174         else {
1175             // we need to push this back on to validate the local part.
1176             tokens.pushToken(token);
1177         }
1178 
1179         // now we expect to see an address spec.
1180         validateAddressSpec(tokens);
1181 
1182         token = tokens.nextRealToken();
1183         if (ingroup) {
1184             // if we're validating within a group specification, the angle brackets are still there (and
1185             // required).
1186             if (token.type != RIGHT_ANGLE) {
1187                 illegalAddress("Missing '>'", token);
1188             }
1189         }
1190         else {
1191             // the angle brackets were removed to make this an address, so we should be done.  Make sure we
1192             // have a terminator here.
1193             if (token.type != END_OF_TOKENS) {
1194                 illegalAddress("Illegal Address", token);
1195             }
1196         }
1197     }
1198 
1199 
1200 
1201     /**
1202      * Validate a simple address in the form "user@domain".
1203      *
1204      * @param tokens The stream of tokens representing the address.
1205      */
1206     private void validateSimpleAddress(TokenStream tokens) throws AddressException {
1207 
1208         // the validation routines occur after addresses have been split into
1209         // personal and address forms.  Therefore, our validation begins directly
1210         // with the first token.
1211         validateAddressSpec(tokens);
1212 
1213         // get the next token and see if there is something here...anything but the terminator is an error
1214         AddressToken token = tokens.nextRealToken();
1215         if (token.type != END_OF_TOKENS) {
1216             illegalAddress("Illegal Address", token);
1217         }
1218     }
1219 
1220     /**
1221      * Validate the addr-spec portion of an address.  RFC822 requires
1222      * this be of the form "local-part@domain".  However, javamail also
1223      * allows simple address of the form "local-part".  We only require
1224      * the domain if an '@' is encountered.
1225      *
1226      * @param tokens The stream of tokens representing the address.
1227      */
1228     private void validateAddressSpec(TokenStream tokens) throws AddressException {
1229         // all addresses, even the simple ones, must have at least a local part.
1230         validateLocalPart(tokens);
1231 
1232         // now see if we have a domain portion to look at.
1233         AddressToken token = tokens.nextRealToken();
1234         if (token.type == AT_SIGN) {
1235             validateDomain(tokens);
1236         }
1237         else {
1238             // put this back for termination
1239             tokens.pushToken(token);
1240         }
1241 
1242     }
1243 
1244 
1245     /**
1246      * Validate the route portion of a route-addr.  This is a list
1247      * of domain values in the form 1#("@" domain) ":".
1248      *
1249      * @param tokens The token stream holding the address information.
1250      */
1251     private void validateRoute(TokenStream tokens) throws AddressException {
1252         while (true) {
1253             AddressToken token = tokens.nextRealToken();
1254             // if this is the first part of the list, go parse off a domain
1255             if (token.type == AT_SIGN) {
1256                 validateDomain(tokens);
1257             }
1258             // another element in the list?  Go around again
1259             else if (token.type == COMMA) {
1260                 continue;
1261             }
1262             // the list is terminated by a colon...stop this part of the validation once we hit one.
1263             else if (token.type == COLON) {
1264                 return;
1265             }
1266             // the list is terminated by a colon.  If this isn't one of those, we have an error.
1267             else {
1268                 illegalAddress("Missing ':'", token);
1269             }
1270         }
1271     }
1272 
1273 
1274     /**
1275      * Parse the local part of an address spec.  The local part
1276      * is a series of "words" separated by ".".
1277      */
1278     private void validateLocalPart(TokenStream tokens) throws AddressException {
1279         while (true) {
1280             // get the token.
1281             AddressToken token = tokens.nextRealToken();
1282 
1283             // this must be either an atom or a literal.
1284             if (token.type != ATOM && token.type != QUOTED_LITERAL) {
1285                 illegalAddress("Invalid local part", token);
1286             }
1287 
1288             // get the next token (white space and comments ignored)
1289             token = tokens.nextRealToken();
1290             // if this is a period, we continue parsing
1291             if (token.type != PERIOD) {
1292                 tokens.pushToken(token);
1293                 // return the token
1294                 return;
1295             }
1296         }
1297     }
1298 
1299 
1300 
1301     /**
1302      * Parse a domain name of the form sub-domain *("." sub-domain).
1303      * a sub-domain is either an atom or a domain-literal.
1304      */
1305     private void validateDomain(TokenStream tokens) throws AddressException {
1306         while (true) {
1307             // get the token.
1308             AddressToken token = tokens.nextRealToken();
1309 
1310             // this must be either an atom or a domain literal.
1311             if (token.type != ATOM && token.type != DOMAIN_LITERAL) {
1312                 illegalAddress("Invalid domain", token);
1313             }
1314 
1315             // get the next token (white space is ignored)
1316             token = tokens.nextRealToken();
1317             // if this is a period, we continue parsing
1318             if (token.type != PERIOD) {
1319                 // return the token
1320                 tokens.pushToken(token);
1321                 return;
1322             }
1323         }
1324     }
1325 
1326     /**
1327      * Convert a list of word tokens into a phrase string.  The
1328      * rules for this are a little hard to puzzle out, but there
1329      * is a logic to it.  If the list is empty, the phrase is
1330      * just a null value.
1331      *
1332      * If we have a phrase, then the quoted strings need to
1333      * handled appropriately.  In multi-token phrases, the
1334      * quoted literals are concatenated with the quotes intact,
1335      * regardless of content.  Thus a phrase that comes in like this:
1336      *
1337      * "Geronimo" Apache
1338      *
1339      * gets converted back to the same string.
1340      *
1341      * If there is just a single token in the phrase, AND the token
1342      * is a quoted string AND the string does not contain embedded
1343      * special characters ("\.,@<>()[]:;), then the phrase
1344      * is expressed as an atom.  Thus the literal
1345      *
1346      *    "Geronimo"
1347      *
1348      * becomes
1349      *
1350      *    Geronimo
1351      *
1352      * but
1353      *
1354      *    "(Geronimo)"
1355      *
1356      * remains
1357      *
1358      *    "(Geronimo)"
1359      *
1360      * Note that we're generating a canonical form of the phrase,
1361      * which removes comments and reduces linear whitespace down
1362      * to a single separator token.
1363      *
1364      * @param tokens The stream of phrase tokens (which may be empty).
1365      */
1366     private String personalToString(TokenStream tokens) {
1367 
1368         // no tokens in the stream?  This is a null value.
1369         AddressToken token = tokens.nextToken();
1370 
1371         if (token.type == END_OF_TOKENS) {
1372             return null;
1373         }
1374 
1375         AddressToken next = tokens.nextToken();
1376 
1377         // single element phrases get special treatment.
1378         if (next.type == END_OF_TOKENS) {
1379             // this can be used directly...if it contains special characters, quoting will be
1380             // performed when it's converted to a string value.
1381             return token.value;
1382         }
1383 
1384         // reset to the beginning
1385         tokens.pushToken(token);
1386 
1387         // have at least two tokens,
1388         StringBuffer buffer = new StringBuffer();
1389 
1390         // get the first token.  After the first, we add these as blank delimited values.
1391         token = tokens.nextToken();
1392         addTokenValue(token, buffer);
1393 
1394         token = tokens.nextToken();
1395         while (token.type != END_OF_TOKENS) {
1396             // add a blank separator
1397             buffer.append(' ');
1398             // now add the next tokens value
1399             addTokenValue(token, buffer);
1400             token = tokens.nextToken();
1401         }
1402         // and return the canonicalized value
1403         return buffer.toString();
1404     }
1405 
1406 
1407     /**
1408      * take a canonicalized set of address tokens and reformat it back into a string value,
1409      * inserting whitespace where appropriate.
1410      *
1411      * @param tokens The set of tokens representing the address.
1412      *
1413      * @return The string value of the tokens.
1414      */
1415     private String addressToString(TokenStream tokens) {
1416         StringBuffer buffer = new StringBuffer();
1417 
1418         // this flag controls whether we insert a blank delimiter between tokens as
1419         // we advance through the list.  Blanks are only inserted between consequtive value tokens.
1420         // Initially, this is false, then we flip it to true whenever we add a value token, and
1421         // back to false for any special character token.
1422         boolean spaceRequired = false;
1423 
1424         // we use nextToken rather than nextRealToken(), since we need to process the comments also.
1425         AddressToken token = tokens.nextToken();
1426 
1427         // now add each of the tokens
1428         while (token.type != END_OF_TOKENS) {
1429             switch (token.type) {
1430                 // the word tokens are the only ones where we need to worry about adding
1431                 // whitespace delimiters.
1432                 case ATOM:
1433                 case QUOTED_LITERAL:
1434                     // was the last token also a word?  Insert a blank first.
1435                     if (spaceRequired) {
1436                         buffer.append(' ');
1437                     }
1438                     addTokenValue(token, buffer);
1439                     // let the next iteration know we just added a word to the list.
1440                     spaceRequired = true;
1441                     break;
1442 
1443                 // these special characters are just added in.  The constants for the character types
1444                 // were carefully selected to be the character value in question.  This allows us to
1445                 // just append the value.
1446                 case LEFT_ANGLE:
1447                 case RIGHT_ANGLE:
1448                 case COMMA:
1449                 case COLON:
1450                 case AT_SIGN:
1451                 case SEMICOLON:
1452                 case PERIOD:
1453                     buffer.append((char)token.type);
1454                     // no spaces around specials
1455                     spaceRequired = false;
1456                     break;
1457 
1458                 // Domain literals self delimiting...we can just append them and turn off the space flag.
1459                 case DOMAIN_LITERAL:
1460                     addTokenValue(token, buffer);
1461                     spaceRequired = false;
1462                     break;
1463 
1464                 // Comments are also self delimitin.
1465                 case COMMENT:
1466                     addTokenValue(token, buffer);
1467                     spaceRequired = false;
1468                     break;
1469             }
1470             token = tokens.nextToken();
1471         }
1472         return buffer.toString();
1473     }
1474 
1475 
1476     /**
1477      * Append a value token on to a string buffer used to create
1478      * the canonicalized string value.
1479      *
1480      * @param token  The token we're adding.
1481      * @param buffer The target string buffer.
1482      */
1483     private void addTokenValue(AddressToken token, StringBuffer buffer) {
1484         // atom values can be added directly.
1485         if (token.type == ATOM) {
1486             buffer.append(token.value);
1487         }
1488         // a literal value?  Add this as a quoted string
1489         else if (token.type == QUOTED_LITERAL) {
1490             buffer.append(formatQuotedString(token.value));
1491         }
1492         // could be a domain literal of the form "[value]"
1493         else if (token.type == DOMAIN_LITERAL) {
1494             buffer.append('[');
1495             buffer.append(token.value);
1496             buffer.append(']');
1497         }
1498         // comments also have values
1499         else if (token.type == COMMENT) {
1500             buffer.append('(');
1501             buffer.append(token.value);
1502             buffer.append(')');
1503         }
1504     }
1505 
1506 
1507 
1508     private static final byte[] CHARMAP = {
1509         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  0x06, 0x02, 0x06, 0x02, 0x02, 0x06, 0x02, 0x02,
1510         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
1511         0x04, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,  0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00,
1512         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00,
1513 
1514         0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1515         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00,
1516         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1517         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
1518     };
1519 
1520     private static final byte FLG_SPECIAL = 1;
1521     private static final byte FLG_CONTROL = 2;
1522     private static final byte FLG_SPACE = 4;
1523 
1524     private static boolean isSpace(char ch) {
1525         if (ch > '\u007f') {
1526             return false;
1527         } else {
1528             return (CHARMAP[ch] & FLG_SPACE) != 0;
1529         }
1530     }
1531 
1532     /**
1533      * Quick test to see if a character is an allowed atom character
1534      * or not.
1535      *
1536      * @param ch     The test character.
1537      *
1538      * @return true if this character is allowed in atoms, false for any
1539      *         control characters, special characters, or blanks.
1540      */
1541     public static boolean isAtom(char ch) {
1542         if (ch > '\u007f') {
1543             return false;
1544         }
1545         else if (ch == ' ') {
1546             return false;
1547         }
1548         else {
1549             return (CHARMAP[ch] & (FLG_SPECIAL | FLG_CONTROL)) == 0;
1550         }
1551     }
1552 
1553     /**
1554      * Tests one string to determine if it contains any of the
1555      * characters in a supplied test string.
1556      *
1557      * @param s      The string we're testing.
1558      * @param chars  The set of characters we're testing against.
1559      *
1560      * @return true if any of the characters is found, false otherwise.
1561      */
1562     public static boolean containsCharacters(String s, String chars)
1563     {
1564         for (int i = 0; i < s.length(); i++) {
1565             if (chars.indexOf(s.charAt(i)) >= 0) {
1566                 return true;
1567             }
1568         }
1569         return false;
1570     }
1571 
1572 
1573     /**
1574      * Tests if a string contains any non-special characters that
1575      * would require encoding the value as a quoted string rather
1576      * than a simple atom value.
1577      *
1578      * @param s      The test string.
1579      *
1580      * @return True if the string contains only blanks or allowed atom
1581      *         characters.
1582      */
1583     public static boolean containsSpecials(String s)
1584     {
1585         for (int i = 0; i < s.length(); i++) {
1586             char ch = s.charAt(i);
1587             // must be either a blank or an allowed atom char.
1588             if (ch == ' ' || isAtom(ch)) {
1589                 continue;
1590             }
1591             else {
1592                 return true;
1593             }
1594         }
1595         return false;
1596     }
1597 
1598 
1599     /**
1600      * Tests if a string contains any non-special characters that
1601      * would require encoding the value as a quoted string rather
1602      * than a simple atom value.
1603      *
1604      * @param s      The test string.
1605      *
1606      * @return True if the string contains only blanks or allowed atom
1607      *         characters.
1608      */
1609     public static boolean isAtom(String s)
1610     {
1611         for (int i = 0; i < s.length(); i++) {
1612             char ch = s.charAt(i);
1613             // must be an allowed atom character
1614             if (!isAtom(ch)) {
1615                 return false;
1616             }
1617         }
1618         return true;
1619     }
1620 
1621     /**
1622      * Apply RFC822 quoting rules to a literal string value.  This
1623      * will search the string to see if there are any characters that
1624      * require special escaping, and apply the escapes.  If the
1625      * string is just a string of blank-delimited atoms, the string
1626      * value is returned without quotes.
1627      *
1628      * @param s      The source string.
1629      *
1630      * @return A version of the string as a valid RFC822 quoted literal.
1631      */
1632     public static String quoteString(String s) {
1633 
1634         // only backslash and double quote require escaping.  If the string does not
1635         // contain any of these, then we can just slap on some quotes and go.
1636         if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) {
1637             // if the string is an atom (or a series of blank-delimited atoms), we can just return it directly.
1638             if (!containsSpecials(s)) {
1639                 return s;
1640             }
1641             StringBuffer buffer = new StringBuffer(s.length() + 2);
1642             buffer.append('"');
1643             buffer.append(s);
1644             buffer.append('"');
1645             return buffer.toString();
1646         }
1647 
1648         // get a buffer sufficiently large for the string, two quote characters, and a "reasonable"
1649         // number of escaped values.
1650         StringBuffer buffer = new StringBuffer(s.length() + 10);
1651         buffer.append('"');
1652 
1653         // now check all of the characters.
1654         for (int i = 0; i < s.length(); i++) {
1655             char ch = s.charAt(i);
1656             // character requiring escaping?
1657             if (ch == '\\' || ch == '"') {
1658                 // add an extra backslash
1659                 buffer.append('\\');
1660             }
1661             // and add on the character
1662             buffer.append(ch);
1663         }
1664         buffer.append('"');
1665         return buffer.toString();
1666     }
1667 
1668     /**
1669      * Apply RFC822 quoting rules to a literal string value.  This
1670      * will search the string to see if there are any characters that
1671      * require special escaping, and apply the escapes.  The returned
1672      * value is enclosed in quotes.
1673      *
1674      * @param s      The source string.
1675      *
1676      * @return A version of the string as a valid RFC822 quoted literal.
1677      */
1678     public static String formatQuotedString(String s) {
1679         // only backslash and double quote require escaping.  If the string does not
1680         // contain any of these, then we can just slap on some quotes and go.
1681         if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) {
1682             StringBuffer buffer = new StringBuffer(s.length() + 2);
1683             buffer.append('"');
1684             buffer.append(s);
1685             buffer.append('"');
1686             return buffer.toString();
1687         }
1688 
1689         // get a buffer sufficiently large for the string, two quote characters, and a "reasonable"
1690         // number of escaped values.
1691         StringBuffer buffer = new StringBuffer(s.length() + 10);
1692         buffer.append('"');
1693 
1694         // now check all of the characters.
1695         for (int i = 0; i < s.length(); i++) {
1696             char ch = s.charAt(i);
1697             // character requiring escaping?
1698             if (ch == '\\' || ch == '"') {
1699                 // add an extra backslash
1700                 buffer.append('\\');
1701             }
1702             // and add on the character
1703             buffer.append(ch);
1704         }
1705         buffer.append('"');
1706         return buffer.toString();
1707     }
1708 
1709     public class TokenStream {
1710         // the set of tokens in the parsed address list, as determined by RFC822 syntax rules.
1711         private List tokens;
1712 
1713         // the current token position
1714         int currentToken = 0;
1715 
1716 
1717         /**
1718          * Default constructor for a TokenStream.  This creates an
1719          * empty TokenStream for purposes of tokenizing an address.
1720          * It is the creator's responsibility to terminate the stream
1721          * with a terminator token.
1722          */
1723         public TokenStream() {
1724             tokens = new ArrayList();
1725         }
1726 
1727 
1728         /**
1729          * Construct a TokenStream from a list of tokens.  A terminator
1730          * token is added to the end.
1731          *
1732          * @param tokens An existing token list.
1733          */
1734         public TokenStream(List tokens) {
1735             this.tokens = tokens;
1736             tokens.add(new AddressToken(END_OF_TOKENS, -1));
1737         }
1738 
1739         /**
1740          * Add an address token to the token list.
1741          *
1742          * @param t      The new token to add to the list.
1743          */
1744         public void addToken(AddressToken token) {
1745             tokens.add(token);
1746         }
1747 
1748         /**
1749          * Get the next token at the cursor position, advancing the
1750          * position accordingly.
1751          *
1752          * @return The token at the current token position.
1753          */
1754         public AddressToken nextToken() {
1755             AddressToken token = (AddressToken)tokens.get(currentToken++);
1756             // we skip over white space tokens when operating in this mode, so
1757             // check the token and iterate until we get a non-white space.
1758             while (token.type == WHITESPACE) {
1759                 token = (AddressToken)tokens.get(currentToken++);
1760             }
1761             return token;
1762         }
1763 
1764 
1765         /**
1766          * Get the next token at the cursor position, without advancing the
1767          * position.
1768          *
1769          * @return The token at the current token position.
1770          */
1771         public AddressToken currentToken() {
1772             // return the current token and step the cursor
1773             return (AddressToken)tokens.get(currentToken);
1774         }
1775 
1776 
1777         /**
1778          * Get the next non-comment token from the string.  Comments are ignored, except as personal information
1779          * for very simple address specifications.
1780          *
1781          * @return A token guaranteed not to be a whitespace token.
1782          */
1783         public AddressToken nextRealToken()
1784         {
1785             AddressToken token = nextToken();
1786             if (token.type == COMMENT) {
1787                 token = nextToken();
1788             }
1789             return token;
1790         }
1791 
1792         /**
1793          * Push a token back on to the queue, making the index of this
1794          * token the current cursor position.
1795          *
1796          * @param token  The token to push.
1797          */
1798         public void pushToken(AddressToken token) {
1799             // just reset the cursor to the token's index position.
1800             currentToken = tokenIndex(token);
1801         }
1802 
1803         /**
1804          * Get the next token after a given token, without advancing the
1805          * token position.
1806          *
1807          * @param token  The token we're retrieving a token relative to.
1808          *
1809          * @return The next token in the list.
1810          */
1811         public AddressToken nextToken(AddressToken token) {
1812             return (AddressToken)tokens.get(tokenIndex(token) + 1);
1813         }
1814 
1815 
1816         /**
1817          * Return the token prior to a given token.
1818          *
1819          * @param token  The token used for the index.
1820          *
1821          * @return The token prior to the index token in the list.
1822          */
1823         public AddressToken previousToken(AddressToken token) {
1824             return (AddressToken)tokens.get(tokenIndex(token) - 1);
1825         }
1826 
1827 
1828         /**
1829          * Retrieve a token at a given index position.
1830          *
1831          * @param index  The target index.
1832          */
1833         public AddressToken getToken(int index)
1834         {
1835             return (AddressToken)tokens.get(index);
1836         }
1837 
1838 
1839         /**
1840          * Retrieve the index of a particular token in the stream.
1841          *
1842          * @param token  The target token.
1843          *
1844          * @return The index of the token within the stream.  Returns -1 if this
1845          *         token is somehow not in the stream.
1846          */
1847         public int tokenIndex(AddressToken token) {
1848             return tokens.indexOf(token);
1849         }
1850 
1851 
1852         /**
1853          * Extract a new TokenStream running from the start token to the
1854          * token preceeding the end token.
1855          *
1856          * @param start  The starting token of the section.
1857          * @param end    The last token (+1) for the target section.
1858          *
1859          * @return A new TokenStream object for processing this section of tokens.
1860          */
1861         public TokenStream section(AddressToken start, AddressToken end) {
1862             int startIndex = tokenIndex(start);
1863             int endIndex = tokenIndex(end);
1864 
1865             // List.subList() returns a list backed by the original list.  Since we need to add a
1866             // terminator token to this list when we take the sublist, we need to manually copy the
1867             // references so we don't end up munging the original list.
1868             ArrayList list = new ArrayList(endIndex - startIndex + 2);
1869 
1870             for (int i = startIndex; i <= endIndex; i++) {
1871                 list.add(tokens.get(i));
1872             }
1873             return new TokenStream(list);
1874         }
1875 
1876 
1877         /**
1878          * Reset the token position back to the beginning of the
1879          * stream.
1880          */
1881         public void reset() {
1882             currentToken = 0;
1883         }
1884 
1885         /**
1886          * Scan forward looking for a non-blank token.
1887          *
1888          * @return The first non-blank token in the stream.
1889          */
1890         public AddressToken getNonBlank()
1891         {
1892             AddressToken token = currentToken();
1893             while (token.type == WHITESPACE) {
1894                 currentToken++;
1895                 token = currentToken();
1896             }
1897             return token;
1898         }
1899 
1900 
1901         /**
1902          * Extract a blank delimited token from a TokenStream.  A blank
1903          * delimited token is the set of tokens up to the next real whitespace
1904          * token (comments not included).
1905          *
1906          * @return A TokenStream object with the new set of tokens.
1907          */
1908         public TokenStream getBlankDelimitedToken()
1909         {
1910             // get the next non-whitespace token.
1911             AddressToken first = getNonBlank();
1912             // if this is the end, we return null.
1913             if (first.type == END_OF_TOKENS) {
1914                 return null;
1915             }
1916 
1917             AddressToken last = first;
1918 
1919             // the methods for retrieving tokens skip over whitespace, so we're going to process this
1920             // by index.
1921             currentToken++;
1922 
1923             AddressToken token = currentToken();
1924             while (true) {
1925                 // if this is our marker, then pluck out the section and return it.
1926                 if (token.type == END_OF_TOKENS || token.type == WHITESPACE) {
1927                     return section(first, last);
1928                 }
1929                 last = token;
1930                 currentToken++;
1931                 // we accept any and all tokens here.
1932                 token = currentToken();
1933             }
1934         }
1935 
1936         /**
1937          * Return the index of the current cursor position.
1938          *
1939          * @return The integer index of the current token.
1940          */
1941         public int currentIndex() {
1942             return currentToken;
1943         }
1944 
1945         public void dumpTokens()
1946         {
1947             System.out.println(">>>>>>>>> Start dumping TokenStream tokens");
1948             for (int i = 0; i < tokens.size(); i++) {
1949                 System.out.println("-------- Token: " + tokens.get(i));
1950             }
1951 
1952             System.out.println("++++++++ cursor position=" + currentToken);
1953             System.out.println(">>>>>>>>> End dumping TokenStream tokens");
1954         }
1955     }
1956 
1957 
1958     /**
1959      * Simple utility class for representing address tokens.
1960      */
1961     public class AddressToken {
1962 
1963         // the token type
1964         int type;
1965 
1966         // string value of the token (can be null)
1967         String value;
1968 
1969         // position of the token within the address string.
1970         int position;
1971 
1972         AddressToken(int type, int position)
1973         {
1974             this.type = type;
1975             this.value = null;
1976             this.position = position;
1977         }
1978 
1979         AddressToken(String value, int type, int position)
1980         {
1981             this.type = type;
1982             this.value = value;
1983             this.position = position;
1984         }
1985 
1986         public String toString()
1987         {
1988             if (type == END_OF_TOKENS) {
1989                 return "AddressToken:  type=END_OF_TOKENS";
1990             }
1991             if (value == null) {
1992                 return "AddressToken:  type=" + (char)type;
1993             }
1994             else {
1995                 return "AddressToken:  type=" + (char)type + " value=" + value;
1996             }
1997         }
1998     }
1999 }
2000