1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 package javax.mail.internet;
21
22 import java.io.UnsupportedEncodingException;
23 import java.lang.reflect.Array;
24 import java.util.ArrayList;
25 import java.util.List;
26
27 class AddressParser {
28
// The validation strictness levels, from most lenient to most conformant.
// parseSingleAddress() compares the parser's validationLevel against
// NONSTRICT and PARSE_HEADER to decide how much checking to perform.
static public final int NONSTRICT = 0;
static public final int PARSE_HEADER = 1;
static public final int STRICT = 2;

// The different mailbox forms an address can turn out to have. UNKNOWN is the
// state used while the scanner is still looking for a trigger token.
static protected final int UNKNOWN = 0;
static protected final int ROUTE_ADDR = 1;
static protected final int GROUP_ADDR = 2;
static protected final int SIMPLE_ADDR = 3;

// Constants for token types. Single-character tokens use the character itself
// as the type code; multi-character tokens use a representative character
// (the opening delimiter, 'A' for atoms, ' ' for a run of white space).
static protected final int END_OF_TOKENS = '\0';
static protected final int PERIOD = '.';
static protected final int LEFT_ANGLE = '<';
static protected final int RIGHT_ANGLE = '>';
static protected final int COMMA = ',';
static protected final int AT_SIGN = '@';
static protected final int SEMICOLON = ';';
static protected final int COLON = ':';
static protected final int QUOTED_LITERAL = '"';
static protected final int DOMAIN_LITERAL = '[';
static protected final int COMMENT = '(';
static protected final int ATOM = 'A';
static protected final int WHITESPACE = ' ';


// the string we're parsing
private String addresses;
// the current parsing position (an index into the addresses string)
private int position;
// the end position of the string; set to addresses.length() by tokenizeAddress()
private int end;
// the strictness flag (supplied to the constructor)
private int validationLevel;
64
/**
 * Create a parser over an address string.
 *
 * @param addresses  The string holding the address (or addresses) to parse.
 * @param validation The strictness level applied while parsing.
 */
public AddressParser(String addresses, int validation) {
    this.validationLevel = validation;
    this.addresses = addresses;
}
69
70
71 /**
72 * Parse an address list into an array of internet addresses.
73 *
74 * @return An array containing all of the non-null addresses in the list.
75 * @exception AddressException
76 * Thrown for any validation errors.
77 */
78 public InternetAddress[] parseAddressList() throws AddressException
79 {
80 // get the address as a set of tokens we can process.
81 TokenStream tokens = tokenizeAddress();
82
83 // get an array list accumulator.
84 ArrayList addressList = new ArrayList();
85
86 // we process sections of the token stream until we run out of tokens.
87 while (true) {
88 // parse off a single address. Address lists can have null elements,
89 // so this might return a null value. The null value does not get added
90 // to the address accumulator.
91 addressList.addAll(parseSingleAddress(tokens, false));
92 // This token should be either a "," delimiter or a stream terminator. If we're
93 // at the end, time to get out.
94 AddressToken token = tokens.nextToken();
95 if (token.type == END_OF_TOKENS) {
96 break;
97 }
98 }
99
100 return (InternetAddress [])addressList.toArray(new InternetAddress[0]);
101 }
102
103
104 /**
105 * Parse a single internet address. This must be a single address,
106 * not an address list.
107 *
108 * @exception AddressException
109 */
110 public InternetAddress parseAddress() throws AddressException
111 {
112 // get the address as a set of tokens we can process.
113 TokenStream tokens = tokenizeAddress();
114
115 // parse off a single address. Address lists can have null elements,
116 // so this might return a null value. The null value does not get added
117 // to the address accumulator.
118 List addressList = parseSingleAddress(tokens, false);
119 // we must get exactly one address back from this.
120 if (addressList.isEmpty()) {
121 throw new AddressException("Null address", addresses, 0);
122 }
123 // this could be a simple list of blank delimited tokens. Ensure we only got one back.
124 if (addressList.size() > 1) {
125 throw new AddressException("Illegal Address", addresses, 0);
126 }
127
128 // This token must be a stream stream terminator, or we have an error.
129 AddressToken token = tokens.nextToken();
130 if (token.type != END_OF_TOKENS) {
131 illegalAddress("Illegal Address", token);
132 }
133
134 return (InternetAddress)addressList.get(0);
135 }
136
137
138 /**
139 * Validate an internet address. This must be a single address,
140 * not a list of addresses. The address also must not contain
141 * and personal information to be valid.
142 *
143 * @exception AddressException
144 */
145 public void validateAddress() throws AddressException
146 {
147 // get the address as a set of tokens we can process.
148 TokenStream tokens = tokenizeAddress();
149
150 // parse off a single address. Address lists can have null elements,
151 // so this might return a null value. The null value does not get added
152 // to the address accumulator.
153 List addressList = parseSingleAddress(tokens, false);
154 if (addressList.isEmpty()) {
155 throw new AddressException("Null address", addresses, 0);
156 }
157
158 // this could be a simple list of blank delimited tokens. Ensure we only got one back.
159 if (addressList.size() > 1) {
160 throw new AddressException("Illegal Address", addresses, 0);
161 }
162
163 InternetAddress address = (InternetAddress)addressList.get(0);
164
165 // validation occurs on an address that's already been split into personal and address
166 // data.
167 if (address.personal != null) {
168 throw new AddressException("Illegal Address", addresses, 0);
169 }
170 // This token must be a stream stream terminator, or we have an error.
171 AddressToken token = tokens.nextToken();
172 if (token.type != END_OF_TOKENS) {
173 illegalAddress("Illegal Address", token);
174 }
175 }
176
177
178 /**
179 * Extract the set of address from a group Internet specification.
180 *
181 * @return An array containing all of the non-null addresses in the list.
182 * @exception AddressException
183 */
184 public InternetAddress[] extractGroupList() throws AddressException
185 {
186 // get the address as a set of tokens we can process.
187 TokenStream tokens = tokenizeAddress();
188
189 // get an array list accumulator.
190 ArrayList addresses = new ArrayList();
191
192 AddressToken token = tokens.nextToken();
193
194 // scan forward to the ':' that starts the group list. If we don't find one,
195 // this is an exception.
196 while (token.type != COLON) {
197 if (token.type == END_OF_TOKENS) {
198 illegalAddress("Missing ':'", token);
199 }
200 token = tokens.nextToken();
201 }
202
203 // we process sections of the token stream until we run out of tokens.
204 while (true) {
205 // parse off a single address. Address lists can have null elements,
206 // so this might return a null value. The null value does not get added
207 // to the address accumulator.
208 addresses.addAll(parseSingleAddress(tokens, true));
209 // This token should be either a "," delimiter or a group terminator. If we're
210 // at the end, this is an error.
211 token = tokens.nextToken();
212 if (token.type == SEMICOLON) {
213 break;
214 }
215 else if (token.type == END_OF_TOKENS) {
216 illegalAddress("Missing ';'", token);
217 }
218 }
219
220 return (InternetAddress [])addresses.toArray(new InternetAddress[0]);
221 }
222
223
/**
 * Parse out a single address element from a stream of address tokens,
 * returning the InternetAddress object(s) that represent it.
 *
 * @param tokens  The token source for this address.
 * @param inGroup true when the element is a member of a group list. In
 *                that mode ';' terminates the element, while ':' and the
 *                end of the stream are reported as errors.
 *
 * @return A list of the InternetAddress objects constructed for this
 *         element. The list is empty if this is an "empty" address in a
 *         list, and can hold more than one entry when the lenient
 *         blank-delimited rules are applied.
 * @exception AddressException
 */
private List parseSingleAddress(TokenStream tokens, boolean inGroup) throws AddressException
{
    List parsedAddresses = new ArrayList();

    // index markers for personal information
    AddressToken personalStart = null;
    AddressToken personalEnd = null;

    // and similar bits for the address information.
    AddressToken addressStart = null;
    AddressToken addressEnd = null;

    // there is a fall-back set of rules allowed that will parse the address as a set of blank delimited
    // tokens.  However, we do NOT allow this if we encounter any tokens that fall outside of these
    // rules.  For example, comment fields and quoted strings will disallow the very lenient rule set.
    boolean nonStrictRules = true;

    // we don't know the type of address yet
    int addressType = UNKNOWN;

    // the parsing goes in two stages.  Stage one runs through the tokens locating the bounds
    // of the address we're working on, resolving the personal information, and also validating
    // some of the larger scale syntax features of an address (matched delimiters for routes and
    // groups, invalid nesting checks, etc.).

    // get the next token from the queue and save this.  We're going to scan ahead a bit to
    // figure out what type of address we're looking at, then reset to do the actual parsing
    // once we've figured out a form.
    AddressToken first = tokens.nextToken();
    // push it back on before starting processing.
    tokens.pushToken(first);

    // scan ahead for a trigger token that tells us what we've got.
    while (addressType == UNKNOWN) {

        AddressToken token = tokens.nextToken();
        switch (token.type) {
            // skip these for now...after we've processed everything and found that this is a simple
            // address form, then we'll check for a leading comment token in the first position and use
            // it as personal information.
            case COMMENT:
                // comments do, however, denote that this must be parsed according to RFC822 rules.
                nonStrictRules = false;
                break;

            // a semi-colon when processing a group is an address terminator.  we need to
            // process this like a comma then
            case SEMICOLON:
                if (inGroup) {
                    // we need to push the terminator back on for the caller to see.
                    tokens.pushToken(token);
                    // if we've not tagged any tokens as being the address beginning, this must be a
                    // null address.
                    if (addressStart == null) {
                        // just return the empty list from this.
                        return parsedAddresses;
                    }
                    // the end token is the back part.
                    addressEnd = tokens.previousToken(token);
                    // without a '<' for a route addr, we can't distinguish address tokens from personal data.
                    // We'll use a leading comment, if there is one.
                    personalStart = null;
                    // this is just a simple form.
                    addressType = SIMPLE_ADDR;
                    break;
                }

            // NOTE:  The above deliberately falls through if this is not a group, so a stray ';'
            // is handled like the literal tokens below.

            // any of these tokens are a real token that can be the start of an address.  Many of
            // them are not valid as first tokens in this context, but we flag them later if validation
            // has been requested.  For now, we just mark these as the potential address start.
            case DOMAIN_LITERAL:
            case QUOTED_LITERAL:
                // this set of tokens require fuller RFC822 parsing, so turn off the flag.
                nonStrictRules = false;

            // NOTE:  deliberate fall through...the literals are also potential address starts.
            case ATOM:
            case AT_SIGN:
            case PERIOD:
                // if we've not determined the start of the address yet, then check to see if we
                // need to consider this the personal start.
                if (addressStart == null) {
                    if (personalStart == null) {
                        personalStart = token;
                    }
                    // This is the first real token of the address, which at this point can
                    // be either the personal info or the first token of the address.  If we hit
                    // an address terminator without encountering either a route trigger or group
                    // trigger, then this is the real address.
                    addressStart = token;
                }
                break;

            // a LEFT_ANGLE indicates we have a full RFC822 mailbox form.  The leading phrase
            // is the personal info.  The address is inside the brackets.
            case LEFT_ANGLE:
                // a route address automatically switches off the blank-delimited token mode.
                nonStrictRules = false;
                // this is a route address
                addressType = ROUTE_ADDR;
                // the address is placed in the InternetAddress object without the route
                // brackets, so our start is one past this.
                addressStart = tokens.nextRealToken();
                // push this back on the queue so the scanner picks it up properly.
                tokens.pushToken(addressStart);
                // make sure we flag the end of the personal section too.
                if (personalStart != null) {
                    personalEnd = tokens.previousToken(token);
                }
                // scan the rest of a route address.
                addressEnd = scanRouteAddress(tokens, false);
                break;

            // a COLON indicates this is a group specifier...parse the group.
            case COLON:
                // Colons would not be valid in simple lists, so turn it off.
                nonStrictRules = false;
                // if we're scanning a group, we shouldn't encounter a ":".  This is a
                // recursion error if found.
                if (inGroup) {
                    illegalAddress("Nested group element", token);
                }
                addressType = GROUP_ADDR;
                // groups don't have any personal sections.
                personalStart = null;
                // our real start was back at the beginning
                addressStart = first;
                addressEnd = scanGroupAddress(tokens);
                break;

            // reached the end of string...this might be a null address, or one of the very simple name
            // forms used for non-strict RFC822 versions.  Reset, and try that form
            case END_OF_TOKENS:
                // if we're scanning a group, we shouldn't encounter an end token.  This is an
                // error if found.
                if (inGroup) {
                    illegalAddress("Missing ';'", token);
                }

            // NOTE:  deliberate fall through...the end of the stream terminates an address
            // in the same way a comma does.

            // this is either a terminator for an address list or a group terminator.
            case COMMA:
                // we need to push the terminator back on for the caller to see.
                tokens.pushToken(token);
                // if we've not tagged any tokens as being the address beginning, this must be a
                // null address.
                if (addressStart == null) {
                    // just return the empty list from this.
                    return parsedAddresses;
                }
                // the end token is the back part.
                addressEnd = tokens.previousToken(token);
                // without a '<' for a route addr, we can't distinguish address tokens from personal data.
                // We'll use a leading comment, if there is one.
                personalStart = null;
                // this is just a simple form.
                addressType = SIMPLE_ADDR;
                break;

            // right angle tokens are pushed, because parsing of the bracketing is not necessarily simple.
            // we need to flag these here.
            // NOTE(review): there is no break, so this relies on illegalAddress() always throwing -- confirm.
            case RIGHT_ANGLE:
                illegalAddress("Unexpected '>'", token);

        }
    }

    String personal = null;

    // if we have personal data, then convert it to a string value.
    if (personalStart != null) {
        TokenStream personalTokens = tokens.section(personalStart, personalEnd);
        personal = personalToString(personalTokens);
    }
    // if we have a simple address, then check the first token to see if it's a comment.  For simple addresses,
    // we'll accept the first comment token as the personal information.
    else {
        if (addressType == SIMPLE_ADDR && first.type == COMMENT) {
            personal = first.value;
        }
    }

    TokenStream addressTokens = tokens.section(addressStart, addressEnd);

    // Stage two: validate the address tokens according to the form detected above.  Validation
    // is skipped entirely when the parser was created with the PARSE_HEADER level.
    if (validationLevel != PARSE_HEADER) {
        switch (addressType) {
            case GROUP_ADDR:
                validateGroup(addressTokens);
                break;

            case ROUTE_ADDR:
                validateRouteAddr(addressTokens, false);
                break;

            case SIMPLE_ADDR:
                // this is a conditional validation
                validateSimpleAddress(addressTokens);
                break;
        }
    }

    // more complex addresses and addresses containing tokens other than just simple addresses
    // need proper handling.  Only a NONSTRICT parse of a simple address that contained nothing
    // but lenient-rule tokens takes the blank-delimited path in the else branch.
    if (validationLevel != NONSTRICT || addressType != SIMPLE_ADDR || !nonStrictRules) {
        // we might have traversed this already when we validated, so reset the
        // position before using this again.
        addressTokens.reset();
        String address = addressToString(addressTokens);

        // get the parsed out sections as string values.
        InternetAddress result = new InternetAddress();
        result.setAddress(address);
        try {
            result.setPersonal(personal);
        } catch (UnsupportedEncodingException e) {
            // NOTE(review): an encoding failure is silently ignored here, so the address is
            // returned regardless -- confirm this best-effort behavior is intended.
        }
        // even though we have a single address, we return this as a list.  Simple addresses
        // can produce multiple items, so we need to return everything.
        parsedAddresses.add(result);
        return parsedAddresses;
    }
    else {
        addressTokens.reset();

        // lenient form:  every blank delimited run of tokens becomes its own address, with
        // no personal information attached.
        TokenStream nextAddress = addressTokens.getBlankDelimitedToken();
        while (nextAddress != null) {
            String address = addressToString(nextAddress);
            // get the parsed out sections as string values.
            InternetAddress result = new InternetAddress();
            result.setAddress(address);
            parsedAddresses.add(result);
            nextAddress = addressTokens.getBlankDelimitedToken();
        }
        return parsedAddresses;
    }
}
480
481
482 /**
483 * Scan the token stream, parsing off a route addr spec. This
484 * will do some basic syntax validation, but will not actually
485 * validate any of the address information. Comments will be
486 * discarded.
487 *
488 * @param tokens The stream of tokens.
489 *
490 * @return The last token of the route address (the one preceeding the
491 * terminating '>'.
492 */
493 private AddressToken scanRouteAddress(TokenStream tokens, boolean inGroup) throws AddressException {
494 // get the first token and ensure we have something between the "<" and ">".
495 AddressToken token = tokens.nextRealToken();
496 // the last processed non-whitespace token, which is the actual address end once the
497 // right angle bracket is encountered.
498
499 AddressToken previous = null;
500
501 // if this route-addr has route information, the first token after the '<' must be a '@'.
502 // this determines if/where a colon or comma can appear.
503 boolean inRoute = token.type == AT_SIGN;
504
505 // now scan until we reach the terminator. The only validation is done on illegal characters.
506 while (true) {
507 switch (token.type) {
508 // The following tokens are all valid between the brackets, so just skip over them.
509 case ATOM:
510 case QUOTED_LITERAL:
511 case DOMAIN_LITERAL:
512 case PERIOD:
513 case AT_SIGN:
514 break;
515
516 case COLON:
517 // if not processing route information, this is illegal.
518 if (!inRoute) {
519 illegalAddress("Unexpected ':'", token);
520 }
521 // this is the end of the route information, the rules now change.
522 inRoute = false;
523 break;
524
525 case COMMA:
526 // if not processing route information, this is illegal.
527 if (!inRoute) {
528 illegalAddress("Unexpected ','", token);
529 }
530 break;
531
532 case RIGHT_ANGLE:
533 // if previous is null, we've had a route address which is "<>". That's illegal.
534 if (previous == null) {
535 illegalAddress("Illegal address", token);
536 }
537 // step to the next token..this had better be either a comma for another address or
538 // the very end of the address list .
539 token = tokens.nextRealToken();
540 // if we're scanning part of a group, then the allowed terminators are either ',' or ';'.
541 if (inGroup) {
542 if (token.type != COMMA && token.type != SEMICOLON) {
543 illegalAddress("Illegal address", token);
544 }
545 }
546 // a normal address should have either a ',' for a list or the end.
547 else {
548 if (token.type != COMMA && token.type != END_OF_TOKENS) {
549 illegalAddress("Illegal address", token);
550 }
551 }
552 // we need to push the termination token back on.
553 tokens.pushToken(token);
554 // return the previous token as the updated position.
555 return previous;
556
557 case END_OF_TOKENS:
558 illegalAddress("Missing '>'", token);
559
560 // now for the illegal ones in this context.
561 case SEMICOLON:
562 illegalAddress("Unexpected ';'", token);
563
564 case LEFT_ANGLE:
565 illegalAddress("Unexpected '<'", token);
566 }
567 // remember the previous token.
568 previous = token;
569 token = tokens.nextRealToken();
570 }
571 }
572
573
574 /**
575 * Scan the token stream, parsing off a group address. This
576 * will do some basic syntax validation, but will not actually
577 * validate any of the address information. Comments will be
578 * ignored.
579 *
580 * @param tokens The stream of tokens.
581 *
582 * @return The last token of the group address (the terminating ':").
583 */
584 private AddressToken scanGroupAddress(TokenStream tokens) throws AddressException {
585 // A group does not require that there be anything between the ':' and ';". This is
586 // just a group with an empty list.
587 AddressToken token = tokens.nextRealToken();
588
589 // now scan until we reach the terminator. The only validation is done on illegal characters.
590 while (true) {
591 switch (token.type) {
592 // The following tokens are all valid in group addresses, so just skip over them.
593 case ATOM:
594 case QUOTED_LITERAL:
595 case DOMAIN_LITERAL:
596 case PERIOD:
597 case AT_SIGN:
598 case COMMA:
599 break;
600
601 case COLON:
602 illegalAddress("Nested group", token);
603
604 // route address within a group specifier....we need to at least verify the bracket nesting
605 // and higher level syntax of the route.
606 case LEFT_ANGLE:
607 scanRouteAddress(tokens, true);
608 break;
609
610 // the only allowed terminator is the ';'
611 case END_OF_TOKENS:
612 illegalAddress("Missing ';'", token);
613
614 // now for the illegal ones in this context.
615 case SEMICOLON:
616 // verify there's nothing illegal after this.
617 AddressToken next = tokens.nextRealToken();
618 if (next.type != COMMA && next.type != END_OF_TOKENS) {
619 illegalAddress("Illegal address", token);
620 }
621 // don't forget to put this back on...our caller will need it.
622 tokens.pushToken(next);
623 return token;
624
625 case RIGHT_ANGLE:
626 illegalAddress("Unexpected '>'", token);
627 }
628 token = tokens.nextRealToken();
629 }
630 }
631
632
633 /**
634 * Parse the provided internet address into a set of tokens. This
635 * phase only does a syntax check on the tokens. The interpretation
636 * of the tokens is the next phase.
637 *
638 * @exception AddressException
639 */
640 private TokenStream tokenizeAddress() throws AddressException {
641
642 // get a list for the set of tokens
643 TokenStream tokens = new TokenStream();
644
645 end = addresses.length(); // our parsing end marker
646
647 // now scan along the string looking for the special characters in an internet address.
648 while (moreCharacters()) {
649 char ch = currentChar();
650
651 switch (ch) {
652 // start of a comment bit...ignore everything until we hit a closing paren.
653 case '(':
654 scanComment(tokens);
655 break;
656 // a closing paren found outside of normal processing.
657 case ')':
658 syntaxError("Unexpected ')'", position);
659
660
661 // start of a quoted string
662 case '"':
663 scanQuotedLiteral(tokens);
664 break;
665 // domain literal
666 case '[':
667 scanDomainLiteral(tokens);
668 break;
669
670 // a naked closing bracket...not valid except as part of a domain literal.
671 case ']':
672 syntaxError("Unexpected ']'", position);
673
674 // special character delimiters
675 case '<':
676 tokens.addToken(new AddressToken(LEFT_ANGLE, position));
677 nextChar();
678 break;
679
680 // a naked closing bracket...not valid without a starting one, but
681 // we need to handle this in context.
682 case '>':
683 tokens.addToken(new AddressToken(RIGHT_ANGLE, position));
684 nextChar();
685 break;
686 case ':':
687 tokens.addToken(new AddressToken(COLON, position));
688 nextChar();
689 break;
690 case ',':
691 tokens.addToken(new AddressToken(COMMA, position));
692 nextChar();
693 break;
694 case '.':
695 tokens.addToken(new AddressToken(PERIOD, position));
696 nextChar();
697 break;
698 case ';':
699 tokens.addToken(new AddressToken(SEMICOLON, position));
700 nextChar();
701 break;
702 case '@':
703 tokens.addToken(new AddressToken(AT_SIGN, position));
704 nextChar();
705 break;
706
707 // white space characters. These are mostly token delimiters, but there are some relaxed
708 // situations where they get processed, so we need to add a white space token for the first
709 // one we encounter in a span.
710 case ' ':
711 case '\t':
712 case '\r':
713 case '\n':
714 // add a single white space token
715 tokens.addToken(new AddressToken(WHITESPACE, position));
716
717 nextChar();
718 // step over any space characters, leaving us positioned either at the end
719 // or the first
720 while (moreCharacters()) {
721 char nextChar = currentChar();
722 if (nextChar == ' ' || nextChar == '\t' || nextChar == '\r' || nextChar == '\n') {
723 nextChar();
724 }
725 else {
726 break;
727 }
728 }
729 break;
730
731 // potentially an atom...if it starts with an allowed atom character, we
732 // parse out the token, otherwise this is invalid.
733 default:
734 if (ch < 040 || ch >= 0177) {
735 syntaxError("Illegal character in address", position);
736 }
737
738 scanAtom(tokens);
739 break;
740 }
741 }
742
743 // for this end marker, give an end position.
744 tokens.addToken(new AddressToken(END_OF_TOKENS, addresses.length()));
745 return tokens;
746 }
747
748
749 /**
750 * Step to the next character position while parsing.
751 */
752 private void nextChar() {
753 position++;
754 }
755
756
757 /**
758 * Retrieve the character at the current parsing position.
759 *
760 * @return The current character.
761 */
762 private char currentChar() {
763 return addresses.charAt(position);
764 }
765
766 /**
767 * Test if there are more characters left to parse.
768 *
769 * @return True if we've hit the last character, false otherwise.
770 */
771 private boolean moreCharacters() {
772 return position < end;
773 }
774
775
776 /**
777 * Parse a quoted string as specified by the RFC822 specification.
778 *
779 * @param tokens The TokenStream where the parsed out token is added.
780 */
781 private void scanQuotedLiteral(TokenStream tokens) throws AddressException {
782 StringBuffer value = new StringBuffer();
783
784 // save the start position for the token.
785 int startPosition = position;
786 // step over the quote delimiter.
787 nextChar();
788
789 while (moreCharacters()) {
790 char ch = currentChar();
791
792 // is this an escape char?
793 if (ch == '\\') {
794 // step past this, and grab the following character
795 nextChar();
796 if (!moreCharacters()) {
797 syntaxError("Missing '\"'", position);
798 }
799 value.append(currentChar());
800 }
801 // end of the string?
802 else if (ch == '"') {
803 // return the constructed string.
804 tokens.addToken(new AddressToken(value.toString(), QUOTED_LITERAL, position));
805 // step over the close delimiter for the benefit of the next token.
806 nextChar();
807 return;
808 }
809 // the RFC822 spec disallows CR characters.
810 else if (ch == '\r') {
811 syntaxError("Illegal line end in literal", position);
812 }
813 else
814 {
815 value.append(ch);
816 }
817 nextChar();
818 }
819 // missing delimiter
820 syntaxError("Missing '\"'", position);
821 }
822
823
824 /**
825 * Parse a domain literal as specified by the RFC822 specification.
826 *
827 * @param tokens The TokenStream where the parsed out token is added.
828 */
829 private void scanDomainLiteral(TokenStream tokens) throws AddressException {
830 StringBuffer value = new StringBuffer();
831
832 int startPosition = position;
833 // step over the quote delimiter.
834 nextChar();
835
836 while (moreCharacters()) {
837 char ch = currentChar();
838
839 // is this an escape char?
840 if (ch == '\\') {
841 // because domain literals don't get extra escaping, we render them
842 // with the escaped characters intact. Therefore, append the '\' escape
843 // first, then append the escaped character without examination.
844 value.append(currentChar());
845 // step past this, and grab the following character
846 nextChar();
847 if (!moreCharacters()) {
848 syntaxError("Missing '\"'", position);
849 }
850 value.append(currentChar());
851 }
852 // end of the string?
853 else if (ch == ']') {
854 // return the constructed string.
855 tokens.addToken(new AddressToken(value.toString(), DOMAIN_LITERAL, startPosition));
856 // step over the close delimiter for the benefit of the next token.
857 nextChar();
858 return;
859 }
860 // the RFC822 spec says no nesting
861 else if (ch == '[') {
862 syntaxError("Unexpected '['", position);
863 }
864 // carriage returns are similarly illegal.
865 else if (ch == '\r') {
866 syntaxError("Illegal line end in domain literal", position);
867 }
868 else
869 {
870 value.append(ch);
871 }
872 nextChar();
873 }
874 // missing delimiter
875 syntaxError("Missing ']'", position);
876 }
877
878 /**
879 * Scan an atom in an internet address, using the RFC822 rules
880 * for atom delimiters.
881 *
882 * @param tokens The TokenStream where the parsed out token is added.
883 */
884 private void scanAtom(TokenStream tokens) throws AddressException {
885 int start = position;
886 nextChar();
887 while (moreCharacters()) {
888
889 char ch = currentChar();
890 if (isAtom(ch)) {
891 nextChar();
892 }
893 else {
894 break;
895 }
896 }
897
898 // return the scanned part of the string.
899 tokens.addToken(new AddressToken(addresses.substring(start, position), ATOM, start));
900 }
901
902
903 /**
904 * Parse an internet address comment field as specified by
905 * RFC822. Includes support for quoted characters and nesting.
906 *
907 * @param tokens The TokenStream where the parsed out token is added.
908 */
909 private void scanComment(TokenStream tokens) throws AddressException {
910 StringBuffer value = new StringBuffer();
911
912 int startPosition = position;
913 // step past the start character
914 nextChar();
915
916 // we're at the top nesting level on the comment.
917 int nest = 1;
918
919 // scan while we have more characters.
920 while (moreCharacters()) {
921 char ch = currentChar();
922 // escape character?
923 if (ch == '\\') {
924 // step over this...if escaped, we must have at least one more character
925 // in the string.
926 nextChar();
927 if (!moreCharacters()) {
928 syntaxError("Missing ')'", position);
929 }
930 value.append(currentChar());
931 }
932 // nested comment?
933 else if (ch == '(') {
934 // step the nesting level...we treat the comment as a single unit, with the delimiters
935 // for the nested comments embedded in the middle
936 nest++;
937 value.append(ch);
938 }
939 // is this the comment close?
940 else if (ch == ')') {
941 // reduce the nesting level. If we still have more to process, add the delimiter character
942 // and keep going.
943 nest--;
944 if (nest > 0) {
945 value.append(ch);
946 }
947 else {
948 // step past this and return. The outermost comment delimiter is not included in
949 // the string value, since this is frequently used as personal data on the
950 // InternetAddress objects.
951 nextChar();
952 tokens.addToken(new AddressToken(value.toString(), COMMENT, startPosition));
953 return;
954 }
955 }
956 else if (ch == '\r') {
957 syntaxError("Illegal line end in comment", position);
958 }
959 else {
960 value.append(ch);
961 }
962 // step to the next character.
963 nextChar();
964 }
965 // ran out of data before seeing the closing bit, not good
966 syntaxError("Missing ')'", position);
967 }
968
969
970 /**
971 * Validate the syntax of an RFC822 group internet address specification.
972 *
973 * @param tokens The stream of tokens for the address.
974 *
975 * @exception AddressException
976 */
977 private void validateGroup(TokenStream tokens) throws AddressException {
978 // we know already this is an address in the form "phrase:group;". Now we need to validate the
979 // elements.
980
981 int phraseCount = 0;
982
983 AddressToken token = tokens.nextRealToken();
984 // now scan to the semi color, ensuring we have only word or comment tokens.
985 while (token.type != COLON) {
986 // only these tokens are allowed here.
987 if (token.type != ATOM && token.type != QUOTED_LITERAL) {
988 invalidToken(token);
989 }
990 phraseCount++;
991 token = tokens.nextRealToken();
992 }
993
994
995 // RFC822 groups require a leading phrase in group specifiers.
996 if (phraseCount == 0) {
997 illegalAddress("Missing group identifier phrase", token);
998 }
999
1000 // now we do the remainder of the parsing using the initial phrase list as the sink...the entire
1001 // address will be converted to a string later.
1002
1003 // ok, we only know this has been valid up to the ":", now we have some real checks to perform.
1004 while (true) {
1005 // go scan off a mailbox. if everything goes according to plan, we should be positioned at either
1006 // a comma or a semicolon.
1007 validateGroupMailbox(tokens);
1008
1009 token = tokens.nextRealToken();
1010
1011 // we're at the end of the group. Make sure this is truely the end.
1012 if (token.type == SEMICOLON) {
1013 token = tokens.nextRealToken();
1014 if (token.type != END_OF_TOKENS) {
1015 illegalAddress("Illegal group address", token);
1016 }
1017 return;
1018 }
1019
1020 // if not a semicolon, this better be a comma.
1021 else if (token.type != COMMA) {
1022 illegalAddress("Illegal group address", token);
1023 }
1024 }
1025 }
1026
1027
1028 /**
1029 * Validate the syntax of single mailbox within a group address.
1030 *
1031 * @param tokens The stream of tokens representing the address.
1032 *
1033 * @exception AddressException
1034 */
1035 private void validateGroupMailbox(TokenStream tokens) throws AddressException {
1036 AddressToken first = tokens.nextRealToken();
1037 // is this just a null address in the list? then push the terminator back and return.
1038 if (first.type == COMMA || first.type == SEMICOLON) {
1039 tokens.pushToken(first);
1040 return;
1041 }
1042
1043 // now we need to scan ahead to see if we can determine the type.
1044 AddressToken token = first;
1045
1046
1047 // we need to scan forward to figure out what sort of address this is.
1048 while (first != null) {
1049 switch (token.type) {
1050 // until we know the context, these are all just ignored.
1051 case QUOTED_LITERAL:
1052 case ATOM:
1053 break;
1054
1055 // a LEFT_ANGLE indicates we have a full RFC822 mailbox form. The leading phrase
1056 // is the personal info. The address is inside the brackets.
1057 case LEFT_ANGLE:
1058 tokens.pushToken(first);
1059 validatePhrase(tokens, false);
1060 validateRouteAddr(tokens, true);
1061 return;
1062
1063 // we've hit a period as the first non-word token. This should be part of a local-part
1064 // of an address.
1065 case PERIOD:
1066 // we've hit an "@" as the first non-word token. This is probably a simple address in
1067 // the form "user@domain".
1068 case AT_SIGN:
1069 tokens.pushToken(first);
1070 validateAddressSpec(tokens);
1071 return;
1072
1073 // reached the end of string...this might be a null address, or one of the very simple name
1074 // forms used for non-strict RFC822 versions. Reset, and try that form
1075 case COMMA:
1076 // this is the end of the group...handle it like a comma for now.
1077 case SEMICOLON:
1078 tokens.pushToken(first);
1079 validateAddressSpec(tokens);
1080 return;
1081
1082 case END_OF_TOKENS:
1083 illegalAddress("Missing ';'", token);
1084
1085 }
1086 token = tokens.nextRealToken();
1087 }
1088 }
1089
1090
1091 /**
1092 * Utility method for throwing an AddressException caused by an
1093 * unexpected primitive token.
1094 *
1095 * @param token The token causing the problem (must not be a value type token).
1096 *
1097 * @exception AddressException
1098 */
1099 private void invalidToken(AddressToken token) throws AddressException {
1100 illegalAddress("Unexpected '" + token.type + "'", token);
1101 }
1102
1103
1104 /**
1105 * Raise an error about illegal syntax.
1106 *
1107 * @param message The message used in the thrown exception.
1108 * @param position The parsing position within the string.
1109 *
1110 * @exception AddressException
1111 */
1112 private void syntaxError(String message, int position) throws AddressException
1113 {
1114 throw new AddressException(message, addresses, position);
1115 }
1116
1117
1118 /**
1119 * Throw an exception based on the position of an invalid token.
1120 *
1121 * @param message The exception message.
 * @param token The token causing the error. This token's position is used
 * in the exception information.
1124 */
1125 private void illegalAddress(String message, AddressToken token) throws AddressException {
1126 throw new AddressException(message, addresses, token.position);
1127 }
1128
1129
1130 /**
1131 * Validate that a required phrase exists.
1132 *
1133 * @param tokens The set of tokens to validate. positioned at the phrase start.
1134 * @param required A flag indicating whether the phrase is optional or required.
1135 *
1136 * @exception AddressException
1137 */
1138 private void validatePhrase(TokenStream tokens, boolean required) throws AddressException {
1139 // we need to have at least one WORD token in the phrase...everything is optional
1140 // after that.
1141 AddressToken token = tokens.nextRealToken();
1142 if (token.type != ATOM && token.type != QUOTED_LITERAL) {
1143 if (required) {
1144 illegalAddress("Missing group phrase", token);
1145 }
1146 }
1147
1148 // now scan forward to the end of the phrase
1149 token = tokens.nextRealToken();
1150 while (token.type == ATOM || token.type == QUOTED_LITERAL) {
1151 token = tokens.nextRealToken();
1152 }
1153 }
1154
1155
1156 /**
1157 * validate a routeaddr specification
1158 *
1159 * @param tokens The tokens representing the address portion (personal information
1160 * already removed).
1161 * @param ingroup true indicates we're validating a route address inside a
1162 * group list. false indicates we're validating a standalone
1163 * address.
1164 *
1165 * @exception AddressException
1166 */
1167 private void validateRouteAddr(TokenStream tokens, boolean ingroup) throws AddressException {
1168 // get the next real token.
1169 AddressToken token = tokens.nextRealToken();
1170 // if this is an at sign, then we have a list of domains to parse.
1171 if (token.type == AT_SIGN) {
1172 // push the marker token back in for the route parser, and step past that part.
1173 tokens.pushToken(token);
1174 validateRoute(tokens);
1175 }
1176 else {
1177 // we need to push this back on to validate the local part.
1178 tokens.pushToken(token);
1179 }
1180
1181 // now we expect to see an address spec.
1182 validateAddressSpec(tokens);
1183
1184 token = tokens.nextRealToken();
1185 if (ingroup) {
1186 // if we're validating within a group specification, the angle brackets are still there (and
1187 // required).
1188 if (token.type != RIGHT_ANGLE) {
1189 illegalAddress("Missing '>'", token);
1190 }
1191 }
1192 else {
1193 // the angle brackets were removed to make this an address, so we should be done. Make sure we
1194 // have a terminator here.
1195 if (token.type != END_OF_TOKENS) {
1196 illegalAddress("Illegal Address", token);
1197 }
1198 }
1199 }
1200
1201
1202
1203 /**
1204 * Validate a simple address in the form "user@domain".
1205 *
1206 * @param tokens The stream of tokens representing the address.
1207 */
1208 private void validateSimpleAddress(TokenStream tokens) throws AddressException {
1209
1210 // the validation routines occur after addresses have been split into
1211 // personal and address forms. Therefore, our validation begins directly
1212 // with the first token.
1213 validateAddressSpec(tokens);
1214
1215 // get the next token and see if there is something here...anything but the terminator is an error
1216 AddressToken token = tokens.nextRealToken();
1217 if (token.type != END_OF_TOKENS) {
1218 illegalAddress("Illegal Address", token);
1219 }
1220 }
1221
1222 /**
1223 * Validate the addr-spec portion of an address. RFC822 requires
1224 * this be of the form "local-part@domain". However, javamail also
1225 * allows simple address of the form "local-part". We only require
1226 * the domain if an '@' is encountered.
1227 *
1228 * @param tokens
1229 */
1230 private void validateAddressSpec(TokenStream tokens) throws AddressException {
1231 // all addresses, even the simple ones, must have at least a local part.
1232 validateLocalPart(tokens);
1233
1234 // now see if we have a domain portion to look at.
1235 AddressToken token = tokens.nextRealToken();
1236 if (token.type == AT_SIGN) {
1237 validateDomain(tokens);
1238 }
1239 else {
1240 // put this back for termination
1241 tokens.pushToken(token);
1242 }
1243
1244 }
1245
1246
1247 /**
1248 * Validate the route portion of a route-addr. This is a list
1249 * of domain values in the form 1#("@" domain) ":".
1250 *
1251 * @param tokens The token stream holding the address information.
1252 */
1253 private void validateRoute(TokenStream tokens) throws AddressException {
1254 while (true) {
1255 AddressToken token = tokens.nextRealToken();
1256 // if this is the first part of the list, go parse off a domain
1257 if (token.type == AT_SIGN) {
1258 validateDomain(tokens);
1259 }
1260 // another element in the list? Go around again
1261 else if (token.type == COMMA) {
1262 continue;
1263 }
1264 // the list is terminated by a colon...stop this part of the validation once we hit one.
1265 else if (token.type == COLON) {
1266 return;
1267 }
1268 // the list is terminated by a colon. If this isn't one of those, we have an error.
1269 else {
1270 illegalAddress("Missing ':'", token);
1271 }
1272 }
1273 }
1274
1275
1276 /**
1277 * Parse the local part of an address spec. The local part
1278 * is a series of "words" separated by ".".
1279 */
1280 private void validateLocalPart(TokenStream tokens) throws AddressException {
1281 while (true) {
1282 // get the token.
1283 AddressToken token = tokens.nextRealToken();
1284
1285 // this must be either an atom or a literal.
1286 if (token.type != ATOM && token.type != QUOTED_LITERAL) {
1287 illegalAddress("Invalid local part", token);
1288 }
1289
1290 // get the next token (white space and comments ignored)
1291 token = tokens.nextRealToken();
1292 // if this is a period, we continue parsing
1293 if (token.type != PERIOD) {
1294 tokens.pushToken(token);
1295 // return the token
1296 return;
1297 }
1298 }
1299 }
1300
1301
1302
1303 /**
1304 * Parse a domain name of the form sub-domain *("." sub-domain).
1305 * a sub-domain is either an atom or a domain-literal.
1306 */
1307 private void validateDomain(TokenStream tokens) throws AddressException {
1308 while (true) {
1309 // get the token.
1310 AddressToken token = tokens.nextRealToken();
1311
1312 // this must be either an atom or a domain literal.
1313 if (token.type != ATOM && token.type != DOMAIN_LITERAL) {
1314 illegalAddress("Invalid domain", token);
1315 }
1316
1317 // get the next token (white space is ignored)
1318 token = tokens.nextRealToken();
1319 // if this is a period, we continue parsing
1320 if (token.type != PERIOD) {
1321 // return the token
1322 tokens.pushToken(token);
1323 return;
1324 }
1325 }
1326 }
1327
1328 /**
1329 * Convert a list of word tokens into a phrase string. The
1330 * rules for this are a little hard to puzzle out, but there
1331 * is a logic to it. If the list is empty, the phrase is
1332 * just a null value.
1333 *
1334 * If we have a phrase, then the quoted strings need to
1335 * handled appropriately. In multi-token phrases, the
1336 * quoted literals are concatenated with the quotes intact,
1337 * regardless of content. Thus a phrase that comes in like this:
1338 *
1339 * "Geronimo" Apache
1340 *
1341 * gets converted back to the same string.
1342 *
1343 * If there is just a single token in the phrase, AND the token
1344 * is a quoted string AND the string does not contain embedded
1345 * special characters ("\.,@<>()[]:;), then the phrase
1346 * is expressed as an atom. Thus the literal
1347 *
1348 * "Geronimo"
1349 *
1350 * becomes
1351 *
1352 * Geronimo
1353 *
1354 * but
1355 *
1356 * "(Geronimo)"
1357 *
1358 * remains
1359 *
1360 * "(Geronimo)"
1361 *
1362 * Note that we're generating a canonical form of the phrase,
1363 * which removes comments and reduces linear whitespace down
1364 * to a single separator token.
1365 *
 * @param tokens The stream of phrase tokens (which may be empty).
1367 */
1368 private String personalToString(TokenStream tokens) {
1369
1370 // no tokens in the stream? This is a null value.
1371 AddressToken token = tokens.nextToken();
1372
1373 if (token.type == END_OF_TOKENS) {
1374 return null;
1375 }
1376
1377 AddressToken next = tokens.nextToken();
1378
1379 // single element phrases get special treatment.
1380 if (next.type == END_OF_TOKENS) {
1381 // this can be used directly...if it contains special characters, quoting will be
1382 // performed when it's converted to a string value.
1383 return token.value;
1384 }
1385
1386 // reset to the beginning
1387 tokens.pushToken(token);
1388
1389 // have at least two tokens,
1390 StringBuffer buffer = new StringBuffer();
1391
1392 // get the first token. After the first, we add these as blank delimited values.
1393 token = tokens.nextToken();
1394 addTokenValue(token, buffer);
1395
1396 token = tokens.nextToken();
1397 while (token.type != END_OF_TOKENS) {
1398 // add a blank separator
1399 buffer.append(' ');
1400 // now add the next tokens value
1401 addTokenValue(token, buffer);
1402 token = tokens.nextToken();
1403 }
1404 // and return the canonicalized value
1405 return buffer.toString();
1406 }
1407
1408
1409 /**
1410 * take a canonicalized set of address tokens and reformat it back into a string value,
1411 * inserting whitespace where appropriate.
1412 *
1413 * @param tokens The set of tokens representing the address.
1414 *
1415 * @return The string value of the tokens.
1416 */
1417 private String addressToString(TokenStream tokens) {
1418 StringBuffer buffer = new StringBuffer();
1419
1420 // this flag controls whether we insert a blank delimiter between tokens as
1421 // we advance through the list. Blanks are only inserted between consequtive value tokens.
1422 // Initially, this is false, then we flip it to true whenever we add a value token, and
1423 // back to false for any special character token.
1424 boolean spaceRequired = false;
1425
1426 // we use nextToken rather than nextRealToken(), since we need to process the comments also.
1427 AddressToken token = tokens.nextToken();
1428
1429 // now add each of the tokens
1430 while (token.type != END_OF_TOKENS) {
1431 switch (token.type) {
1432 // the word tokens are the only ones where we need to worry about adding
1433 // whitespace delimiters.
1434 case ATOM:
1435 case QUOTED_LITERAL:
1436 // was the last token also a word? Insert a blank first.
1437 if (spaceRequired) {
1438 buffer.append(' ');
1439 }
1440 addTokenValue(token, buffer);
1441 // let the next iteration know we just added a word to the list.
1442 spaceRequired = true;
1443 break;
1444
1445 // these special characters are just added in. The constants for the character types
1446 // were carefully selected to be the character value in question. This allows us to
1447 // just append the value.
1448 case LEFT_ANGLE:
1449 case RIGHT_ANGLE:
1450 case COMMA:
1451 case COLON:
1452 case AT_SIGN:
1453 case SEMICOLON:
1454 case PERIOD:
1455 buffer.append((char)token.type);
1456 // no spaces around specials
1457 spaceRequired = false;
1458 break;
1459
1460 // Domain literals self delimiting...we can just append them and turn off the space flag.
1461 case DOMAIN_LITERAL:
1462 addTokenValue(token, buffer);
1463 spaceRequired = false;
1464 break;
1465
1466 // Comments are also self delimitin.
1467 case COMMENT:
1468 addTokenValue(token, buffer);
1469 spaceRequired = false;
1470 break;
1471 }
1472 token = tokens.nextToken();
1473 }
1474 return buffer.toString();
1475 }
1476
1477
1478 /**
1479 * Append a value token on to a string buffer used to create
1480 * the canonicalized string value.
1481 *
1482 * @param token The token we're adding.
1483 * @param buffer The target string buffer.
1484 */
1485 private void addTokenValue(AddressToken token, StringBuffer buffer) {
1486 // atom values can be added directly.
1487 if (token.type == ATOM) {
1488 buffer.append(token.value);
1489 }
1490 // a literal value? Add this as a quoted string
1491 else if (token.type == QUOTED_LITERAL) {
1492 buffer.append(formatQuotedString(token.value));
1493 }
1494 // could be a domain literal of the form "[value]"
1495 else if (token.type == DOMAIN_LITERAL) {
1496 buffer.append('[');
1497 buffer.append(token.value);
1498 buffer.append(']');
1499 }
1500 // comments also have values
1501 else if (token.type == COMMENT) {
1502 buffer.append('(');
1503 buffer.append(token.value);
1504 buffer.append(')');
1505 }
1506 }
1507
1508
1509
1510 private static final byte[] CHARMAP = {
1511 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x06, 0x02, 0x06, 0x02, 0x02, 0x06, 0x02, 0x02,
1512 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
1513 0x04, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00,
1514 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00,
1515
1516 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1517 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00,
1518 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1519 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
1520 };
1521
1522 private static final byte FLG_SPECIAL = 1;
1523 private static final byte FLG_CONTROL = 2;
1524 private static final byte FLG_SPACE = 4;
1525
1526 private static boolean isSpace(char ch) {
1527 if (ch > '\u007f') {
1528 return false;
1529 } else {
1530 return (CHARMAP[ch] & FLG_SPACE) != 0;
1531 }
1532 }
1533
1534 /**
1535 * Quick test to see if a character is an allowed atom character
1536 * or not.
1537 *
1538 * @param ch The test character.
1539 *
1540 * @return true if this character is allowed in atoms, false for any
1541 * control characters, special characters, or blanks.
1542 */
1543 public static boolean isAtom(char ch) {
1544 if (ch > '\u007f') {
1545 return false;
1546 }
1547 else if (ch == ' ') {
1548 return false;
1549 }
1550 else {
1551 return (CHARMAP[ch] & (FLG_SPECIAL | FLG_CONTROL)) == 0;
1552 }
1553 }
1554
1555 /**
1556 * Tests one string to determine if it contains any of the
1557 * characters in a supplied test string.
1558 *
1559 * @param s The string we're testing.
1560 * @param chars The set of characters we're testing against.
1561 *
1562 * @return true if any of the characters is found, false otherwise.
1563 */
1564 public static boolean containsCharacters(String s, String chars)
1565 {
1566 for (int i = 0; i < s.length(); i++) {
1567 if (chars.indexOf(s.charAt(i)) >= 0) {
1568 return true;
1569 }
1570 }
1571 return false;
1572 }
1573
1574
1575 /**
 * Tests if a string contains any characters that would require
 * encoding the value as a quoted string rather than a simple
 * atom value.
 *
 * @param s The test string.
 *
 * @return true if the string contains at least one character that is
 * neither a blank nor an allowed atom character; false if it
 * consists only of blanks and allowed atom characters.
1584 */
1585 public static boolean containsSpecials(String s)
1586 {
1587 for (int i = 0; i < s.length(); i++) {
1588 char ch = s.charAt(i);
1589 // must be either a blank or an allowed atom char.
1590 if (ch == ' ' || isAtom(ch)) {
1591 continue;
1592 }
1593 else {
1594 return true;
1595 }
1596 }
1597 return false;
1598 }
1599
1600
1601 /**
 * Tests if a string consists entirely of characters that are
 * allowed in an atom, so that the value can be used without
 * being encoded as a quoted string.
 *
 * @param s The test string.
 *
 * @return true if every character of the string is an allowed atom
 * character (blanks are not allowed); false otherwise.
1610 */
1611 public static boolean isAtom(String s)
1612 {
1613 for (int i = 0; i < s.length(); i++) {
1614 char ch = s.charAt(i);
1615 // must be an allowed atom character
1616 if (!isAtom(ch)) {
1617 return false;
1618 }
1619 }
1620 return true;
1621 }
1622
1623 /**
1624 * Apply RFC822 quoting rules to a literal string value. This
1625 * will search the string to see if there are any characters that
1626 * require special escaping, and apply the escapes. If the
1627 * string is just a string of blank-delimited atoms, the string
1628 * value is returned without quotes.
1629 *
1630 * @param s The source string.
1631 *
1632 * @return A version of the string as a valid RFC822 quoted literal.
1633 */
1634 public static String quoteString(String s) {
1635
1636 // only backslash and double quote require escaping. If the string does not
1637 // contain any of these, then we can just slap on some quotes and go.
1638 if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) {
1639 // if the string is an atom (or a series of blank-delimited atoms), we can just return it directly.
1640 if (!containsSpecials(s)) {
1641 return s;
1642 }
1643 StringBuffer buffer = new StringBuffer(s.length() + 2);
1644 buffer.append('"');
1645 buffer.append(s);
1646 buffer.append('"');
1647 return buffer.toString();
1648 }
1649
1650 // get a buffer sufficiently large for the string, two quote characters, and a "reasonable"
1651 // number of escaped values.
1652 StringBuffer buffer = new StringBuffer(s.length() + 10);
1653 buffer.append('"');
1654
1655 // now check all of the characters.
1656 for (int i = 0; i < s.length(); i++) {
1657 char ch = s.charAt(i);
1658 // character requiring escaping?
1659 if (ch == '\\' || ch == '"') {
1660 // add an extra backslash
1661 buffer.append('\\');
1662 }
1663 // and add on the character
1664 buffer.append(ch);
1665 }
1666 buffer.append('"');
1667 return buffer.toString();
1668 }
1669
1670 /**
1671 * Apply RFC822 quoting rules to a literal string value. This
1672 * will search the string to see if there are any characters that
1673 * require special escaping, and apply the escapes. The returned
1674 * value is enclosed in quotes.
1675 *
1676 * @param s The source string.
1677 *
1678 * @return A version of the string as a valid RFC822 quoted literal.
1679 */
1680 public static String formatQuotedString(String s) {
1681 // only backslash and double quote require escaping. If the string does not
1682 // contain any of these, then we can just slap on some quotes and go.
1683 if (s.indexOf('\\') == -1 && s.indexOf('"') == -1) {
1684 StringBuffer buffer = new StringBuffer(s.length() + 2);
1685 buffer.append('"');
1686 buffer.append(s);
1687 buffer.append('"');
1688 return buffer.toString();
1689 }
1690
1691 // get a buffer sufficiently large for the string, two quote characters, and a "reasonable"
1692 // number of escaped values.
1693 StringBuffer buffer = new StringBuffer(s.length() + 10);
1694 buffer.append('"');
1695
1696 // now check all of the characters.
1697 for (int i = 0; i < s.length(); i++) {
1698 char ch = s.charAt(i);
1699 // character requiring escaping?
1700 if (ch == '\\' || ch == '"') {
1701 // add an extra backslash
1702 buffer.append('\\');
1703 }
1704 // and add on the character
1705 buffer.append(ch);
1706 }
1707 buffer.append('"');
1708 return buffer.toString();
1709 }
1710
1711 public class TokenStream {
1712 // the set of tokens in the parsed address list, as determined by RFC822 syntax rules.
1713 private List tokens;
1714
1715 // the current token position
1716 int currentToken = 0;
1717
1718
1719 /**
1720 * Default constructor for a TokenStream. This creates an
1721 * empty TokenStream for purposes of tokenizing an address.
1722 * It is the creator's responsibility to terminate the stream
1723 * with a terminator token.
1724 */
1725 public TokenStream() {
1726 tokens = new ArrayList();
1727 }
1728
1729
1730 /**
1731 * Construct a TokenStream from a list of tokens. A terminator
1732 * token is added to the end.
1733 *
1734 * @param tokens An existing token list.
1735 */
1736 public TokenStream(List tokens) {
1737 this.tokens = tokens;
1738 tokens.add(new AddressToken(END_OF_TOKENS, -1));
1739 }
1740
1741 /**
1742 * Add an address token to the token list.
1743 *
1744 * @param t The new token to add to the list.
1745 */
1746 public void addToken(AddressToken token) {
1747 tokens.add(token);
1748 }
1749
1750 /**
1751 * Get the next token at the cursor position, advancing the
1752 * position accordingly.
1753 *
1754 * @return The token at the current token position.
1755 */
1756 public AddressToken nextToken() {
1757 AddressToken token = (AddressToken)tokens.get(currentToken++);
1758 // we skip over white space tokens when operating in this mode, so
1759 // check the token and iterate until we get a non-white space.
1760 while (token.type == WHITESPACE) {
1761 token = (AddressToken)tokens.get(currentToken++);
1762 }
1763 return token;
1764 }
1765
1766
1767 /**
1768 * Get the next token at the cursor position, without advancing the
1769 * position.
1770 *
1771 * @return The token at the current token position.
1772 */
1773 public AddressToken currentToken() {
1774 // return the current token and step the cursor
1775 return (AddressToken)tokens.get(currentToken);
1776 }
1777
1778
1779 /**
1780 * Get the next non-comment token from the string. Comments are ignored, except as personal information
1781 * for very simple address specifications.
1782 *
1783 * @return A token guaranteed not to be a whitespace token.
1784 */
1785 public AddressToken nextRealToken()
1786 {
1787 AddressToken token = nextToken();
1788 if (token.type == COMMENT) {
1789 token = nextToken();
1790 }
1791 return token;
1792 }
1793
1794 /**
1795 * Push a token back on to the queue, making the index of this
1796 * token the current cursor position.
1797 *
1798 * @param token The token to push.
1799 */
1800 public void pushToken(AddressToken token) {
1801 // just reset the cursor to the token's index position.
1802 currentToken = tokenIndex(token);
1803 }
1804
1805 /**
1806 * Get the next token after a given token, without advancing the
1807 * token position.
1808 *
1809 * @param token The token we're retrieving a token relative to.
1810 *
1811 * @return The next token in the list.
1812 */
1813 public AddressToken nextToken(AddressToken token) {
1814 return (AddressToken)tokens.get(tokenIndex(token) + 1);
1815 }
1816
1817
1818 /**
1819 * Return the token prior to a given token.
1820 *
1821 * @param token The token used for the index.
1822 *
1823 * @return The token prior to the index token in the list.
1824 */
1825 public AddressToken previousToken(AddressToken token) {
1826 return (AddressToken)tokens.get(tokenIndex(token) - 1);
1827 }
1828
1829
1830 /**
1831 * Retrieve a token at a given index position.
1832 *
1833 * @param index The target index.
1834 */
1835 public AddressToken getToken(int index)
1836 {
1837 return (AddressToken)tokens.get(index);
1838 }
1839
1840
1841 /**
1842 * Retrieve the index of a particular token in the stream.
1843 *
1844 * @param token The target token.
1845 *
1846 * @return The index of the token within the stream. Returns -1 if this
1847 * token is somehow not in the stream.
1848 */
1849 public int tokenIndex(AddressToken token) {
1850 return tokens.indexOf(token);
1851 }
1852
1853
1854 /**
1855 * Extract a new TokenStream running from the start token to the
1856 * token preceeding the end token.
1857 *
1858 * @param start The starting token of the section.
1859 * @param end The last token (+1) for the target section.
1860 *
1861 * @return A new TokenStream object for processing this section of tokens.
1862 */
1863 public TokenStream section(AddressToken start, AddressToken end) {
1864 int startIndex = tokenIndex(start);
1865 int endIndex = tokenIndex(end);
1866
1867 // List.subList() returns a list backed by the original list. Since we need to add a
1868 // terminator token to this list when we take the sublist, we need to manually copy the
1869 // references so we don't end up munging the original list.
1870 ArrayList list = new ArrayList(endIndex - startIndex + 2);
1871
1872 for (int i = startIndex; i <= endIndex; i++) {
1873 list.add(tokens.get(i));
1874 }
1875 return new TokenStream(list);
1876 }
1877
1878
1879 /**
1880 * Reset the token position back to the beginning of the
1881 * stream.
1882 */
1883 public void reset() {
1884 currentToken = 0;
1885 }
1886
1887 /**
1888 * Scan forward looking for a non-blank token.
1889 *
1890 * @return The first non-blank token in the stream.
1891 */
1892 public AddressToken getNonBlank()
1893 {
1894 AddressToken token = currentToken();
1895 while (token.type == WHITESPACE) {
1896 currentToken++;
1897 token = currentToken();
1898 }
1899 return token;
1900 }
1901
1902
1903 /**
1904 * Extract a blank delimited token from a TokenStream. A blank
1905 * delimited token is the set of tokens up to the next real whitespace
1906 * token (comments not included).
1907 *
1908 * @return A TokenStream object with the new set of tokens.
1909 */
1910 public TokenStream getBlankDelimitedToken()
1911 {
1912 // get the next non-whitespace token.
1913 AddressToken first = getNonBlank();
1914 // if this is the end, we return null.
1915 if (first.type == END_OF_TOKENS) {
1916 return null;
1917 }
1918
1919 AddressToken last = first;
1920
1921 // the methods for retrieving tokens skip over whitespace, so we're going to process this
1922 // by index.
1923 currentToken++;
1924
1925 AddressToken token = currentToken();
1926 while (true) {
1927 // if this is our marker, then pluck out the section and return it.
1928 if (token.type == END_OF_TOKENS || token.type == WHITESPACE) {
1929 return section(first, last);
1930 }
1931 last = token;
1932 currentToken++;
1933 // we accept any and all tokens here.
1934 token = currentToken();
1935 }
1936 }
1937
1938 /**
1939 * Return the index of the current cursor position.
1940 *
1941 * @return The integer index of the current token.
1942 */
1943 public int currentIndex() {
1944 return currentToken;
1945 }
1946
1947 public void dumpTokens()
1948 {
1949 System.out.println(">>>>>>>>> Start dumping TokenStream tokens");
1950 for (int i = 0; i < tokens.size(); i++) {
1951 System.out.println("-------- Token: " + tokens.get(i));
1952 }
1953
1954 System.out.println("++++++++ cursor position=" + currentToken);
1955 System.out.println(">>>>>>>>> End dumping TokenStream tokens");
1956 }
1957 }
1958
1959
1960 /**
1961 * Simple utility class for representing address tokens.
1962 */
1963 public class AddressToken {
1964
1965 // the token type
1966 int type;
1967
1968 // string value of the token (can be null)
1969 String value;
1970
1971 // position of the token within the address string.
1972 int position;
1973
1974 AddressToken(int type, int position)
1975 {
1976 this.type = type;
1977 this.value = null;
1978 this.position = position;
1979 }
1980
1981 AddressToken(String value, int type, int position)
1982 {
1983 this.type = type;
1984 this.value = value;
1985 this.position = position;
1986 }
1987
1988 public String toString()
1989 {
1990 if (type == END_OF_TOKENS) {
1991 return "AddressToken: type=END_OF_TOKENS";
1992 }
1993 if (value == null) {
1994 return "AddressToken: type=" + (char)type;
1995 }
1996 else {
1997 return "AddressToken: type=" + (char)type + " value=" + value;
1998 }
1999 }
2000 }
2001 }
2002