001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 /* 019 * This code has been borrowed from the Apache Xerces project. We're copying the code to 020 * keep from adding a dependency on Xerces in the Geronimo kernel. 021 */ 022 023 package org.apache.geronimo.system.configuration; 024 025 import org.w3c.dom.Document; 026 import org.w3c.dom.DocumentType; 027 import org.w3c.dom.Node; 028 import org.w3c.dom.html.HTMLDocument; 029 030 031 /** 032 * Specifies an output format to control the serializer. Based on the 033 * XSLT specification for output format, plus additional parameters. 034 * Used to select the suitable serializer and determine how the 035 * document should be formatted on output. 036 * <p> 037 * The two interesting constructors are: 038 * <ul> 039 * <li>{@link #OutputFormat(String,String,boolean)} creates a format 040 * for the specified method (XML, HTML, Text, etc), encoding and indentation 041 * <li>{@link #OutputFormat(Document,String,boolean)} creates a format 042 * compatible with the document type (XML, HTML, Text, etc), encoding and 043 * indentation 044 * </ul> 045 * 046 * 047 * @version $Revision: 476049 $ $Date: 2006-11-16 23:35:17 -0500 (Thu, 16 Nov 2006) $ 048 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a> 049 * <a href="mailto:visco@intalio.com">Keith Visco</a> 050 * @see Serializer 051 * @see Method 052 */ 053 public class OutputFormat 054 { 055 056 057 public static class DTD 058 { 059 060 /** 061 * Public identifier for HTML document type. 062 */ 063 public static final String HTMLPublicId = "-//W3C//DTD HTML 4.0//EN"; 064 065 /** 066 * System identifier for HTML document type. 067 */ 068 public static final String HTMLSystemId = 069 "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd"; 070 071 /** 072 * Public identifier for XHTML document type. 073 */ 074 public static final String XHTMLPublicId = 075 "-//W3C//DTD XHTML 1.0 Strict//EN"; 076 077 /** 078 * System identifier for XHTML document type. 079 */ 080 public static final String XHTMLSystemId = 081 "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd"; 082 083 } 084 085 086 public static class Defaults 087 { 088 089 /** 090 * If indentation is turned on, the default identation 091 * level is 4. 092 * 093 * @see #setIndenting(boolean) 094 */ 095 public static final int Indent = 4; 096 097 /** 098 * The default encoding for Web documents it UTF-8. 099 * 100 * @see #getEncoding() 101 */ 102 public static final String Encoding = "UTF-8"; 103 104 /** 105 * The default line width at which to break long lines 106 * when identing. This is set to 72. 107 */ 108 public static final int LineWidth = 72; 109 110 } 111 112 113 /** 114 * Holds the output method specified for this document, 115 * or null if no method was specified. 116 */ 117 private String method; 118 119 120 /** 121 * Specifies the version of the output method. 122 */ 123 private String version; 124 125 126 /** 127 * The indentation level, or zero if no indentation 128 * was requested. 129 */ 130 private int indent = 0; 131 132 133 /** 134 * The encoding to use, if an input stream is used. 135 * The default is always UTF-8. 136 */ 137 private String encoding = Defaults.Encoding; 138 139 /** 140 * The EncodingInfo instance for _encoding. 141 */ 142 private EncodingInfo encodingInfo = null; 143 144 /** 145 * The specified media type or null. 146 */ 147 private String mediaType; 148 149 150 /** 151 * The specified document type system identifier, or null. 152 */ 153 private String doctypeSystem; 154 155 156 /** 157 * The specified document type public identifier, or null. 158 */ 159 private String doctypePublic; 160 161 162 /** 163 * Ture if the XML declaration should be ommited; 164 */ 165 private boolean omitXmlDeclaration = false; 166 167 168 /** 169 * Ture if the DOCTYPE declaration should be ommited; 170 */ 171 private boolean omitDoctype = false; 172 173 174 /** 175 * Ture if comments should be ommited; 176 */ 177 private boolean omitComments = false; 178 179 180 /** 181 * True if the document type should be marked as standalone. 182 */ 183 private boolean standalone = false; 184 185 186 /** 187 * List of element tag names whose text node children must 188 * be output as CDATA. 189 */ 190 private String[] cdataElements; 191 192 193 /** 194 * List of element tag names whose text node children must 195 * be output unescaped. 196 */ 197 private String[] nonEscapingElements; 198 199 200 /** 201 * The selected line separator. 202 */ 203 private String lineSeparator = "\n"; 204 205 206 /** 207 * The line width at which to wrap long lines when indenting. 208 */ 209 private int _lineWidth = Defaults.LineWidth; 210 211 212 /** 213 * True if spaces should be preserved in elements that do not 214 * specify otherwise, or specify the default behavior. 215 */ 216 private boolean preserve = false; 217 218 /** If true, an empty string valued attribute is output as "". If false and 219 * and we are using the HTMLSerializer, then only the attribute name is 220 * serialized. Defaults to false for backwards compatibility. 221 */ 222 private boolean preserveEmptyAttributes = false; 223 224 /** 225 * Constructs a new output format with the default values. 226 */ 227 public OutputFormat() 228 { 229 } 230 231 232 /** 233 * Constructs a new output format with the default values for 234 * the specified method and encoding. If <tt>indent</tt> 235 * is true, the document will be pretty printed with the default 236 * indentation level and default line wrapping. 237 * 238 * @param method The specified output method 239 * @param encoding The specified encoding 240 * @param indenting True for pretty printing 241 * @see #setEncoding 242 * @see #setIndenting 243 * @see #setMethod 244 */ 245 public OutputFormat( String method, String encoding, boolean indenting ) 246 { 247 setMethod( method ); 248 setEncoding( encoding ); 249 setIndenting( indenting ); 250 } 251 252 253 /** 254 * Constructs a new output format with the proper method, 255 * document type identifiers and media type for the specified 256 * document. 257 * 258 * @param doc The document to output 259 * @see #whichMethod 260 */ 261 public OutputFormat( Document doc ) 262 { 263 setMethod( whichMethod( doc ) ); 264 setDoctype( whichDoctypePublic( doc ), whichDoctypeSystem( doc ) ); 265 setMediaType( whichMediaType( getMethod() ) ); 266 } 267 268 269 /** 270 * Constructs a new output format with the proper method, 271 * document type identifiers and media type for the specified 272 * document, and with the specified encoding. If <tt>indent</tt> 273 * is true, the document will be pretty printed with the default 274 * indentation level and default line wrapping. 275 * 276 * @param doc The document to output 277 * @param encoding The specified encoding 278 * @param indenting True for pretty printing 279 * @see #setEncoding 280 * @see #setIndenting 281 * @see #whichMethod 282 */ 283 public OutputFormat( Document doc, String encoding, boolean indenting ) 284 { 285 this( doc ); 286 setEncoding( encoding ); 287 setIndenting( indenting ); 288 } 289 290 291 /** 292 * Returns the method specified for this output format. 293 * Typically the method will be <tt>xml</tt>, <tt>html</tt> 294 * or <tt>text</tt>, but it might be other values. 295 * If no method was specified, null will be returned 296 * and the most suitable method will be determined for 297 * the document by calling {@link #whichMethod}. 298 * 299 * @return The specified output method, or null 300 */ 301 public String getMethod() 302 { 303 return method; 304 } 305 306 307 /** 308 * Sets the method for this output format. 309 * 310 * @see #getMethod 311 * @param method The output method, or null 312 */ 313 public void setMethod( String method ) 314 { 315 this.method = method; 316 } 317 318 319 /** 320 * Returns the version for this output method. 321 * If no version was specified, will return null 322 * and the default version number will be used. 323 * If the serializerr does not support that particular 324 * version, it should default to a supported version. 325 * 326 * @return The specified method version, or null 327 */ 328 public String getVersion() 329 { 330 return version; 331 } 332 333 334 /** 335 * Sets the version for this output method. 336 * For XML the value would be "1.0", for HTML 337 * it would be "4.0". 338 * 339 * @see #getVersion 340 * @param version The output method version, or null 341 */ 342 public void setVersion( String version ) 343 { 344 this.version = version; 345 } 346 347 348 /** 349 * Returns the indentation specified. If no indentation 350 * was specified, zero is returned and the document 351 * should not be indented. 352 * 353 * @return The indentation or zero 354 * @see #setIndenting 355 */ 356 public int getIndent() 357 { 358 return indent; 359 } 360 361 362 /** 363 * Returns true if indentation was specified. 364 */ 365 public boolean getIndenting() 366 { 367 return ( indent > 0 ); 368 } 369 370 371 /** 372 * Sets the indentation. The document will not be 373 * indented if the indentation is set to zero. 374 * Calling {@link #setIndenting} will reset this 375 * value to zero (off) or the default (on). 376 * 377 * @param indent The indentation, or zero 378 */ 379 public void setIndent( int indent ) 380 { 381 if ( indent < 0 ) 382 this.indent = 0; 383 else 384 this.indent = indent; 385 } 386 387 388 /** 389 * Sets the indentation on and off. When set on, the default 390 * indentation level and default line wrapping is used 391 * (see {@link #DEFAULT_INDENT} and {@link #DEFAULT_LINE_WIDTH}). 392 * To specify a different indentation level or line wrapping, 393 * use {@link #setIndent} and {@link #setLineWidth}. 394 * 395 * @param on True if indentation should be on 396 */ 397 public void setIndenting( boolean on ) 398 { 399 if ( on ) { 400 indent = Defaults.Indent; 401 _lineWidth = Defaults.LineWidth; 402 } else { 403 indent = 0; 404 _lineWidth = 0; 405 } 406 } 407 408 409 /** 410 * Returns the specified encoding. If no encoding was 411 * specified, the default is always "UTF-8". 412 * 413 * @return The encoding 414 */ 415 public String getEncoding() 416 { 417 return encoding; 418 } 419 420 421 /** 422 * Sets the encoding for this output method. If no 423 * encoding was specified, the default is always "UTF-8". 424 * Make sure the encoding is compatible with the one 425 * used by the {@link java.io.Writer}. 426 * 427 * @see #getEncoding 428 * @param encoding The encoding, or null 429 */ 430 public void setEncoding( String encoding ) 431 { 432 this.encoding = encoding; 433 encodingInfo = null; 434 } 435 436 /** 437 * Sets the encoding for this output method with an <code>EncodingInfo</code> 438 * instance. 439 */ 440 public void setEncoding(EncodingInfo encInfo) { 441 encoding = encInfo.getName(); 442 encodingInfo = encInfo; 443 } 444 445 /** 446 * Returns an <code>EncodingInfo<code> instance for the encoding. 447 * 448 * @see setEncoding 449 */ 450 public EncodingInfo getEncodingInfo() { 451 if (encodingInfo == null) 452 encodingInfo = Encodings.getEncodingInfo(encoding); 453 return encodingInfo; 454 } 455 456 /** 457 * Returns the specified media type, or null. 458 * To determine the media type based on the 459 * document type, use {@link #whichMediaType}. 460 * 461 * @return The specified media type, or null 462 */ 463 public String getMediaType() 464 { 465 return mediaType; 466 } 467 468 469 /** 470 * Sets the media type. 471 * 472 * @see #getMediaType 473 * @param mediaType The specified media type 474 */ 475 public void setMediaType( String mediaType ) 476 { 477 this.mediaType = mediaType; 478 } 479 480 481 /** 482 * Sets the document type public and system identifiers. 483 * Required only if the DOM Document or SAX events do not 484 * specify the document type, and one must be present in 485 * the serialized document. Any document type specified 486 * by the DOM Document or SAX events will override these 487 * values. 488 * 489 * @param publicId The public identifier, or null 490 * @param systemId The system identifier, or null 491 */ 492 public void setDoctype( String publicId, String systemId ) 493 { 494 doctypePublic = publicId; 495 doctypeSystem = systemId; 496 } 497 498 499 /** 500 * Returns the specified document type public identifier, 501 * or null. 502 */ 503 public String getDoctypePublic() 504 { 505 return doctypePublic; 506 } 507 508 509 /** 510 * Returns the specified document type system identifier, 511 * or null. 512 */ 513 public String getDoctypeSystem() 514 { 515 return doctypeSystem; 516 } 517 518 519 /** 520 * Returns true if comments should be ommited. 521 * The default is false. 522 */ 523 public boolean getOmitComments() 524 { 525 return omitComments; 526 } 527 528 529 /** 530 * Sets comment omitting on and off. 531 * 532 * @param omit True if comments should be ommited 533 */ 534 public void setOmitComments( boolean omit ) 535 { 536 omitComments = omit; 537 } 538 539 540 /** 541 * Returns true if the DOCTYPE declaration should 542 * be ommited. The default is false. 543 */ 544 public boolean getOmitDocumentType() 545 { 546 return omitDoctype; 547 } 548 549 550 /** 551 * Sets DOCTYPE declaration omitting on and off. 552 * 553 * @param omit True if DOCTYPE declaration should be ommited 554 */ 555 public void setOmitDocumentType( boolean omit ) 556 { 557 omitDoctype = omit; 558 } 559 560 561 /** 562 * Returns true if the XML document declaration should 563 * be ommited. The default is false. 564 */ 565 public boolean getOmitXMLDeclaration() 566 { 567 return omitXmlDeclaration; 568 } 569 570 571 /** 572 * Sets XML declaration omitting on and off. 573 * 574 * @param omit True if XML declaration should be ommited 575 */ 576 public void setOmitXMLDeclaration( boolean omit ) 577 { 578 omitXmlDeclaration = omit; 579 } 580 581 582 /** 583 * Returns true if the document type is standalone. 584 * The default is false. 585 */ 586 public boolean getStandalone() 587 { 588 return standalone; 589 } 590 591 592 /** 593 * Sets document DTD standalone. The public and system 594 * identifiers must be null for the document to be 595 * serialized as standalone. 596 * 597 * @param standalone True if document DTD is standalone 598 */ 599 public void setStandalone( boolean standalone ) 600 { 601 this.standalone = standalone; 602 } 603 604 605 /** 606 * Returns a list of all the elements whose text node children 607 * should be output as CDATA, or null if no such elements were 608 * specified. 609 */ 610 public String[] getCDataElements() 611 { 612 return cdataElements; 613 } 614 615 616 /** 617 * Returns true if the text node children of the given elements 618 * should be output as CDATA. 619 * 620 * @param tagName The element's tag name 621 * @return True if should serialize as CDATA 622 */ 623 public boolean isCDataElement( String tagName ) 624 { 625 int i; 626 627 if ( cdataElements == null ) 628 return false; 629 for ( i = 0 ; i < cdataElements.length ; ++i ) 630 if ( cdataElements[ i ].equals( tagName ) ) 631 return true; 632 return false; 633 } 634 635 636 /** 637 * Sets the list of elements for which text node children 638 * should be output as CDATA. 639 * 640 * @param cdataElements List of CDATA element tag names 641 */ 642 public void setCDataElements( String[] cdataElements ) 643 { 644 this.cdataElements = cdataElements; 645 } 646 647 648 /** 649 * Returns a list of all the elements whose text node children 650 * should be output unescaped (no character references), or null 651 * if no such elements were specified. 652 */ 653 public String[] getNonEscapingElements() 654 { 655 return nonEscapingElements; 656 } 657 658 659 /** 660 * Returns true if the text node children of the given elements 661 * should be output unescaped. 662 * 663 * @param tagName The element's tag name 664 * @return True if should serialize unescaped 665 */ 666 public boolean isNonEscapingElement( String tagName ) 667 { 668 int i; 669 670 if ( nonEscapingElements == null ) 671 return false; 672 for ( i = 0 ; i < nonEscapingElements.length ; ++i ) 673 if ( nonEscapingElements[ i ].equals( tagName ) ) 674 return true; 675 return false; 676 } 677 678 679 /** 680 * Sets the list of elements for which text node children 681 * should be output unescaped (no character references). 682 * 683 * @param nonEscapingElements List of unescaped element tag names 684 */ 685 public void setNonEscapingElements( String[] nonEscapingElements ) 686 { 687 this.nonEscapingElements = nonEscapingElements; 688 } 689 690 691 692 /** 693 * Returns a specific line separator to use. The default is the 694 * Web line separator (<tt>\n</tt>). A string is returned to 695 * support double codes (CR + LF). 696 * 697 * @return The specified line separator 698 */ 699 public String getLineSeparator() 700 { 701 return lineSeparator; 702 } 703 704 705 /** 706 * Sets the line separator. The default is the Web line separator 707 * (<tt>\n</tt>). The machine's line separator can be obtained 708 * from the system property <tt>line.separator</tt>, but is only 709 * useful if the document is edited on machines of the same type. 710 * For general documents, use the Web line separator. 711 * 712 * @param lineSeparator The specified line separator 713 */ 714 public void setLineSeparator( String lineSeparator ) 715 { 716 if ( lineSeparator == null ) 717 this.lineSeparator = "\n"; 718 else 719 this.lineSeparator = lineSeparator; 720 } 721 722 723 /** 724 * Returns true if the default behavior for this format is to 725 * preserve spaces. All elements that do not specify otherwise 726 * or specify the default behavior will be formatted based on 727 * this rule. All elements that specify space preserving will 728 * always preserve space. 729 */ 730 public boolean getPreserveSpace() 731 { 732 return preserve; 733 } 734 735 736 /** 737 * Sets space preserving as the default behavior. The default is 738 * space stripping and all elements that do not specify otherwise 739 * or use the default value will not preserve spaces. 740 * 741 * @param preserve True if spaces should be preserved 742 */ 743 public void setPreserveSpace( boolean preserve ) 744 { 745 this.preserve = preserve; 746 } 747 748 749 /** 750 * Return the selected line width for breaking up long lines. 751 * When indenting, and only when indenting, long lines will be 752 * broken at space boundaries based on this line width. 753 * No line wrapping occurs if this value is zero. 754 */ 755 public int getLineWidth() 756 { 757 return _lineWidth; 758 } 759 760 761 /** 762 * Sets the line width. If zero then no line wrapping will 763 * occur. Calling {@link #setIndenting} will reset this 764 * value to zero (off) or the default (on). 765 * 766 * @param lineWidth The line width to use, zero for default 767 * @see #getLineWidth 768 * @see #setIndenting 769 */ 770 public void setLineWidth( int lineWidth ) 771 { 772 if ( lineWidth <= 0 ) 773 _lineWidth = 0; 774 else 775 _lineWidth = lineWidth; 776 } 777 778 /** 779 * Returns the preserveEmptyAttribute flag. If flag is false, then' 780 * attributes with empty string values are output as the attribute 781 * name only (in HTML mode). 782 * @return preserve the preserve flag 783 */ 784 public boolean getPreserveEmptyAttributes () { 785 return preserveEmptyAttributes; 786 } 787 /** 788 * Sets the preserveEmptyAttribute flag. If flag is false, then' 789 * attributes with empty string values are output as the attribute 790 * name only (in HTML mode). 791 * @param preserve the preserve flag 792 */ 793 public void setPreserveEmptyAttributes (boolean preserve) { 794 preserveEmptyAttributes = preserve; 795 } 796 797 /** 798 * Returns the last printable character based on the selected 799 * encoding. Control characters and non-printable characters 800 * are always printed as character references. 801 */ 802 public char getLastPrintable() 803 { 804 if ( getEncoding() != null && 805 ( getEncoding().equalsIgnoreCase( "ASCII" ) ) ) 806 return 0xFF; 807 else 808 return 0xFFFF; 809 } 810 811 812 /** 813 * Determine the output method for the specified document. 814 * If the document is an instance of {@link org.w3c.dom.html.HTMLDocument} 815 * then the method is said to be <tt>html</tt>. If the root 816 * element is 'html' and all text nodes preceding the root 817 * element are all whitespace, then the method is said to be 818 * <tt>html</tt>. Otherwise the method is <tt>xml</tt>. 819 * 820 * @param doc The document to check 821 * @return The suitable method 822 */ 823 public static String whichMethod( Document doc ) 824 { 825 Node node; 826 String value; 827 int i; 828 829 // If document is derived from HTMLDocument then the default 830 // method is html. 831 if ( doc instanceof HTMLDocument ) 832 return Method.HTML; 833 834 // Lookup the root element and the text nodes preceding it. 835 // If root element is html and all text nodes contain whitespace 836 // only, the method is html. 837 838 // FIXME (SM) should we care about namespaces here? 839 840 node = doc.getFirstChild(); 841 while (node != null) { 842 // If the root element is html, the method is html. 843 if ( node.getNodeType() == Node.ELEMENT_NODE ) { 844 if ( node.getNodeName().equalsIgnoreCase( "html" ) ) { 845 return Method.HTML; 846 } else if ( node.getNodeName().equalsIgnoreCase( "root" ) ) { 847 return Method.FOP; 848 } else { 849 return Method.XML; 850 } 851 } else if ( node.getNodeType() == Node.TEXT_NODE ) { 852 // If a text node preceding the root element contains 853 // only whitespace, this might be html, otherwise it's 854 // definitely xml. 855 value = node.getNodeValue(); 856 for ( i = 0 ; i < value.length() ; ++i ) 857 if ( value.charAt( i ) != 0x20 && value.charAt( i ) != 0x0A && 858 value.charAt( i ) != 0x09 && value.charAt( i ) != 0x0D ) 859 return Method.XML; 860 } 861 node = node.getNextSibling(); 862 } 863 // Anything else, the method is xml. 864 return Method.XML; 865 } 866 867 868 /** 869 * Returns the document type public identifier 870 * specified for this document, or null. 871 */ 872 public static String whichDoctypePublic( Document doc ) 873 { 874 DocumentType doctype; 875 876 /* DOM Level 2 was introduced into the code base*/ 877 doctype = doc.getDoctype(); 878 if ( doctype != null ) { 879 // Note on catch: DOM Level 1 does not specify this method 880 // and the code will throw a NoSuchMethodError 881 try { 882 return doctype.getPublicId(); 883 } catch ( Error except ) { } 884 } 885 886 if ( doc instanceof HTMLDocument ) 887 return DTD.XHTMLPublicId; 888 return null; 889 } 890 891 892 /** 893 * Returns the document type system identifier 894 * specified for this document, or null. 895 */ 896 public static String whichDoctypeSystem( Document doc ) 897 { 898 DocumentType doctype; 899 900 /* DOM Level 2 was introduced into the code base*/ 901 doctype = doc.getDoctype(); 902 if ( doctype != null ) { 903 // Note on catch: DOM Level 1 does not specify this method 904 // and the code will throw a NoSuchMethodError 905 try { 906 return doctype.getSystemId(); 907 } catch ( Error except ) { } 908 } 909 910 if ( doc instanceof HTMLDocument ) 911 return DTD.XHTMLSystemId; 912 return null; 913 } 914 915 916 /** 917 * Returns the suitable media format for a document 918 * output with the specified method. 919 */ 920 public static String whichMediaType( String method ) 921 { 922 if ( method.equalsIgnoreCase( Method.XML ) ) 923 return "text/xml"; 924 if ( method.equalsIgnoreCase( Method.HTML ) ) 925 return "text/html"; 926 if ( method.equalsIgnoreCase( Method.XHTML ) ) 927 return "text/html"; 928 if ( method.equalsIgnoreCase( Method.TEXT ) ) 929 return "text/plain"; 930 if ( method.equalsIgnoreCase( Method.FOP ) ) 931 return "application/pdf"; 932 return null; 933 } 934 935 936 } 937