View Javadoc

1   /**
2    *
3    * Copyright 2006 The Apache Software Foundation
4    *
5    *  Licensed under the Apache License, Version 2.0 (the "License");
6    *  you may not use this file except in compliance with the License.
7    *  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  
18  /*
19   * This code has been borrowed from the Apache Xerces project. We're copying the code to
20   * keep from adding a dependency on Xerces in the Geronimo kernel.
21   */
22  
23  package org.apache.geronimo.system.configuration;
24  
25  import org.w3c.dom.Document;
26  import org.w3c.dom.DocumentType;
27  import org.w3c.dom.Node;
28  import org.w3c.dom.html.HTMLDocument;
29  
30  
31  /**
32   * Specifies an output format to control the serializer. Based on the
33   * XSLT specification for output format, plus additional parameters.
34   * Used to select the suitable serializer and determine how the
35   * document should be formatted on output.
36   * <p>
37   * The two interesting constructors are:
38   * <ul>
39   * <li>{@link #OutputFormat(String,String,boolean)} creates a format
40   *  for the specified method (XML, HTML, Text, etc), encoding and indentation
41   * <li>{@link #OutputFormat(Document,String,boolean)} creates a format
42   *  compatible with the document type (XML, HTML, Text, etc), encoding and
43   *  indentation
44   * </ul>
45   *
46   *
47   * @version $Revision: 410741 $ $Date: 2006-05-31 21:35:48 -0700 (Wed, 31 May 2006) $
48   * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
49   *         <a href="mailto:visco@intalio.com">Keith Visco</a>
50   * @see Serializer
51   * @see Method
52   */
53  public class OutputFormat
54  {
55  
56  
57      public static class DTD
58      {
59  
60          /**
61           * Public identifier for HTML document type.
62           */
63          public static final String HTMLPublicId = "-//W3C//DTD HTML 4.0//EN";
64  
65          /**
66           * System identifier for HTML document type.
67           */
68          public static final String HTMLSystemId =
69              "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd";
70  
71          /**
72           * Public identifier for XHTML document type.
73           */
74          public static final String XHTMLPublicId =
75              "-//W3C//DTD XHTML 1.0 Strict//EN";
76  
77          /**
78           * System identifier for XHTML document type.
79           */
80          public static final String XHTMLSystemId =
81              "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd";
82  
83      }
84  
85  
86      public static class Defaults
87      {
88  
89          /**
90           * If indentation is turned on, the default identation
91           * level is 4.
92           *
93           * @see #setIndenting(boolean)
94           */
95          public static final int Indent = 4;
96  
97          /**
98           * The default encoding for Web documents it UTF-8.
99           *
100          * @see #getEncoding()
101          */
102         public static final String Encoding = "UTF-8";
103 
104         /**
105          * The default line width at which to break long lines
106          * when identing. This is set to 72.
107          */
108         public static final int LineWidth = 72;
109 
110     }
111 
112 
113     /**
114      * Holds the output method specified for this document,
115      * or null if no method was specified.
116      */
117     private String method;
118 
119 
120     /**
121      * Specifies the version of the output method.
122      */
123     private String version;
124 
125 
126     /**
127      * The indentation level, or zero if no indentation
128      * was requested.
129      */
130     private int indent = 0;
131 
132 
133     /**
134      * The encoding to use, if an input stream is used.
135      * The default is always UTF-8.
136      */
137     private String encoding = Defaults.Encoding;
138 
139     /**
140      * The EncodingInfo instance for _encoding.
141      */
142     private EncodingInfo encodingInfo = null;
143 
144     /**
145      * The specified media type or null.
146      */
147     private String mediaType;
148 
149 
150     /**
151      * The specified document type system identifier, or null.
152      */
153     private String doctypeSystem;
154 
155 
156     /**
157      * The specified document type public identifier, or null.
158      */
159     private String doctypePublic;
160 
161 
162     /**
163      * True if the XML declaration should be omitted.
164      */
165     private boolean omitXmlDeclaration = false;
166 
167 
168     /**
169      * True if the DOCTYPE declaration should be omitted.
170      */
171     private boolean omitDoctype = false;
172 
173 
174     /**
175      * True if comments should be omitted.
176      */
177     private boolean omitComments = false;
178 
179 
180     /**
181      * True if the document type should be marked as standalone.
182      */
183     private boolean standalone = false;
184 
185 
186     /**
187      * List of element tag names whose text node children must
188      * be output as CDATA.
189      */
190     private String[] cdataElements;
191 
192 
193     /**
194      * List of element tag names whose text node children must
195      * be output unescaped.
196      */
197     private String[] nonEscapingElements;
198 
199 
200     /**
201      * The selected line separator.
202      */
203     private String lineSeparator = "\n";
204 
205 
206     /**
207      * The line width at which to wrap long lines when indenting.
208      */
209     private int _lineWidth = Defaults.LineWidth;
210 
211 
212     /**
213      * True if spaces should be preserved in elements that do not
214      * specify otherwise, or specify the default behavior.
215      */
216     private boolean preserve = false;
217     
218     /** If true, an empty string valued attribute is output as "". If false
219      * and we are using the HTMLSerializer, then only the attribute name is
220      * serialized. Defaults to false for backwards compatibility.
221      */
222     private boolean preserveEmptyAttributes = false;
223 
224     /**
225      * Constructs a new output format with the default values.
226      */
227     public OutputFormat()
228     {
229     }
230 
231 
232     /**
233      * Constructs a new output format with the default values for
234      * the specified method and encoding. If <tt>indent</tt>
235      * is true, the document will be pretty printed with the default
236      * indentation level and default line wrapping.
237      *
238      * @param method The specified output method
239      * @param encoding The specified encoding
240      * @param indenting True for pretty printing
241      * @see #setEncoding
242      * @see #setIndenting
243      * @see #setMethod
244      */
245     public OutputFormat( String method, String encoding, boolean indenting )
246     {
247         setMethod( method );
248         setEncoding( encoding );
249         setIndenting( indenting );
250     }
251 
252 
253     /**
254      * Constructs a new output format with the proper method,
255      * document type identifiers and media type for the specified
256      * document.
257      *
258      * @param doc The document to output
259      * @see #whichMethod
260      */
261     public OutputFormat( Document doc )
262     {
263         setMethod( whichMethod( doc ) );
264         setDoctype( whichDoctypePublic( doc ), whichDoctypeSystem( doc ) );
265         setMediaType( whichMediaType( getMethod() ) );
266     }
267 
268 
269     /**
270      * Constructs a new output format with the proper method,
271      * document type identifiers and media type for the specified
272      * document, and with the specified encoding. If <tt>indent</tt>
273      * is true, the document will be pretty printed with the default
274      * indentation level and default line wrapping.
275      *
276      * @param doc The document to output
277      * @param encoding The specified encoding
278      * @param indenting True for pretty printing
279      * @see #setEncoding
280      * @see #setIndenting
281      * @see #whichMethod
282      */
283     public OutputFormat( Document doc, String encoding, boolean indenting )
284     {
285         this( doc );
286         setEncoding( encoding );
287         setIndenting( indenting );
288     }
289 
290 
291     /**
292      * Returns the method specified for this output format.
293      * Typically the method will be <tt>xml</tt>, <tt>html</tt>
294      * or <tt>text</tt>, but it might be other values.
295      * If no method was specified, null will be returned
296      * and the most suitable method will be determined for
297      * the document by calling {@link #whichMethod}.
298      *
299      * @return The specified output method, or null
300      */
301     public String getMethod()
302     {
303         return method;
304     }
305 
306 
307     /**
308      * Sets the method for this output format.
309      *
310      * @see #getMethod
311      * @param method The output method, or null
312      */
313     public void setMethod( String method )
314     {
315         this.method = method;
316     }
317 
318 
319     /**
320      * Returns the version for this output method.
321      * If no version was specified, will return null
322      * and the default version number will be used.
323      * If the serializerr does not support that particular
324      * version, it should default to a supported version.
325      *
326      * @return The specified method version, or null
327      */
328     public String getVersion()
329     {
330         return version;
331     }
332 
333 
334     /**
335      * Sets the version for this output method.
336      * For XML the value would be "1.0", for HTML
337      * it would be "4.0".
338      *
339      * @see #getVersion
340      * @param version The output method version, or null
341      */
342     public void setVersion( String version )
343     {
344         this.version = version;
345     }
346 
347 
348     /**
349      * Returns the indentation specified. If no indentation
350      * was specified, zero is returned and the document
351      * should not be indented.
352      *
353      * @return The indentation or zero
354      * @see #setIndenting
355      */
356     public int getIndent()
357     {
358         return indent;
359     }
360 
361 
362     /**
363      * Returns true if indentation was specified.
364      */
365     public boolean getIndenting()
366     {
367         return ( indent > 0 );
368     }
369 
370 
371     /**
372      * Sets the indentation. The document will not be
373      * indented if the indentation is set to zero.
374      * Calling {@link #setIndenting} will reset this
375      * value to zero (off) or the default (on).
376      *
377      * @param indent The indentation, or zero
378      */
379     public void setIndent( int indent )
380     {
381         if ( indent < 0 )
382             this.indent = 0;
383         else
384             this.indent = indent;
385     }
386 
387 
388     /**
389      * Sets the indentation on and off. When set on, the default
390      * indentation level and default line wrapping is used
391      * (see {@link #DEFAULT_INDENT} and {@link #DEFAULT_LINE_WIDTH}).
392      * To specify a different indentation level or line wrapping,
393      * use {@link #setIndent} and {@link #setLineWidth}.
394      *
395      * @param on True if indentation should be on
396      */
397     public void setIndenting( boolean on )
398     {
399         if ( on ) {
400             indent = Defaults.Indent;
401             _lineWidth = Defaults.LineWidth;
402         } else {
403             indent = 0;
404             _lineWidth = 0;
405         }
406     }
407 
408 
409     /**
410      * Returns the specified encoding. If no encoding was
411      * specified, the default is always "UTF-8".
412      *
413      * @return The encoding
414      */
415     public String getEncoding()
416     {
417         return encoding;
418     }
419 
420 
421     /**
422      * Sets the encoding for this output method. If no
423      * encoding was specified, the default is always "UTF-8".
424      * Make sure the encoding is compatible with the one
425      * used by the {@link java.io.Writer}.
426      *
427      * @see #getEncoding
428      * @param encoding The encoding, or null
429      */
430     public void setEncoding( String encoding )
431     {
432         this.encoding = encoding;
433         encodingInfo = null;
434     }
435 
436     /**
437      * Sets the encoding for this output method with an <code>EncodingInfo</code>
438      * instance.
439      */
440     public void setEncoding(EncodingInfo encInfo) {
441         encoding = encInfo.getName();
442         encodingInfo = encInfo;
443     }
444 
445     /**
446      * Returns an <code>EncodingInfo<code> instance for the encoding.
447      *
448      * @see setEncoding
449      */
450     public EncodingInfo getEncodingInfo() {
451         if (encodingInfo == null)
452             encodingInfo = Encodings.getEncodingInfo(encoding);
453         return encodingInfo;
454     }
455 
456     /**
457      * Returns the specified media type, or null.
458      * To determine the media type based on the
459      * document type, use {@link #whichMediaType}.
460      *
461      * @return The specified media type, or null
462      */
463     public String getMediaType()
464     {
465         return mediaType;
466     }
467 
468 
469     /**
470      * Sets the media type.
471      *
472      * @see #getMediaType
473      * @param mediaType The specified media type
474      */
475     public void setMediaType( String mediaType )
476     {
477         this.mediaType = mediaType;
478     }
479 
480 
481     /**
482      * Sets the document type public and system identifiers.
483      * Required only if the DOM Document or SAX events do not
484      * specify the document type, and one must be present in
485      * the serialized document. Any document type specified
486      * by the DOM Document or SAX events will override these
487      * values.
488      *
489      * @param publicId The public identifier, or null
490      * @param systemId The system identifier, or null
491      */
492     public void setDoctype( String publicId, String systemId )
493     {
494         doctypePublic = publicId;
495         doctypeSystem = systemId;
496     }
497 
498 
499     /**
500      * Returns the specified document type public identifier,
501      * or null.
502      */
503     public String getDoctypePublic()
504     {
505         return doctypePublic;
506     }
507 
508 
509     /**
510      * Returns the specified document type system identifier,
511      * or null.
512      */
513     public String getDoctypeSystem()
514     {
515         return doctypeSystem;
516     }
517 
518 
519     /**
520      * Returns true if comments should be ommited.
521      * The default is false.
522      */
523     public boolean getOmitComments()
524     {
525         return omitComments;
526     }
527 
528 
529     /**
530      * Sets comment omitting on and off.
531      *
532      * @param omit True if comments should be ommited
533      */
534     public void setOmitComments( boolean omit )
535     {
536         omitComments = omit;
537     }
538 
539 
540     /**
541      * Returns true if the DOCTYPE declaration should
542      * be ommited. The default is false.
543      */
544     public boolean getOmitDocumentType()
545     {
546         return omitDoctype;
547     }
548 
549 
550     /**
551      * Sets DOCTYPE declaration omitting on and off.
552      *
553      * @param omit True if DOCTYPE declaration should be ommited
554      */
555     public void setOmitDocumentType( boolean omit )
556     {
557         omitDoctype = omit;
558     }
559 
560 
561     /**
562      * Returns true if the XML document declaration should
563      * be ommited. The default is false.
564      */
565     public boolean getOmitXMLDeclaration()
566     {
567         return omitXmlDeclaration;
568     }
569 
570 
571     /**
572      * Sets XML declaration omitting on and off.
573      *
574      * @param omit True if XML declaration should be ommited
575      */
576     public void setOmitXMLDeclaration( boolean omit )
577     {
578         omitXmlDeclaration = omit;
579     }
580 
581 
582     /**
583      * Returns true if the document type is standalone.
584      * The default is false.
585      */
586     public boolean getStandalone()
587     {
588         return standalone;
589     }
590 
591 
592     /**
593      * Sets document DTD standalone. The public and system
594      * identifiers must be null for the document to be
595      * serialized as standalone.
596      *
597      * @param standalone True if document DTD is standalone
598      */
599     public void setStandalone( boolean standalone )
600     {
601         this.standalone = standalone;
602     }
603 
604 
605     /**
606      * Returns a list of all the elements whose text node children
607      * should be output as CDATA, or null if no such elements were
608      * specified.
609      */
610     public String[] getCDataElements()
611     {
612         return cdataElements;
613     }
614 
615 
616     /**
617      * Returns true if the text node children of the given elements
618      * should be output as CDATA.
619      *
620      * @param tagName The element's tag name
621      * @return True if should serialize as CDATA
622      */
623     public boolean isCDataElement( String tagName )
624     {
625         int i;
626 
627         if ( cdataElements == null )
628             return false;
629         for ( i = 0 ; i < cdataElements.length ; ++i )
630             if ( cdataElements[ i ].equals( tagName ) )
631                 return true;
632         return false;
633     }
634 
635 
636     /**
637      * Sets the list of elements for which text node children
638      * should be output as CDATA.
639      *
640      * @param cdataElements List of CDATA element tag names
641      */
642     public void setCDataElements( String[] cdataElements )
643     {
644         this.cdataElements = cdataElements;
645     }
646 
647 
648     /**
649      * Returns a list of all the elements whose text node children
650      * should be output unescaped (no character references), or null
651      * if no such elements were specified.
652      */
653     public String[] getNonEscapingElements()
654     {
655         return nonEscapingElements;
656     }
657 
658 
659     /**
660      * Returns true if the text node children of the given elements
661      * should be output unescaped.
662      *
663      * @param tagName The element's tag name
664      * @return True if should serialize unescaped
665      */
666     public boolean isNonEscapingElement( String tagName )
667     {
668         int i;
669 
670         if ( nonEscapingElements == null )
671             return false;
672         for ( i = 0 ; i < nonEscapingElements.length ; ++i )
673             if ( nonEscapingElements[ i ].equals( tagName ) )
674                 return true;
675         return false;
676     }
677 
678 
679     /**
680      * Sets the list of elements for which text node children
681      * should be output unescaped (no character references).
682      *
683      * @param nonEscapingElements List of unescaped element tag names
684      */
685     public void setNonEscapingElements( String[] nonEscapingElements )
686     {
687         this.nonEscapingElements = nonEscapingElements;
688     }
689 
690 
691 
692     /**
693      * Returns a specific line separator to use. The default is the
694      * Web line separator (<tt>\n</tt>). A string is returned to
695      * support double codes (CR + LF).
696      *
697      * @return The specified line separator
698      */
699     public String getLineSeparator()
700     {
701         return lineSeparator;
702     }
703 
704 
705     /**
706      * Sets the line separator. The default is the Web line separator
707      * (<tt>\n</tt>). The machine's line separator can be obtained
708      * from the system property <tt>line.separator</tt>, but is only
709      * useful if the document is edited on machines of the same type.
710      * For general documents, use the Web line separator.
711      *
712      * @param lineSeparator The specified line separator
713      */
714     public void setLineSeparator( String lineSeparator )
715     {
716         if ( lineSeparator == null )
717             this.lineSeparator =  "\n";
718         else
719             this.lineSeparator = lineSeparator;
720     }
721 
722 
723     /**
724      * Returns true if the default behavior for this format is to
725      * preserve spaces. All elements that do not specify otherwise
726      * or specify the default behavior will be formatted based on
727      * this rule. All elements that specify space preserving will
728      * always preserve space.
729      */
730     public boolean getPreserveSpace()
731     {
732         return preserve;
733     }
734 
735 
736     /**
737      * Sets space preserving as the default behavior. The default is
738      * space stripping and all elements that do not specify otherwise
739      * or use the default value will not preserve spaces.
740      *
741      * @param preserve True if spaces should be preserved
742      */
743     public void setPreserveSpace( boolean preserve )
744     {
745         this.preserve = preserve;
746     }
747 
748 
749     /**
750      * Return the selected line width for breaking up long lines.
751      * When indenting, and only when indenting, long lines will be
752      * broken at space boundaries based on this line width.
753      * No line wrapping occurs if this value is zero.
754      */
755     public int getLineWidth()
756     {
757         return _lineWidth;
758     }
759 
760 
761     /**
762      * Sets the line width. If zero then no line wrapping will
763      * occur. Calling {@link #setIndenting} will reset this
764      * value to zero (off) or the default (on).
765      *
766      * @param lineWidth The line width to use, zero for default
767      * @see #getLineWidth
768      * @see #setIndenting
769      */
770     public void setLineWidth( int lineWidth )
771     {
772         if ( lineWidth <= 0 )
773             _lineWidth = 0;
774         else
775             _lineWidth = lineWidth;
776     }
777 
778     /**
779      * Returns the preserveEmptyAttribute flag. If flag is false, then'
780      * attributes with empty string values are output as the attribute 
781      * name only (in HTML mode).
782      * @return preserve the preserve flag
783      */
784     public boolean getPreserveEmptyAttributes () {
785         return preserveEmptyAttributes;
786     }
787     /**
788      * Sets the preserveEmptyAttribute flag. If flag is false, then'
789      * attributes with empty string values are output as the attribute 
790      * name only (in HTML mode).
791      * @param preserve the preserve flag
792      */
793     public void setPreserveEmptyAttributes (boolean preserve) {
794         preserveEmptyAttributes = preserve;
795     }
796 
797     /**
798      * Returns the last printable character based on the selected
799      * encoding. Control characters and non-printable characters
800      * are always printed as character references.
801      */
802     public char getLastPrintable()
803     {
804         if ( getEncoding() != null &&
805              ( getEncoding().equalsIgnoreCase( "ASCII" ) ) )
806             return 0xFF;
807         else
808             return 0xFFFF;
809     }
810 
811 
812     /**
813      * Determine the output method for the specified document.
814      * If the document is an instance of {@link org.w3c.dom.html.HTMLDocument}
815      * then the method is said to be <tt>html</tt>. If the root
816      * element is 'html' and all text nodes preceding the root
817      * element are all whitespace, then the method is said to be
818      * <tt>html</tt>. Otherwise the method is <tt>xml</tt>.
819      *
820      * @param doc The document to check
821      * @return The suitable method
822      */
823     public static String whichMethod( Document doc )
824     {
825         Node    node;
826         String  value;
827         int     i;
828 
829         // If document is derived from HTMLDocument then the default
830         // method is html.
831         if ( doc instanceof HTMLDocument )
832             return Method.HTML;
833 
834         // Lookup the root element and the text nodes preceding it.
835         // If root element is html and all text nodes contain whitespace
836         // only, the method is html.
837 
838         // FIXME (SM) should we care about namespaces here?
839 
840         node = doc.getFirstChild();
841         while (node != null) {
842             // If the root element is html, the method is html.
843             if ( node.getNodeType() == Node.ELEMENT_NODE ) {
844                 if ( node.getNodeName().equalsIgnoreCase( "html" ) ) {
845                     return Method.HTML;
846                 } else if ( node.getNodeName().equalsIgnoreCase( "root" ) ) {
847                     return Method.FOP;
848                 } else {
849                     return Method.XML;
850                 }
851             } else if ( node.getNodeType() == Node.TEXT_NODE ) {
852                 // If a text node preceding the root element contains
853                 // only whitespace, this might be html, otherwise it's
854                 // definitely xml.
855                 value = node.getNodeValue();
856                 for ( i = 0 ; i < value.length() ; ++i )
857                     if ( value.charAt( i ) != 0x20 && value.charAt( i ) != 0x0A &&
858                          value.charAt( i ) != 0x09 && value.charAt( i ) != 0x0D )
859                         return Method.XML;
860             }
861             node = node.getNextSibling();
862         }
863         // Anything else, the method is xml.
864         return Method.XML;
865     }
866 
867 
868     /**
869      * Returns the document type public identifier
870      * specified for this document, or null.
871      */
872     public static String whichDoctypePublic( Document doc )
873     {
874         DocumentType doctype;
875 
876            /*  DOM Level 2 was introduced into the code base*/
877            doctype = doc.getDoctype();
878            if ( doctype != null ) {
879            // Note on catch: DOM Level 1 does not specify this method
880            // and the code will throw a NoSuchMethodError
881            try {
882            return doctype.getPublicId();
883            } catch ( Error except ) {  }
884            }
885         
886         if ( doc instanceof HTMLDocument )
887             return DTD.XHTMLPublicId;
888         return null;
889     }
890 
891 
892     /**
893      * Returns the document type system identifier
894      * specified for this document, or null.
895      */
896     public static String whichDoctypeSystem( Document doc )
897     {
898         DocumentType doctype;
899 
900         /* DOM Level 2 was introduced into the code base*/
901            doctype = doc.getDoctype();
902            if ( doctype != null ) {
903            // Note on catch: DOM Level 1 does not specify this method
904            // and the code will throw a NoSuchMethodError
905            try {
906            return doctype.getSystemId();
907            } catch ( Error except ) { }
908            }
909         
910         if ( doc instanceof HTMLDocument )
911             return DTD.XHTMLSystemId;
912         return null;
913     }
914 
915 
916     /**
917      * Returns the suitable media format for a document
918      * output with the specified method.
919      */
920     public static String whichMediaType( String method )
921     {
922         if ( method.equalsIgnoreCase( Method.XML ) )
923             return "text/xml";
924         if ( method.equalsIgnoreCase( Method.HTML ) )
925             return "text/html";
926         if ( method.equalsIgnoreCase( Method.XHTML ) )
927             return "text/html";
928         if ( method.equalsIgnoreCase( Method.TEXT ) )
929             return "text/plain";
930         if ( method.equalsIgnoreCase( Method.FOP ) )
931             return "application/pdf";
932         return null;
933     }
934 
935 
936 }
937