001    /**
002     *  Licensed to the Apache Software Foundation (ASF) under one or more
003     *  contributor license agreements.  See the NOTICE file distributed with
004     *  this work for additional information regarding copyright ownership.
005     *  The ASF licenses this file to You under the Apache License, Version 2.0
006     *  (the "License"); you may not use this file except in compliance with
007     *  the License.  You may obtain a copy of the License at
008     *
009     *     http://www.apache.org/licenses/LICENSE-2.0
010     *
011     *  Unless required by applicable law or agreed to in writing, software
012     *  distributed under the License is distributed on an "AS IS" BASIS,
013     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     *  See the License for the specific language governing permissions and
015     *  limitations under the License.
016     */
017    
018    /*
019     * This code has been borrowed from the Apache Xerces project. We're copying the code to
020     * keep from adding a dependency on Xerces in the Geronimo kernel.
021     */
022    
023    package org.apache.geronimo.system.configuration;
024    
025    import org.w3c.dom.Document;
026    import org.w3c.dom.DocumentType;
027    import org.w3c.dom.Node;
028    import org.w3c.dom.html.HTMLDocument;
029    
030    
031    /**
032     * Specifies an output format to control the serializer. Based on the
033     * XSLT specification for output format, plus additional parameters.
034     * Used to select the suitable serializer and determine how the
035     * document should be formatted on output.
036     * <p>
037     * The two interesting constructors are:
038     * <ul>
039     * <li>{@link #OutputFormat(String,String,boolean)} creates a format
040     *  for the specified method (XML, HTML, Text, etc), encoding and indentation
041     * <li>{@link #OutputFormat(Document,String,boolean)} creates a format
042     *  compatible with the document type (XML, HTML, Text, etc), encoding and
043     *  indentation
044     * </ul>
045     *
046     *
047     * @version $Revision: 476049 $ $Date: 2006-11-16 23:35:17 -0500 (Thu, 16 Nov 2006) $
048     * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
049     *         <a href="mailto:visco@intalio.com">Keith Visco</a>
050     * @see Serializer
051     * @see Method
052     */
053    public class OutputFormat
054    {
055    
056    
057        public static class DTD
058        {
059    
060            /**
061             * Public identifier for HTML document type.
062             */
063            public static final String HTMLPublicId = "-//W3C//DTD HTML 4.0//EN";
064    
065            /**
066             * System identifier for HTML document type.
067             */
068            public static final String HTMLSystemId =
069                "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd";
070    
071            /**
072             * Public identifier for XHTML document type.
073             */
074            public static final String XHTMLPublicId =
075                "-//W3C//DTD XHTML 1.0 Strict//EN";
076    
077            /**
078             * System identifier for XHTML document type.
079             */
080            public static final String XHTMLSystemId =
081                "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd";
082    
083        }
084    
085    
086        public static class Defaults
087        {
088    
089            /**
090             * If indentation is turned on, the default identation
091             * level is 4.
092             *
093             * @see #setIndenting(boolean)
094             */
095            public static final int Indent = 4;
096    
097            /**
098             * The default encoding for Web documents it UTF-8.
099             *
100             * @see #getEncoding()
101             */
102            public static final String Encoding = "UTF-8";
103    
104            /**
105             * The default line width at which to break long lines
106             * when identing. This is set to 72.
107             */
108            public static final int LineWidth = 72;
109    
110        }
111    
112    
113        /**
114         * Holds the output method specified for this document,
115         * or null if no method was specified.
116         */
117        private String method;
118    
119    
120        /**
121         * Specifies the version of the output method.
122         */
123        private String version;
124    
125    
126        /**
127         * The indentation level, or zero if no indentation
128         * was requested.
129         */
130        private int indent = 0;
131    
132    
133        /**
134         * The encoding to use, if an input stream is used.
135         * The default is always UTF-8.
136         */
137        private String encoding = Defaults.Encoding;
138    
139        /**
140         * The EncodingInfo instance for _encoding.
141         */
142        private EncodingInfo encodingInfo = null;
143    
144        /**
145         * The specified media type or null.
146         */
147        private String mediaType;
148    
149    
150        /**
151         * The specified document type system identifier, or null.
152         */
153        private String doctypeSystem;
154    
155    
156        /**
157         * The specified document type public identifier, or null.
158         */
159        private String doctypePublic;
160    
161    
162        /**
163         * Ture if the XML declaration should be ommited;
164         */
165        private boolean omitXmlDeclaration = false;
166    
167    
168        /**
169         * Ture if the DOCTYPE declaration should be ommited;
170         */
171        private boolean omitDoctype = false;
172    
173    
174        /**
175         * Ture if comments should be ommited;
176         */
177        private boolean omitComments = false;
178    
179    
180        /**
181         * True if the document type should be marked as standalone.
182         */
183        private boolean standalone = false;
184    
185    
186        /**
187         * List of element tag names whose text node children must
188         * be output as CDATA.
189         */
190        private String[] cdataElements;
191    
192    
193        /**
194         * List of element tag names whose text node children must
195         * be output unescaped.
196         */
197        private String[] nonEscapingElements;
198    
199    
200        /**
201         * The selected line separator.
202         */
203        private String lineSeparator = "\n";
204    
205    
206        /**
207         * The line width at which to wrap long lines when indenting.
208         */
209        private int _lineWidth = Defaults.LineWidth;
210    
211    
212        /**
213         * True if spaces should be preserved in elements that do not
214         * specify otherwise, or specify the default behavior.
215         */
216        private boolean preserve = false;
217        
218        /** If true, an empty string valued attribute is output as "". If false and
219         * and we are using the HTMLSerializer, then only the attribute name is 
220         * serialized. Defaults to false for backwards compatibility.
221         */
222        private boolean preserveEmptyAttributes = false;
223    
224        /**
225         * Constructs a new output format with the default values.
226         */
227        public OutputFormat()
228        {
229        }
230    
231    
232        /**
233         * Constructs a new output format with the default values for
234         * the specified method and encoding. If <tt>indent</tt>
235         * is true, the document will be pretty printed with the default
236         * indentation level and default line wrapping.
237         *
238         * @param method The specified output method
239         * @param encoding The specified encoding
240         * @param indenting True for pretty printing
241         * @see #setEncoding
242         * @see #setIndenting
243         * @see #setMethod
244         */
245        public OutputFormat( String method, String encoding, boolean indenting )
246        {
247            setMethod( method );
248            setEncoding( encoding );
249            setIndenting( indenting );
250        }
251    
252    
253        /**
254         * Constructs a new output format with the proper method,
255         * document type identifiers and media type for the specified
256         * document.
257         *
258         * @param doc The document to output
259         * @see #whichMethod
260         */
261        public OutputFormat( Document doc )
262        {
263            setMethod( whichMethod( doc ) );
264            setDoctype( whichDoctypePublic( doc ), whichDoctypeSystem( doc ) );
265            setMediaType( whichMediaType( getMethod() ) );
266        }
267    
268    
269        /**
270         * Constructs a new output format with the proper method,
271         * document type identifiers and media type for the specified
272         * document, and with the specified encoding. If <tt>indent</tt>
273         * is true, the document will be pretty printed with the default
274         * indentation level and default line wrapping.
275         *
276         * @param doc The document to output
277         * @param encoding The specified encoding
278         * @param indenting True for pretty printing
279         * @see #setEncoding
280         * @see #setIndenting
281         * @see #whichMethod
282         */
283        public OutputFormat( Document doc, String encoding, boolean indenting )
284        {
285            this( doc );
286            setEncoding( encoding );
287            setIndenting( indenting );
288        }
289    
290    
291        /**
292         * Returns the method specified for this output format.
293         * Typically the method will be <tt>xml</tt>, <tt>html</tt>
294         * or <tt>text</tt>, but it might be other values.
295         * If no method was specified, null will be returned
296         * and the most suitable method will be determined for
297         * the document by calling {@link #whichMethod}.
298         *
299         * @return The specified output method, or null
300         */
301        public String getMethod()
302        {
303            return method;
304        }
305    
306    
307        /**
308         * Sets the method for this output format.
309         *
310         * @see #getMethod
311         * @param method The output method, or null
312         */
313        public void setMethod( String method )
314        {
315            this.method = method;
316        }
317    
318    
319        /**
320         * Returns the version for this output method.
321         * If no version was specified, will return null
322         * and the default version number will be used.
323         * If the serializerr does not support that particular
324         * version, it should default to a supported version.
325         *
326         * @return The specified method version, or null
327         */
328        public String getVersion()
329        {
330            return version;
331        }
332    
333    
334        /**
335         * Sets the version for this output method.
336         * For XML the value would be "1.0", for HTML
337         * it would be "4.0".
338         *
339         * @see #getVersion
340         * @param version The output method version, or null
341         */
342        public void setVersion( String version )
343        {
344            this.version = version;
345        }
346    
347    
348        /**
349         * Returns the indentation specified. If no indentation
350         * was specified, zero is returned and the document
351         * should not be indented.
352         *
353         * @return The indentation or zero
354         * @see #setIndenting
355         */
356        public int getIndent()
357        {
358            return indent;
359        }
360    
361    
362        /**
363         * Returns true if indentation was specified.
364         */
365        public boolean getIndenting()
366        {
367            return ( indent > 0 );
368        }
369    
370    
371        /**
372         * Sets the indentation. The document will not be
373         * indented if the indentation is set to zero.
374         * Calling {@link #setIndenting} will reset this
375         * value to zero (off) or the default (on).
376         *
377         * @param indent The indentation, or zero
378         */
379        public void setIndent( int indent )
380        {
381            if ( indent < 0 )
382                this.indent = 0;
383            else
384                this.indent = indent;
385        }
386    
387    
388        /**
389         * Sets the indentation on and off. When set on, the default
390         * indentation level and default line wrapping is used
391         * (see {@link #DEFAULT_INDENT} and {@link #DEFAULT_LINE_WIDTH}).
392         * To specify a different indentation level or line wrapping,
393         * use {@link #setIndent} and {@link #setLineWidth}.
394         *
395         * @param on True if indentation should be on
396         */
397        public void setIndenting( boolean on )
398        {
399            if ( on ) {
400                indent = Defaults.Indent;
401                _lineWidth = Defaults.LineWidth;
402            } else {
403                indent = 0;
404                _lineWidth = 0;
405            }
406        }
407    
408    
409        /**
410         * Returns the specified encoding. If no encoding was
411         * specified, the default is always "UTF-8".
412         *
413         * @return The encoding
414         */
415        public String getEncoding()
416        {
417            return encoding;
418        }
419    
420    
421        /**
422         * Sets the encoding for this output method. If no
423         * encoding was specified, the default is always "UTF-8".
424         * Make sure the encoding is compatible with the one
425         * used by the {@link java.io.Writer}.
426         *
427         * @see #getEncoding
428         * @param encoding The encoding, or null
429         */
430        public void setEncoding( String encoding )
431        {
432            this.encoding = encoding;
433            encodingInfo = null;
434        }
435    
436        /**
437         * Sets the encoding for this output method with an <code>EncodingInfo</code>
438         * instance.
439         */
440        public void setEncoding(EncodingInfo encInfo) {
441            encoding = encInfo.getName();
442            encodingInfo = encInfo;
443        }
444    
445        /**
446         * Returns an <code>EncodingInfo<code> instance for the encoding.
447         *
448         * @see setEncoding
449         */
450        public EncodingInfo getEncodingInfo() {
451            if (encodingInfo == null)
452                encodingInfo = Encodings.getEncodingInfo(encoding);
453            return encodingInfo;
454        }
455    
456        /**
457         * Returns the specified media type, or null.
458         * To determine the media type based on the
459         * document type, use {@link #whichMediaType}.
460         *
461         * @return The specified media type, or null
462         */
463        public String getMediaType()
464        {
465            return mediaType;
466        }
467    
468    
469        /**
470         * Sets the media type.
471         *
472         * @see #getMediaType
473         * @param mediaType The specified media type
474         */
475        public void setMediaType( String mediaType )
476        {
477            this.mediaType = mediaType;
478        }
479    
480    
481        /**
482         * Sets the document type public and system identifiers.
483         * Required only if the DOM Document or SAX events do not
484         * specify the document type, and one must be present in
485         * the serialized document. Any document type specified
486         * by the DOM Document or SAX events will override these
487         * values.
488         *
489         * @param publicId The public identifier, or null
490         * @param systemId The system identifier, or null
491         */
492        public void setDoctype( String publicId, String systemId )
493        {
494            doctypePublic = publicId;
495            doctypeSystem = systemId;
496        }
497    
498    
499        /**
500         * Returns the specified document type public identifier,
501         * or null.
502         */
503        public String getDoctypePublic()
504        {
505            return doctypePublic;
506        }
507    
508    
509        /**
510         * Returns the specified document type system identifier,
511         * or null.
512         */
513        public String getDoctypeSystem()
514        {
515            return doctypeSystem;
516        }
517    
518    
519        /**
520         * Returns true if comments should be ommited.
521         * The default is false.
522         */
523        public boolean getOmitComments()
524        {
525            return omitComments;
526        }
527    
528    
529        /**
530         * Sets comment omitting on and off.
531         *
532         * @param omit True if comments should be ommited
533         */
534        public void setOmitComments( boolean omit )
535        {
536            omitComments = omit;
537        }
538    
539    
540        /**
541         * Returns true if the DOCTYPE declaration should
542         * be ommited. The default is false.
543         */
544        public boolean getOmitDocumentType()
545        {
546            return omitDoctype;
547        }
548    
549    
550        /**
551         * Sets DOCTYPE declaration omitting on and off.
552         *
553         * @param omit True if DOCTYPE declaration should be ommited
554         */
555        public void setOmitDocumentType( boolean omit )
556        {
557            omitDoctype = omit;
558        }
559    
560    
561        /**
562         * Returns true if the XML document declaration should
563         * be ommited. The default is false.
564         */
565        public boolean getOmitXMLDeclaration()
566        {
567            return omitXmlDeclaration;
568        }
569    
570    
571        /**
572         * Sets XML declaration omitting on and off.
573         *
574         * @param omit True if XML declaration should be ommited
575         */
576        public void setOmitXMLDeclaration( boolean omit )
577        {
578            omitXmlDeclaration = omit;
579        }
580    
581    
582        /**
583         * Returns true if the document type is standalone.
584         * The default is false.
585         */
586        public boolean getStandalone()
587        {
588            return standalone;
589        }
590    
591    
592        /**
593         * Sets document DTD standalone. The public and system
594         * identifiers must be null for the document to be
595         * serialized as standalone.
596         *
597         * @param standalone True if document DTD is standalone
598         */
599        public void setStandalone( boolean standalone )
600        {
601            this.standalone = standalone;
602        }
603    
604    
605        /**
606         * Returns a list of all the elements whose text node children
607         * should be output as CDATA, or null if no such elements were
608         * specified.
609         */
610        public String[] getCDataElements()
611        {
612            return cdataElements;
613        }
614    
615    
616        /**
617         * Returns true if the text node children of the given elements
618         * should be output as CDATA.
619         *
620         * @param tagName The element's tag name
621         * @return True if should serialize as CDATA
622         */
623        public boolean isCDataElement( String tagName )
624        {
625            int i;
626    
627            if ( cdataElements == null )
628                return false;
629            for ( i = 0 ; i < cdataElements.length ; ++i )
630                if ( cdataElements[ i ].equals( tagName ) )
631                    return true;
632            return false;
633        }
634    
635    
636        /**
637         * Sets the list of elements for which text node children
638         * should be output as CDATA.
639         *
640         * @param cdataElements List of CDATA element tag names
641         */
642        public void setCDataElements( String[] cdataElements )
643        {
644            this.cdataElements = cdataElements;
645        }
646    
647    
648        /**
649         * Returns a list of all the elements whose text node children
650         * should be output unescaped (no character references), or null
651         * if no such elements were specified.
652         */
653        public String[] getNonEscapingElements()
654        {
655            return nonEscapingElements;
656        }
657    
658    
659        /**
660         * Returns true if the text node children of the given elements
661         * should be output unescaped.
662         *
663         * @param tagName The element's tag name
664         * @return True if should serialize unescaped
665         */
666        public boolean isNonEscapingElement( String tagName )
667        {
668            int i;
669    
670            if ( nonEscapingElements == null )
671                return false;
672            for ( i = 0 ; i < nonEscapingElements.length ; ++i )
673                if ( nonEscapingElements[ i ].equals( tagName ) )
674                    return true;
675            return false;
676        }
677    
678    
679        /**
680         * Sets the list of elements for which text node children
681         * should be output unescaped (no character references).
682         *
683         * @param nonEscapingElements List of unescaped element tag names
684         */
685        public void setNonEscapingElements( String[] nonEscapingElements )
686        {
687            this.nonEscapingElements = nonEscapingElements;
688        }
689    
690    
691    
692        /**
693         * Returns a specific line separator to use. The default is the
694         * Web line separator (<tt>\n</tt>). A string is returned to
695         * support double codes (CR + LF).
696         *
697         * @return The specified line separator
698         */
699        public String getLineSeparator()
700        {
701            return lineSeparator;
702        }
703    
704    
705        /**
706         * Sets the line separator. The default is the Web line separator
707         * (<tt>\n</tt>). The machine's line separator can be obtained
708         * from the system property <tt>line.separator</tt>, but is only
709         * useful if the document is edited on machines of the same type.
710         * For general documents, use the Web line separator.
711         *
712         * @param lineSeparator The specified line separator
713         */
714        public void setLineSeparator( String lineSeparator )
715        {
716            if ( lineSeparator == null )
717                this.lineSeparator =  "\n";
718            else
719                this.lineSeparator = lineSeparator;
720        }
721    
722    
723        /**
724         * Returns true if the default behavior for this format is to
725         * preserve spaces. All elements that do not specify otherwise
726         * or specify the default behavior will be formatted based on
727         * this rule. All elements that specify space preserving will
728         * always preserve space.
729         */
730        public boolean getPreserveSpace()
731        {
732            return preserve;
733        }
734    
735    
736        /**
737         * Sets space preserving as the default behavior. The default is
738         * space stripping and all elements that do not specify otherwise
739         * or use the default value will not preserve spaces.
740         *
741         * @param preserve True if spaces should be preserved
742         */
743        public void setPreserveSpace( boolean preserve )
744        {
745            this.preserve = preserve;
746        }
747    
748    
749        /**
750         * Return the selected line width for breaking up long lines.
751         * When indenting, and only when indenting, long lines will be
752         * broken at space boundaries based on this line width.
753         * No line wrapping occurs if this value is zero.
754         */
755        public int getLineWidth()
756        {
757            return _lineWidth;
758        }
759    
760    
761        /**
762         * Sets the line width. If zero then no line wrapping will
763         * occur. Calling {@link #setIndenting} will reset this
764         * value to zero (off) or the default (on).
765         *
766         * @param lineWidth The line width to use, zero for default
767         * @see #getLineWidth
768         * @see #setIndenting
769         */
770        public void setLineWidth( int lineWidth )
771        {
772            if ( lineWidth <= 0 )
773                _lineWidth = 0;
774            else
775                _lineWidth = lineWidth;
776        }
777    
778        /**
779         * Returns the preserveEmptyAttribute flag. If flag is false, then'
780         * attributes with empty string values are output as the attribute 
781         * name only (in HTML mode).
782         * @return preserve the preserve flag
783         */
784        public boolean getPreserveEmptyAttributes () {
785            return preserveEmptyAttributes;
786        }
787        /**
788         * Sets the preserveEmptyAttribute flag. If flag is false, then'
789         * attributes with empty string values are output as the attribute 
790         * name only (in HTML mode).
791         * @param preserve the preserve flag
792         */
793        public void setPreserveEmptyAttributes (boolean preserve) {
794            preserveEmptyAttributes = preserve;
795        }
796    
797        /**
798         * Returns the last printable character based on the selected
799         * encoding. Control characters and non-printable characters
800         * are always printed as character references.
801         */
802        public char getLastPrintable()
803        {
804            if ( getEncoding() != null &&
805                 ( getEncoding().equalsIgnoreCase( "ASCII" ) ) )
806                return 0xFF;
807            else
808                return 0xFFFF;
809        }
810    
811    
812        /**
813         * Determine the output method for the specified document.
814         * If the document is an instance of {@link org.w3c.dom.html.HTMLDocument}
815         * then the method is said to be <tt>html</tt>. If the root
816         * element is 'html' and all text nodes preceding the root
817         * element are all whitespace, then the method is said to be
818         * <tt>html</tt>. Otherwise the method is <tt>xml</tt>.
819         *
820         * @param doc The document to check
821         * @return The suitable method
822         */
823        public static String whichMethod( Document doc )
824        {
825            Node    node;
826            String  value;
827            int     i;
828    
829            // If document is derived from HTMLDocument then the default
830            // method is html.
831            if ( doc instanceof HTMLDocument )
832                return Method.HTML;
833    
834            // Lookup the root element and the text nodes preceding it.
835            // If root element is html and all text nodes contain whitespace
836            // only, the method is html.
837    
838            // FIXME (SM) should we care about namespaces here?
839    
840            node = doc.getFirstChild();
841            while (node != null) {
842                // If the root element is html, the method is html.
843                if ( node.getNodeType() == Node.ELEMENT_NODE ) {
844                    if ( node.getNodeName().equalsIgnoreCase( "html" ) ) {
845                        return Method.HTML;
846                    } else if ( node.getNodeName().equalsIgnoreCase( "root" ) ) {
847                        return Method.FOP;
848                    } else {
849                        return Method.XML;
850                    }
851                } else if ( node.getNodeType() == Node.TEXT_NODE ) {
852                    // If a text node preceding the root element contains
853                    // only whitespace, this might be html, otherwise it's
854                    // definitely xml.
855                    value = node.getNodeValue();
856                    for ( i = 0 ; i < value.length() ; ++i )
857                        if ( value.charAt( i ) != 0x20 && value.charAt( i ) != 0x0A &&
858                             value.charAt( i ) != 0x09 && value.charAt( i ) != 0x0D )
859                            return Method.XML;
860                }
861                node = node.getNextSibling();
862            }
863            // Anything else, the method is xml.
864            return Method.XML;
865        }
866    
867    
868        /**
869         * Returns the document type public identifier
870         * specified for this document, or null.
871         */
872        public static String whichDoctypePublic( Document doc )
873        {
874            DocumentType doctype;
875    
876               /*  DOM Level 2 was introduced into the code base*/
877               doctype = doc.getDoctype();
878               if ( doctype != null ) {
879               // Note on catch: DOM Level 1 does not specify this method
880               // and the code will throw a NoSuchMethodError
881               try {
882               return doctype.getPublicId();
883               } catch ( Error except ) {  }
884               }
885            
886            if ( doc instanceof HTMLDocument )
887                return DTD.XHTMLPublicId;
888            return null;
889        }
890    
891    
892        /**
893         * Returns the document type system identifier
894         * specified for this document, or null.
895         */
896        public static String whichDoctypeSystem( Document doc )
897        {
898            DocumentType doctype;
899    
900            /* DOM Level 2 was introduced into the code base*/
901               doctype = doc.getDoctype();
902               if ( doctype != null ) {
903               // Note on catch: DOM Level 1 does not specify this method
904               // and the code will throw a NoSuchMethodError
905               try {
906               return doctype.getSystemId();
907               } catch ( Error except ) { }
908               }
909            
910            if ( doc instanceof HTMLDocument )
911                return DTD.XHTMLSystemId;
912            return null;
913        }
914    
915    
916        /**
917         * Returns the suitable media format for a document
918         * output with the specified method.
919         */
920        public static String whichMediaType( String method )
921        {
922            if ( method.equalsIgnoreCase( Method.XML ) )
923                return "text/xml";
924            if ( method.equalsIgnoreCase( Method.HTML ) )
925                return "text/html";
926            if ( method.equalsIgnoreCase( Method.XHTML ) )
927                return "text/html";
928            if ( method.equalsIgnoreCase( Method.TEXT ) )
929                return "text/plain";
930            if ( method.equalsIgnoreCase( Method.FOP ) )
931                return "application/pdf";
932            return null;
933        }
934    
935    
936    }
937