com.wooki.services.parsers.DocumentToXHTML.java Source code

Java tutorial

Introduction

Here is the source code for com.wooki.services.parsers.DocumentToXHTML.java

Source

//
// Copyright 2009 Robin Komiwes, Bruno Verachten, Christophe Cordenier
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

/*
 * $Id: DocumentToXHTML.java,v 1.6 2002/08/09 14:28:44 bveracht Exp $
 * $Log: DocumentToXHTML.java,v $
 * Revision 1.6  2002/08/09 14:28:44  bveracht
 * Added a XML header for the clean.xhtml file.
 * This way, the &nbsp; bug disappears.
 *
 * Revision 1.5  2002/04/11 15:07:41  bveracht
 * Clean up
 *
 */
package com.wooki.services.parsers;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import org.apache.log4j.Logger;
import org.springframework.core.io.Resource;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;

/**
 * Cleans an HTML document with JTidy and returns the result as XHTML.
 * <p>
 * The output is produced in memory and is prefixed with an XML declaration
 * announcing ISO-8859-1. JTidy diagnostics are written to the file
 * {@code tidyErrors.txt}, resolved against the current working directory.
 */
public class DocumentToXHTML implements Convertor {

    /** Name of the file that receives JTidy's diagnostics. */
    private static final String TIDY_ERROR_FILE = "tidyErrors.txt";

    /** Charset name announced in {@link #XML_HEADER} and used to encode it. */
    private static final String HEADER_ENCODING = "ISO-8859-1";

    /** XML declaration written in front of the tidied document. */
    private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";

    private static final Logger logger = Logger.getLogger(DocumentToXHTML.class);

    /**
     * Forwarded to {@link Tidy#setXmlOut(boolean)}.
     * NOTE(review): no setter is visible in this file, so unless the field is
     * populated by other means it keeps its default value {@code false}.
     */
    private boolean xmlOut;

    /**
     * Runs JTidy over the given document and returns the cleaned markup.
     *
     * @param xmlDocument the resource whose input stream holds the markup to clean
     * @return a stream over the XML declaration followed by JTidy's output, or
     *         {@code null} if an {@link IOException} occurred (the failure is logged)
     */
    public InputStream performTransformation(Resource xmlDocument) {
        Tidy tidy = createTidy();
        PrintWriter errout = null;
        InputStream in = null;

        try {
            // Auto-flushing writer so diagnostics reach the file as they are produced.
            errout = new PrintWriter(new FileWriter(TIDY_ERROR_FILE), true);
            tidy.setErrout(errout);

            in = new BufferedInputStream(xmlDocument.getInputStream());
            ByteArrayOutputStream result = new ByteArrayOutputStream();
            BufferedOutputStream out = new BufferedOutputStream(result);

            // Encode the declaration with the charset it announces rather than
            // with the platform default.
            byte[] xmlHeader = XML_HEADER.getBytes(HEADER_ENCODING);
            out.write(xmlHeader, 0, xmlHeader.length);

            tidy.parse(in, out);
            out.flush();
            return new ByteArrayInputStream(result.toByteArray());
        } catch (IOException ioe) {
            // Pass the exception itself so the stack trace ends up in the log.
            logger.error(ioe.getLocalizedMessage(), ioe);
            return null;
        } finally {
            // Release the source stream and the diagnostics file on every path.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException closeFailure) {
                    logger.warn("Could not close the source document stream", closeFailure);
                }
            }
            if (errout != null) {
                errout.close();
            }
        }
    }

    /** Builds a Tidy instance configured with the custom tags and output options used by this converter. */
    private Tidy createTidy() {
        Tidy tidy = new Tidy();
        java.util.Properties props = new java.util.Properties();

        // Declare the non-HTML tags so that JTidy accepts them.
        props.setProperty("new-inline-tags",
                "page-break,page-number,page-numbers,wooki,xsl:value-of,xsl:for-each,quote");
        props.setProperty("new-blocklevel-tags", "for,page-header,page-footer,xsl:value-of,xsl:for-each");
        props.setProperty("new-empty-tags", "page-break,page-number,page-numbers,xsl:value-of");
        props.setProperty("new-pre-tags", "wooki");
        tidy.setConfigurationFromProps(props);

        tidy.setXmlOut(xmlOut);
        tidy.setXHTML(true);
        tidy.setEmacs(true);
        tidy.setErrfile(TIDY_ERROR_FILE);
        tidy.setFixBackslash(true);
        tidy.setNumEntities(true);
        tidy.setQuoteNbsp(false);
        tidy.setCharEncoding(Configuration.LATIN1);
        tidy.setFixComments(true);
        tidy.setQuoteAmpersand(false);
        tidy.setEncloseText(true);
        tidy.setEncloseBlockText(true);
        return tidy;
    }

}