org.trancecode.xproc.step.UnEscapeMarkupStepProcessor.java Source code

Java tutorial

Introduction

Here is the source code for org.trancecode.xproc.step.UnEscapeMarkupStepProcessor.java

Source

/*
 * Copyright (C) 2011 Emmanuel Tourdot
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 
 *
 * $Id$
 */
package org.trancecode.xproc.step;

import java.io.IOException;
import java.io.StringReader;
import java.util.EnumSet;
import javax.mail.internet.ContentType;
import javax.mail.internet.ParseException;
import javax.xml.transform.sax.SAXSource;
import net.sf.saxon.s9api.DocumentBuilder;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmNodeKind;
import org.apache.commons.lang.StringEscapeUtils;
import org.ccil.cowan.tagsoup.Parser;
import org.trancecode.io.MediaTypes;
import org.trancecode.logging.Logger;
import org.trancecode.xml.saxon.CopyingSaxonProcessorDelegate;
import org.trancecode.xml.saxon.Saxon;
import org.trancecode.xml.saxon.SaxonAxis;
import org.trancecode.xml.saxon.SaxonBuilder;
import org.trancecode.xml.saxon.SaxonLocation;
import org.trancecode.xml.saxon.SaxonProcessor;
import org.trancecode.xml.saxon.SaxonProcessorDelegate;
import org.trancecode.xproc.XProcExceptions;
import org.trancecode.xproc.port.XProcPorts;
import org.trancecode.xproc.variable.XProcOptions;
import org.xml.sax.InputSource;

/**
 * @author Emmanuel Tourdot
 */
@ExternalResources(read = false, write = false)
public final class UnEscapeMarkupStepProcessor extends AbstractStepProcessor {
    private static final Logger LOG = Logger.getLogger(UnEscapeMarkupStepProcessor.class);

    @Override
    public QName getStepType() {
        return XProcSteps.UNESCAPE_MARKUP;
    }

    @Override
    protected void execute(final StepInput input, final StepOutput output) {
        final XdmNode sourceDocument = input.readNode(XProcPorts.SOURCE);

        final String namespaceOption = input.getOptionValue(XProcOptions.NAMESPACE, null);
        // Resolve namespace
        Steps.getUri(namespaceOption);
        final String contentTypeOption = input.getOptionValue(XProcOptions.CONTENT_TYPE, MediaTypes.MEDIA_XML);
        final String encodingOption = input.getOptionValue(XProcOptions.ENCODING, null);
        if (encodingOption != null && !Steps.ENCODING_BASE64.equals(encodingOption)) {
            throw XProcExceptions.xc0052(SaxonLocation.of(sourceDocument));
        }
        final ContentType contentType = getContentType(contentTypeOption, input.getStep());
        final String charsetOption = input.getOptionValue(XProcOptions.CHARSET, null);
        final String charset = (charsetOption == null) ? contentType.getParameter("charset") : charsetOption;

        if (Steps.ENCODING_BASE64.equals(encodingOption)) {
            if (charset == null) {
                throw XProcExceptions.xc0010(input.getStep().getNode());
            } else {
                final SaxonBuilder builder = new SaxonBuilder(
                        input.getPipelineContext().getProcessor().getUnderlyingConfiguration());
                builder.startDocument();
                final Iterable<XdmNode> childNodes = SaxonAxis.childElements(sourceDocument);
                for (final XdmNode aNode : childNodes) {
                    if (XdmNodeKind.ELEMENT.equals(aNode.getNodeKind())) {
                        final String unEscapeContent;
                        try {
                            unEscapeContent = getUnEscapeContent(aNode.getStringValue(), encodingOption,
                                    contentType, charset);
                        } catch (IOException e) {
                            throw XProcExceptions.xc0010(aNode);
                        }
                        builder.startElement(aNode.getNodeName(), aNode);
                        for (final XdmNode attribute : SaxonAxis.attributes(aNode)) {
                            LOG.trace("copy existing attribute: {}", attribute);
                            builder.attribute(attribute.getNodeName(), attribute.getStringValue());
                        }
                        if (MediaTypes.MEDIA_TYPE_HTML.equals(contentType.getBaseType())) {
                            writeHtmlNodes(unEscapeContent, namespaceOption,
                                    input.getPipelineContext().getProcessor(), builder);
                        } else {
                            writeXmlNodes(unEscapeContent, namespaceOption,
                                    input.getPipelineContext().getProcessor(), builder);
                        }
                        builder.endElement();
                    } else {
                        builder.nodes(aNode);
                    }
                }
                builder.endDocument();
                output.writeNodes(XProcPorts.RESULT, builder.getNode());
            }
        } else {
            final SaxonProcessorDelegate escapeDelegate = new CopyingSaxonProcessorDelegate() {
                @Override
                public EnumSet<NextSteps> startElement(final XdmNode node, final SaxonBuilder builder) {
                    builder.startElement(node.getNodeName(), node);
                    return EnumSet.of(NextSteps.PROCESS_ATTRIBUTES, NextSteps.PROCESS_CHILDREN,
                            NextSteps.START_CONTENT);
                }

                @Override
                public void text(final XdmNode node, final SaxonBuilder builder) {
                    final String unEscapeContent;
                    try {
                        unEscapeContent = getUnEscapeContent(node.getStringValue(), encodingOption, contentType,
                                charset);
                    } catch (IOException e) {
                        throw XProcExceptions.xc0010(node);
                    }
                    if (MediaTypes.MEDIA_TYPE_HTML.equals(contentType.getBaseType())) {
                        writeHtmlNodes(unEscapeContent, namespaceOption, input.getPipelineContext().getProcessor(),
                                builder);
                    } else {
                        writeXmlNodes(unEscapeContent, namespaceOption, input.getPipelineContext().getProcessor(),
                                builder);
                    }
                }
            };
            final SaxonProcessor escapeProcessor = new SaxonProcessor(input.getPipelineContext().getProcessor(),
                    escapeDelegate);

            final XdmNode result = escapeProcessor.apply(sourceDocument);
            output.writeNodes(XProcPorts.RESULT, result);
        }
    }

    private static void writeXmlNodes(final String unEscapeContent, final String namespaceOption,
            final Processor processor, final SaxonBuilder builder) {
        final XdmNode parsedNode = Saxon.parse("<z>" + unEscapeContent + "</z>", processor);
        final Iterable<XdmNode> childNodes = SaxonAxis.childNodes(SaxonAxis.childElement(parsedNode));
        for (final XdmNode aNode : childNodes) {
            if (XdmNodeKind.ELEMENT.equals(aNode.getNodeKind())) {
                final QName nameNode = new QName("", namespaceOption, aNode.getNodeName().getLocalName());
                builder.startElement(nameNode, aNode);
                builder.nodes(SaxonAxis.childNodes(aNode));
                builder.endElement();
            } else {
                builder.nodes(aNode);
            }
        }
    }

    private static void writeHtmlNodes(final String unEscapeContent, final String namespaceOption,
            final Processor processor, final SaxonBuilder builder) {
        try {
            final StringReader inputStream = new StringReader(unEscapeContent);
            final InputSource source = new InputSource(inputStream);
            final Parser parser = new Parser();
            final SAXSource saxSource = new SAXSource(parser, source);
            final DocumentBuilder docBuilder = processor.newDocumentBuilder();
            final XdmNode aNode = docBuilder.build(saxSource);
            builder.namespace("", namespaceOption);
            builder.nodes(aNode);
        } catch (final SaxonApiException e) {
            throw XProcExceptions.xc0051(null);
        }
    }

    private static ContentType getContentType(final String content_type, final Step inputStep) {
        try {
            final ContentType ct = new ContentType(content_type);
            if (!Steps.SUPPORTED_CONTENTTYPE.contains(ct.getBaseType())) {
                throw XProcExceptions.xc0051(inputStep.getLocation());
            }
            return ct;
        } catch (final ParseException e) {
            throw XProcExceptions.xc0051(inputStep.getLocation());
        }
    }

    private static String getUnEscapeContent(final String content, final String encoding,
            final ContentType contentType, final String charset) throws IOException {
        if (Steps.ENCODING_BASE64.equals(encoding)) {
            return Steps.getBase64Content(content, charset);
        } else if (MediaTypes.MEDIA_XML.equals(contentType.getBaseType())) {
            return StringEscapeUtils.unescapeHtml(content);
        } else if (MediaTypes.MEDIA_TYPE_HTML.equals(contentType.getBaseType())) {
            return StringEscapeUtils.unescapeXml(content);
        }
        return "";
    }
}