com.aipo.container.gadgets.parse.AipoNekoSimplifiedHtmlParser.java Source code

Java tutorial

Introduction

Here is the source code for com.aipo.container.gadgets.parse.AipoNekoSimplifiedHtmlParser.java

Source

/*
 * Aipo is a groupware program developed by Aimluck,Inc.
 * Copyright (C) 2004-2015 Aimluck,Inc.
 * http://www.aipo.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.aipo.container.gadgets.parse;

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.lang.StringUtils;
import org.apache.shindig.common.xml.DomUtil;
import org.apache.shindig.gadgets.GadgetException;
import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
import org.apache.shindig.gadgets.parse.SocialDataTags;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLElements;
import org.cyberneko.html.HTMLEntities;
import org.cyberneko.html.HTMLScanner;
import org.cyberneko.html.HTMLTagBalancer;
import org.cyberneko.html.filters.NamespaceBinder;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.google.common.collect.ImmutableMap;
import com.google.inject.Inject;
import com.google.inject.Singleton;

/**
 * Supports parsing of social markup blocks inside gadget content.
 * &lt;script&gt; elements with types of either "text/os-template" or
 * "text/os-data" are parsed inline into contained DOM hierarchies for
 * subsequent processing by the pipeline and template rewriters.
 */
@Singleton
public class AipoNekoSimplifiedHtmlParser extends GadgetHtmlParser {

    /** Neko element descriptor used for OpenSocial template tags. */
    private static final HTMLElements.Element OSML_TEMPLATE_ELEMENT;

    /** Neko element descriptor used for OpenSocial data tags. */
    private static final HTMLElements.Element OSML_DATA_ELEMENT;

    static {
        // Both descriptors copy the code, flags and "closes" list of Neko's
        // UNKNOWN element and differ only in their tag name.
        HTMLElements.Element unknown = HTMLElements.getElement(HTMLElements.UNKNOWN);

        OSML_TEMPLATE_ELEMENT = new HTMLElements.Element(unknown.code, SocialDataTags.OSML_TEMPLATE_TAG,
                unknown.flags, HTMLElements.BODY, unknown.closes);
        // Passing parent in constructor is ignored.
        // Only allow template tags in BODY
        OSML_TEMPLATE_ELEMENT.parent = new HTMLElements.Element[] { HTMLElements.getElement(HTMLElements.BODY) };

        // data tags are allowed in BODY only, since Neko disallows HEAD elements
        // from having child elements of their own.
        // The descriptor is named after the data tag (it previously reused the
        // template tag name), matching the key it is registered under below.
        OSML_DATA_ELEMENT = new HTMLElements.Element(unknown.code, SocialDataTags.OSML_DATA_TAG, unknown.flags,
                HTMLElements.BODY, unknown.closes);
        OSML_DATA_ELEMENT.parent = new HTMLElements.Element[] { HTMLElements.getElement(HTMLElements.BODY) };
    }

    /** Lookup from OSML tag name to the element descriptor handed to the tag balancer. */
    private static final Map<String, HTMLElements.Element> OSML_ELEMENTS = ImmutableMap.of(
            SocialDataTags.OSML_TEMPLATE_TAG, OSML_TEMPLATE_ELEMENT, SocialDataTags.OSML_DATA_TAG,
            OSML_DATA_ELEMENT);

    /**
     * Creates the parser; annotated for injection.
     *
     * @param documentFactory DOM implementation passed on to the
     *        {@link GadgetHtmlParser} superclass constructor
     */
    @Inject
    public AipoNekoSimplifiedHtmlParser(DOMImplementation documentFactory) {
        super(documentFactory);
    }

    /**
     * Parses {@code source} as a complete HTML document.
     * <p>
     * The content is parsed into a fragment, the fragment's first
     * {@code html} child is attached to the handler's document, and leading
     * scripts are then relocated by {@link #fixNekoWeirdness(Document)}.
     *
     * @param source HTML text to parse
     * @return the populated DOM document
     * @throws GadgetException if reading the source fails with an
     *         {@link IOException}; the I/O error is attached as the cause
     *         instead of being hidden behind a {@code null} return value
     */
    @Override
    protected Document parseDomImpl(String source) throws GadgetException {
        DocumentHandler handler;

        HTMLConfiguration config = newConfiguration();
        try {
            handler = parseHtmlImpl(source, config, new NormalizingTagBalancer());
        } catch (IOException ioe) {
            // Surface the failure rather than returning null, which would only
            // defer the problem to whoever dereferences the result.
            throw new GadgetException(GadgetException.Code.INTERNAL_SERVER_ERROR,
                    "I/O error while parsing HTML document", ioe);
        }

        Document document = handler.getDocument();
        document.appendChild(DomUtil.getFirstNamedChildNode(handler.getFragment(), "html"));
        fixNekoWeirdness(document);
        return document;
    }

    /**
     * Parses {@code source} as an HTML fragment.
     * <p>
     * Neko is switched to document-fragment balancing and given a context
     * stack of {@code HTML} and {@code BODY}, so the content is balanced as if
     * it already sat inside a body element.
     *
     * @param source HTML text to parse
     * @return the fragment holding the parsed nodes
     * @throws GadgetException if reading the source fails with an
     *         {@link IOException}; the I/O error is attached as the cause
     *         instead of being hidden behind a {@code null} return value
     */
    @Override
    protected DocumentFragment parseFragmentImpl(String source) throws GadgetException {
        DocumentHandler handler;

        HTMLConfiguration config = newConfiguration();
        // http://cyberneko.org/html/features/balance-tags/document-fragment
        // deprecated http://cyberneko.org/html/features/document-fragment
        config.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
        config.setProperty("http://cyberneko.org/html/properties/balance-tags/fragment-context-stack",
                new QName[] { new QName(null, "HTML", "HTML", null), new QName(null, "BODY", "BODY", null) });

        try {
            handler = parseHtmlImpl(source, config, new NekoPatchTagBalancer());
        } catch (IOException ioe) {
            // Surface the failure rather than returning null, which would only
            // defer the problem to whoever dereferences the result.
            throw new GadgetException(GadgetException.Code.INTERNAL_SERVER_ERROR,
                    "I/O error while parsing HTML fragment", ioe);
        }

        return handler.getFragment();
    }

    /**
     * Parse HTML source by wiring a fresh scanner, the supplied tag balancer,
     * a namespace binder and a new document handler into one pipeline and
     * scanning the whole input.
     *
     * @param source HTML text to parse
     * @param config Neko configuration used to reset every pipeline component
     * @param tagBalancer balancer placed between the scanner and the
     *        namespace binder; it also receives the scanner via
     *        {@code setScanner}
     * @return a document handler containing the parsed source
     * @throws IOException declared for the scan; propagated to the caller
     */
    private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config,
            NormalizingTagBalancer tagBalancer) throws IOException {

        HTMLScanner htmlScanner = new HTMLScanner();
        // The balancer needs the scanner to re-evaluate buffered OSML script
        // content.
        tagBalancer.setScanner(htmlScanner);

        DocumentHandler handler = newDocumentHandler(source);

        // Tail of the pipeline: tag balancer -> namespace binder -> handler.
        NamespaceBinder namespaceBinder = new NamespaceBinder();
        namespaceBinder.setDocumentHandler(handler);
        namespaceBinder.setDocumentSource(tagBalancer);
        namespaceBinder.reset(config);
        tagBalancer.setDocumentHandler(namespaceBinder);

        // Head of the pipeline: scanner -> tag balancer. Overall event order is
        // Scanner -> Tag Balancer -> Namespace Binder -> DocumentHandler.
        tagBalancer.setDocumentSource(htmlScanner);
        htmlScanner.setDocumentHandler(tagBalancer);

        tagBalancer.reset(config);
        htmlScanner.reset(config);

        // Feed the in-memory source to the scanner as a character stream.
        XMLInputSource inputSource = new XMLInputSource(null, null, null);
        inputSource.setEncoding("UTF-8");
        inputSource.setCharacterStream(new StringReader(source));
        htmlScanner.setInputSource(inputSource);
        htmlScanner.scanDocument(true);
        return handler;
    }

    /**
     * Relocates every {@code script} child of {@code head} to the top of
     * {@code body}, preserving their relative order. Missing {@code head} or
     * {@code body} elements are created on the way.
     * <p>
     * Background: Neko versions after 1.9.13 push all leading script nodes
     * into head, which breaks gadget assumptions such as the existence of
     * document.body. Neko cannot simply be told to keep scripts out of head,
     * because content like
     * {@code <Content><script>...</script><style>...</style>} would then land
     * script and style in body, where Neko drops style (and link) elements.
     * Moving the scripts afterwards avoids both problems.
     * <p>
     * This method assumes a normalized document as input.
     *
     * @param document document whose head scripts are moved in place
     */
    private void fixNekoWeirdness(Document document) {
        Node root = DomUtil.getFirstNamedChildNode(document, "html");
        Node afterRoot = root.getNextSibling();
        if (afterRoot != null && afterRoot.getNodeName().equalsIgnoreCase("html")) {
            // With a doctype present the wanted <html> root follows a wrapping
            // <HTML> node; use that following node as the root.
            root = afterRoot;
        }

        Node head = DomUtil.getFirstNamedChildNode(root, "head");
        if (head == null) {
            head = document.createElement("head");
            root.insertBefore(head, root.getFirstChild());
        }

        // Collect the scripts before touching the tree; nothing is removed
        // from head while its child list is being walked.
        Stack<Node> scriptsInHead = new Stack<Node>();
        NodeList headChildren = head.getChildNodes();
        for (int index = 0; index < headChildren.getLength(); index++) {
            Node candidate = headChildren.item(index);
            if (candidate.getNodeName().equalsIgnoreCase("script")) {
                scriptsInHead.push(candidate);
            }
        }

        Node body = DomUtil.getFirstNamedChildNode(root, "body");
        if (body == null) {
            body = document.createElement("body");
            root.insertBefore(body, head.getNextSibling());
        }

        // Inserting each script, in head order, in front of the node that was
        // originally first in body leaves the scripts at the top of body in
        // their original order.
        Node originalFirst = body.getFirstChild();
        for (Node script : scriptsInHead) {
            head.removeChild(script);
            body.insertBefore(script, originalFirst);
        }
    }

    /**
     * Builds the Neko configuration used as the starting point for both
     * document and fragment parsing.
     *
     * @return a new configuration that keeps element and attribute name case,
     *         reports character and built-in entity references, and enables
     *         namespace processing
     */
    protected HTMLConfiguration newConfiguration() {
        // Maintain original case for elements and attributes
        final String[][] nameHandling = {
            { "http://cyberneko.org/html/properties/names/elems", "match" },
            { "http://cyberneko.org/html/properties/names/attrs", "no-change" },
        };
        // Get notified of entity and character references; namespaces on
        final String[] enabledFeatures = {
            "http://apache.org/xml/features/scanner/notify-char-refs",
            "http://cyberneko.org/html/features/scanner/notify-builtin-refs",
            "http://xml.org/sax/features/namespaces",
        };

        final HTMLConfiguration configuration = new HTMLConfiguration();
        for (String[] property : nameHandling) {
            configuration.setProperty(property[0], property[1]);
        }
        for (String feature : enabledFeatures) {
            configuration.setFeature(feature, true);
        }
        return configuration;
    }

    /**
     * Factory hook for the XNI handler that builds the DOM.
     *
     * @param source the HTML text about to be parsed
     * @return a new handler constructed from {@code source}
     */
    protected DocumentHandler newDocumentHandler(String source) {
        final DocumentHandler documentHandler = new DocumentHandler(source);
        return documentHandler;
    }

    /**
     * Handler for XNI events from Neko.
     * <p>
     * Builds a DOM document fragment from the event stream. Character data is
     * accumulated in a buffer and written out as a text node whenever another
     * node is about to be appended or an element ends.
     */
    protected class DocumentHandler implements XMLDocumentHandler {

        /** Currently open nodes; the document fragment is pushed first. */
        private final Stack<Node> elementStack = new Stack<Node>();

        /** Pending text that has not yet been written to the DOM. */
        private final StringBuilder builder;

        /** True between startGeneralEntity and endGeneralEntity. */
        private boolean inEntity = false;

        private DocumentFragment documentFragment;

        private Document document;

        /**
         * @param content the source about to be parsed; only its length is
         *        used, to size the text buffer at a tenth of it
         */
        public DocumentHandler(String content) {
            builder = new StringBuilder(content.length() / 10);
        }

        /** @return the fragment that receives all parsed nodes */
        public DocumentFragment getFragment() {
            return documentFragment;
        }

        /** @return the document that owns the parsed nodes */
        public Document getDocument() {
            return document;
        }

        /**
         * Creates the owning document with an XHTML 1.0 Transitional doctype
         * and an empty fragment, and makes the fragment the current parent.
         */
        public void startDocument(XMLLocator xmlLocator, String encoding, NamespaceContext namespaceContext,
                Augmentations augs) throws XNIException {
            document = documentFactory.createDocument(null, null,
                    documentFactory.createDocumentType("html", "-//W3C//DTD XHTML 1.0 Transitional//EN",
                            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"));
            elementStack.clear();
            documentFragment = document.createDocumentFragment();
            elementStack.push(documentFragment);
        }

        /**
         * Writes a textual rendering of the XML declaration into the text
         * buffer; null parts are omitted.
         */
        public void xmlDecl(String version, String encoding, String standalone, Augmentations augs)
                throws XNIException {
            // Dont really do anything with this
            builder.append("<?xml");
            if (version != null) {
                builder.append(" version=\"").append(version).append('\"');
            }
            if (encoding != null) {
                builder.append(" encoding=\"").append(encoding).append('\"');
            }
            if (standalone != null) {
                builder.append(" standalone=\"").append(standalone).append('\"');
            }
            // NOTE(review): the rendering is closed with '>' rather than "?>" --
            // confirm whether that is intentional before changing it.
            builder.append('>');
        }

        /**
         * Replaces the document and fragment created in startDocument with
         * ones that use the doctype declared in the source. Anything appended
         * to the earlier fragment is no longer reachable from this handler.
         */
        public void doctypeDecl(String rootElement, String publicId, String systemId, Augmentations augs)
                throws XNIException {
            document = documentFactory.createDocument(null, null,
                    documentFactory.createDocumentType(rootElement, publicId, systemId));
            elementStack.clear();
            documentFragment = document.createDocumentFragment();
            elementStack.push(documentFragment);
        }

        /** Appends the comment as a DOM comment node under the current parent. */
        public void comment(XMLString text, Augmentations augs) throws XNIException {
            flushTextBuffer();

            // Add comments as comment nodes - needed to support sanitization
            // of SocialMarkup-parsed content
            Node comment = getDocument().createComment(new String(text.ch, text.offset, text.length));
            appendChild(comment);
        }

        /** Processing instructions are ignored. */
        public void processingInstruction(String s, XMLString xmlString, Augmentations augs) throws XNIException {
            // No-op
        }

        /** Appends a new element and makes it the current parent. */
        public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augs) throws XNIException {
            Element element = startElementImpl(qName, xmlAttributes);
            // Not an empty element, so push on the stack
            elementStack.push(element);
        }

        /** Appends a new element without making it the current parent. */
        public void emptyElement(QName qName, XMLAttributes xmlAttributes, Augmentations augs) throws XNIException {
            startElementImpl(qName, xmlAttributes);
        }

        /**
         * Flush any existing text content to the document. Call this before
         * appending any nodes.
         */
        protected void flushTextBuffer() {
            if (builder.length() > 0) {
                appendChild(document.createTextNode(builder.toString()));
                builder.setLength(0);
            }
        }

        /**
         * Create an Element in the DOM, copy its attributes and append it to
         * the current parent.
         *
         * @throws DOMException if an attribute cannot be set; for
         *         INVALID_CHARACTER_ERR the exception carries a message that
         *         describes the offending tag and has the original exception
         *         as its cause
         */
        private Element startElementImpl(QName qName, XMLAttributes xmlAttributes) {
            flushTextBuffer();

            Element element;
            // Preserve XML namespace if present
            if (qName.uri != null) {
                element = document.createElementNS(qName.uri, qName.rawname);
            } else {
                element = document.createElement(qName.rawname);
            }

            for (int i = 0; i < xmlAttributes.getLength(); i++) {
                if (xmlAttributes.getURI(i) != null) {
                    element.setAttributeNS(xmlAttributes.getURI(i), xmlAttributes.getQName(i),
                            xmlAttributes.getValue(i));
                } else {
                    try {
                        element.setAttribute(xmlAttributes.getLocalName(i), xmlAttributes.getValue(i));
                    } catch (DOMException e) {
                        if (e.code != DOMException.INVALID_CHARACTER_ERR) {
                            throw e;
                        }
                        throw describeInvalidCharacter(e, qName, xmlAttributes);
                    }
                }
            }
            appendChild(element);
            return element;
        }

        /**
         * Builds the INVALID_CHARACTER_ERR exception reported to callers: the
         * original message followed by a rendering of the start tag, listing
         * every attribute whose name and value are both non-blank.
         *
         * @param original the exception raised while setting an attribute;
         *        kept as the cause of the returned exception
         * @return the exception to throw
         */
        private DOMException describeInvalidCharacter(DOMException original, QName qName,
                XMLAttributes xmlAttributes) {
            StringBuilder sb = new StringBuilder();
            // A null message must not turn the report into a NullPointerException.
            if (original.getMessage() != null) {
                sb.append(original.getMessage());
            }
            sb.append("Around ...<");
            if (qName.prefix != null) {
                sb.append(qName.prefix);
                sb.append(':');
            }
            sb.append(qName.localpart);
            for (int j = 0; j < xmlAttributes.getLength(); j++) {
                if (StringUtils.isNotBlank(xmlAttributes.getLocalName(j))
                        && StringUtils.isNotBlank(xmlAttributes.getValue(j))) {
                    sb.append(' ');
                    sb.append(xmlAttributes.getLocalName(j));
                    sb.append("=\"");
                    sb.append(xmlAttributes.getValue(j)).append('\"');
                }
            }
            sb.append("...");
            DOMException detailed = new DOMException(DOMException.INVALID_CHARACTER_ERR, sb.toString());
            // DOMException has no cause-taking constructor; attach it afterwards.
            detailed.initCause(original);
            return detailed;
        }

        /**
         * Writes the entity reference itself into the text buffer and marks
         * that an entity is open. Numeric references ("#NN" or "#xNN") are
         * written by name when HTMLEntities knows one for that code point.
         */
        public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
                throws XNIException {
            if (name.startsWith("#")) {
                try {
                    boolean hex = name.startsWith("#x");
                    int offset = hex ? 2 : 1;
                    int base = hex ? 16 : 10;
                    int value = Integer.parseInt(name.substring(offset), base);
                    String entity = HTMLEntities.get(value);
                    if (entity != null) {
                        name = entity;
                    }
                } catch (NumberFormatException e) {
                    // ignore: the reference is written out under its original name
                }
            }
            printEntity(name);
            inEntity = true;
        }

        /** Appends {@code &name;} to the text buffer. */
        private void printEntity(String name) {
            builder.append('&');
            builder.append(name);
            builder.append(';');
        }

        /** Appends the first argument to the text buffer. */
        public void textDecl(String s, String s1, Augmentations augs) throws XNIException {
            builder.append(s);
        }

        /** Marks the current entity as closed. */
        public void endGeneralEntity(String s, Augmentations augs) throws XNIException {
            inEntity = false;
        }

        /**
         * Buffers character data. Characters reported while an entity is open
         * are skipped, because the reference was already written by
         * startGeneralEntity.
         */
        public void characters(XMLString text, Augmentations augs) throws XNIException {
            if (inEntity) {
                return;
            }
            builder.append(text.ch, text.offset, text.length);
        }

        /** Buffers whitespace exactly like other character data. */
        public void ignorableWhitespace(XMLString text, Augmentations augs) throws XNIException {
            builder.append(text.ch, text.offset, text.length);
        }

        /** Writes pending text into the closing element and pops it. */
        public void endElement(QName qName, Augmentations augs) throws XNIException {
            flushTextBuffer();
            elementStack.pop();
        }

        /** CDATA boundaries are ignored. */
        public void startCDATA(Augmentations augs) throws XNIException {
            // No-op
        }

        /** CDATA boundaries are ignored. */
        public void endCDATA(Augmentations augs) throws XNIException {
            // No-op
        }

        /** Writes any pending text and pops the top of the element stack. */
        public void endDocument(Augmentations augs) throws XNIException {
            flushTextBuffer();
            elementStack.pop();
        }

        /** The document source is not recorded. */
        public void setDocumentSource(XMLDocumentSource xmlDocumentSource) {
        }

        /** @return always {@code null}; no document source is recorded */
        public XMLDocumentSource getDocumentSource() {
            return null;
        }

        /** Appends {@code node} to the node on top of the element stack. */
        private void appendChild(Node node) {
            elementStack.peek().appendChild(node);
        }
    }

    /**
     * Used when parsing document fragments to correct a bug in Neko 1.9.13. We
     * use the
     * http://cyberneko.org/html/properties/balance-tags/fragment-context-stack
     * property of Neko to force the fragment to be parsed as if it were already
     * container in a body tag. This doesnt quite work together as without this
     * fix it will still introduce head tags if the first parsed tags are allowed
     * in a head tag. See
     * https://sourceforge.net/tracker/?func=detail&atid=952178&
     * aid=2870180&group_id=195122
     */
    private static class NekoPatchTagBalancer extends NormalizingTagBalancer {

        /**
         * Override the document start to record whether HTML, HEAD or BODY have
         * been seen
         */
        @Override
        public void startDocument(XMLLocator locator, String encoding, NamespaceContext nscontext,
                Augmentations augs) throws XNIException {

            super.startDocument(locator, encoding, nscontext, augs);
            for (int i = fElementStack.top - 1; i >= 0; i--) {
                fSeenAnything = true;
                if (fElementStack.data[i].element.code == HTMLElements.HTML) {
                    fSeenRootElement = true;
                }
                if (fElementStack.data[i].element.code == HTMLElements.HEAD) {
                    fSeenHeadElement = true;
                }
                if (fElementStack.data[i].element.code == HTMLElements.BODY) {
                    fSeenBodyElement = true;
                }
            }
        }
    }

    /**
     * Subclass of Neko's tag balancer that - Normalizes the case of forced html,
     * head and body tags when they don't exist in the original content. - Convert
     * script tags with type=os/* to OSData and OSTemplate. Record their text
     * content and force it to be reparsed. -
     */
    private static class NormalizingTagBalancer extends HTMLTagBalancer {

        private StringBuilder scriptContent;

        private HTMLScanner scanner;

        private QName currentOsmlTag;

        public NormalizingTagBalancer() {
        }

        public void setScanner(HTMLScanner scanner) {
            this.scanner = scanner;
        }

        @Override
        public void characters(XMLString text, Augmentations augs) throws XNIException {
            if (currentOsmlTag != null) {
                scriptContent.append(text.ch, text.offset, text.length);
            } else {
                super.characters(text, augs);
            }
        }

        @Override
        public void startElement(QName elem, XMLAttributes attrs, Augmentations augs) throws XNIException {
            // Normalize the case of forced-elements to lowercase for backward
            // compatability
            if (!fSeenRootElement && elem.rawname.equalsIgnoreCase("html")) {
                elem.localpart = "html";
                elem.rawname = "html";
            } else if (!fSeenHeadElement && elem.rawname.equalsIgnoreCase("head")) {
                elem.localpart = "head";
                elem.rawname = "head";
            } else if (!fSeenBodyElement && elem.rawname.equalsIgnoreCase("body")) {
                elem.localpart = "body";
                elem.rawname = "body";
            }

            // Convert script tags of an OSML type to OSTemplate/OSData tags
            if ("script".equalsIgnoreCase(elem.rawname)) {
                String value = attrs.getValue("type");
                String osmlTagName = SocialDataTags.SCRIPT_TYPE_TO_OSML_TAG.get(value);
                if (osmlTagName != null) {
                    if (currentOsmlTag != null) {
                        throw new XNIException("Nested OpenSocial script elements");
                    }
                    currentOsmlTag = new QName(null, osmlTagName, osmlTagName, null);
                    if (scriptContent == null) {
                        scriptContent = new StringBuilder();
                    }
                    // Remove the type attribute
                    attrs.removeAttributeAt(attrs.getIndex("type"));
                    super.startElement(currentOsmlTag, attrs, augs);
                    return;
                }
            }

            super.startElement(elem, attrs, augs);
        }

        @Override
        public void endElement(QName element, Augmentations augs) throws XNIException {
            if (currentOsmlTag != null && "script".equalsIgnoreCase(element.rawname)) {
                QName endingTag = currentOsmlTag;
                currentOsmlTag = null;

                XMLInputSource scriptSource = new XMLInputSource(null, null, null);
                scriptSource.setCharacterStream(new StringReader(scriptContent.toString()));
                scriptContent.setLength(0);

                // Evaluate the content of the script block immediately
                scanner.evaluateInputSource(scriptSource);

                super.endElement(endingTag, augs);
            } else {
                super.endElement(element, augs);
            }
        }

        @Override
        protected HTMLElements.Element getElement(QName elementName) {
            HTMLElements.Element osmlElement = OSML_ELEMENTS.get(elementName.localpart);
            if (osmlElement != null) {
                return osmlElement;
            }
            return super.getElement(elementName);
        }
    }
}