org.apache.any23.extractor.microdata.MicrodataParser.java Source code

Introduction

Here is the source code for org.apache.any23.extractor.microdata.MicrodataParser.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.any23.extractor.microdata;

import org.apache.any23.extractor.html.DomUtils;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;

import java.io.PrintStream;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * This class provides utility methods for handling <b>Microdata</b>
 * nodes contained within a <i>DOM</i> document.
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class MicrodataParser {

    /**
     * Policy applied by the parser when an error is reported while extracting Microdata.
     */
    enum ErrorMode {
        /** The reported error is rethrown immediately, aborting the extraction. */
        STOP_AT_FIRST_ERROR,
        /** The reported error is collected; all collected errors are exposed by {@link #getErrors()}. */
        FULL_REPORT
    }

    /**
     * Document this parser was constructed with; never <code>null</code>.
     */
    private final Document document;

    /**
     * This set holds the references (<code>itemref</code> ids) currently being dereferenced.
     * {@link #deferProperties(Node, String...)} first checks whether the
     * required dereference has already been asked for and, if so, raises
     * a loop detection error. This set works in coordination
     * with {@link #dereferenceRecursionCounter}, so that when the outermost
     * {@link #deferProperties(Node, String...)} call ends the
     * {@link #loopDetectorSet} can be cleaned up.
     */
    private final Set<String> loopDetectorSet = new HashSet<>();

    /**
     * {@link ItemScope} cache, keyed by the DOM node describing the item scope.
     */
    private final Map<Node, ItemScope> itemScopes = new HashMap<>();

    /**
     * {@link ItemPropValue} cache, keyed by the DOM node the value was read from.
     */
    private final Map<Node, ItemPropValue> itemPropValues = new HashMap<>();

    /**
     * Counts the nesting depth of {@link #deferProperties(Node, String...)} calls.
     * It helps to clean up the {@link #loopDetectorSet} when the recursion ends.
     */
    private int dereferenceRecursionCounter = 0;

    /**
     * Current error mode; defaults to {@link ErrorMode#FULL_REPORT}.
     */
    private ErrorMode errorMode = ErrorMode.FULL_REPORT;

    /**
     * List of collected errors. Used when {@link #errorMode} <code>==</code> {@link ErrorMode#FULL_REPORT}.
     */
    private final List<MicrodataParserException> errors = new ArrayList<>();

    /** Name of the attribute marking an element as a Microdata item scope. */
    public static final String ITEMSCOPE_ATTRIBUTE = "itemscope";
    /** Name of the attribute marking an element as a Microdata item property. */
    public static final String ITEMPROP_ATTRIBUTE = "itemprop";

    /**
     * Unmodifiable set of lower-case tag names whose property value is read
     * from the <code>src</code> attribute.
     */
    public static final Set<String> SRC_TAGS = Collections
            .unmodifiableSet(new HashSet<String>(Arrays.asList("audio", "embed", "frame", "iframe", "img", "source",
                    "track", "video", "input", "layer", "script", "textarea")));

    /**
     * Unmodifiable set of lower-case tag names whose property value is read
     * from the <code>href</code> attribute.
     */
    public static final Set<String> HREF_TAGS = Collections
            .unmodifiableSet(new HashSet<String>(Arrays.asList("a", "area", "link")));

    /**
     * Creates a parser bound to the given document.
     *
     * @param document document to be processed, must not be <code>null</code>.
     * @throws NullPointerException if <code>document</code> is <code>null</code>.
     */
    public MicrodataParser(Document document) {
        if (null != document) {
            this.document = document;
            return;
        }
        throw new NullPointerException("Document cannot be null.");
    }

    /**
     * Collects every node below (see {@link DomUtils#findAllByAttributeName}) the given root
     * that carries the {@link #ITEMSCOPE_ATTRIBUTE} attribute.
     *
     * @param node root node to search in.
     * @return list of nodes declaring an <i>itemscope</i>.
     */
    public static List<Node> getItemScopeNodes(Node node) {
        final List<Node> scopeNodes = DomUtils.findAllByAttributeName(node, ITEMSCOPE_ATTRIBUTE);
        return scopeNodes;
    }

    /**
     * Tells whether a node declares the {@link #ITEMSCOPE_ATTRIBUTE} attribute.
     *
     * @param node node to check.
     * @return <code>true</code> if an <i>itemscope</i> attribute value can be read
     *         from the node, <code>false</code> otherwise.
     */
    public static boolean isItemScope(Node node) {
        final String itemScopeValue = DomUtils.readAttribute(node, ITEMSCOPE_ATTRIBUTE, null);
        return itemScopeValue != null;
    }

    /**
     * Collects every node below (see {@link DomUtils#findAllByAttributeName}) the given root
     * that carries the {@link #ITEMPROP_ATTRIBUTE} attribute.
     *
     * @param node root node to search in.
     * @return list of nodes declaring an <i>itemprop</i>.
     */
    public static List<Node> getItemPropNodes(Node node) {
        final List<Node> propNodes = DomUtils.findAllByAttributeName(node, ITEMPROP_ATTRIBUTE);
        return propNodes;
    }

    /**
     * Tells whether a node declares the {@link #ITEMPROP_ATTRIBUTE} attribute.
     *
     * @param node node to check.
     * @return <code>true</code> if an <i>itemprop</i> attribute value can be read
     *         from the node, <code>false</code> otherwise.
     */
    public static boolean isItemProp(Node node) {
        final String itemPropValue = DomUtils.readAttribute(node, ITEMPROP_ATTRIBUTE, null);
        return itemPropValue != null;
    }

    /**
     * Returns the <i>itemScope</i> nodes that are top level items, i.e. those
     * that are not at the same time an <i>itemProp</i>.
     *
     * @param node root node to search in.
     * @return list of detected top level item scopes, in discovery order.
     */
    public static List<Node> getTopLevelItemScopeNodes(Node node) {
        final List<Node> candidates = getItemScopeNodes(node);
        final List<Node> topLevel = new ArrayList<>();
        for (Node candidate : candidates) {
            if (isItemProp(candidate)) {
                // A scope that is also a property is not a top level item.
                continue;
            }
            topLevel.add(candidate);
        }
        // ANY23-131 (nested Microdata are not extracted): the result is deliberately
        // NOT filtered through getUnnestedNodes(...).
        return topLevel;
    }

    /**
     * Extracts all the top level <b>Microdata items</b> found within the given <code>document</code>.
     *
     * @param document document to be processed.
     * @param errorMode error management policy.
     * @return report holding the detected <b>itemscope</b> items and the errors collected by the parser.
     * @throws MicrodataParserException if
     *         <code>errorMode == {@link org.apache.any23.extractor.microdata.MicrodataParser.ErrorMode#STOP_AT_FIRST_ERROR}</code>
     *         and an error occurs.
     */
    public static MicrodataParserReport getMicrodata(Document document, ErrorMode errorMode)
            throws MicrodataParserException {
        final List<Node> topLevelNodes = getTopLevelItemScopeNodes(document);
        final MicrodataParser parser = new MicrodataParser(document);
        parser.setErrorMode(errorMode);

        // Resolve every top level node with the same parser so caches and errors are shared.
        final ItemScope[] detectedScopes = new ItemScope[topLevelNodes.size()];
        int position = 0;
        for (Node topLevelNode : topLevelNodes) {
            detectedScopes[position++] = parser.getItemScope(topLevelNode);
        }
        return new MicrodataParserReport(detectedScopes, parser.getErrors());
    }

    /**
     * Extracts all the top level <b>Microdata items</b> found within the given <code>document</code>
     * using {@link ErrorMode#FULL_REPORT}.
     *
     * @param document document to be processed.
     * @return report holding the detected <b>itemscope</b> items and the collected errors.
     * @throws IllegalStateException if the extraction raises a {@link MicrodataParserException},
     *         which is treated as unexpected in this mode.
     */
    public static MicrodataParserReport getMicrodata(Document document) {
        final MicrodataParserReport report;
        try {
            report = getMicrodata(document, ErrorMode.FULL_REPORT);
        } catch (MicrodataParserException unexpected) {
            throw new IllegalStateException("Unexpected exception.", unexpected);
        }
        return report;
    }

    /**
     * Writes a <i>JSON</i> object containing the list of all extracted Microdata,
     * as described at <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
     * The object always has a <code>"result"</code> array and, only when errors were
     * reported, an additional <code>"errors"</code> array.
     *
     * @param document document to be processed.
     * @param ps the {@link java.io.PrintStream} to write JSON to
     */
    public static void getMicrodataAsJSON(Document document, PrintStream ps) {
        final MicrodataParserReport report = getMicrodata(document);
        final ItemScope[] detected = report.getDetectedItemScopes();
        final MicrodataParserException[] reported = report.getErrors();

        ps.print("{ ");

        // Results section.
        ps.print("\"result\" : [");
        int index = 0;
        while (index < detected.length) {
            if (index != 0) {
                ps.print(", ");
            }
            ps.print(detected[index].toJSON());
            index++;
        }
        ps.print("] ");

        // Errors section, emitted only if there is something to report.
        final boolean hasErrors = reported != null && reported.length > 0;
        if (hasErrors) {
            ps.print(", ");
            ps.print("\"errors\" : [");
            index = 0;
            while (index < reported.length) {
                if (index != 0) {
                    ps.print(", ");
                }
                ps.print(reported[index].toJSON());
                index++;
            }
            ps.print("] ");
        }

        ps.print("}");
    }

    /**
     * Filters out the candidates for which {@link DomUtils#isAncestorOf} reports
     * another candidate as an ancestor, keeping only nodes that are <b>not</b> nested
     * inside one another.
     *
     * @param candidates list of candidate nodes.
     * @return list of unnested nodes, in the original order.
     */
    @SuppressWarnings("unused")
    private static List<Node> getUnnestedNodes(List<Node> candidates) {
        final List<Node> outermost = new ArrayList<>();
        final int total = candidates.size();
        nextCandidate:
        for (int current = 0; current < total; current++) {
            final Node candidate = candidates.get(current);
            for (int other = 0; other < total; other++) {
                // A candidate is never compared against itself (same list position).
                if (other != current && DomUtils.isAncestorOf(candidates.get(other), candidate, true)) {
                    continue nextCandidate;
                }
            }
            outermost.add(candidate);
        }
        return outermost;
    }

    /**
     * Sets the error management policy of this parser.
     *
     * @param errorMode the new policy, must not be <code>null</code>.
     * @throws IllegalArgumentException if <code>errorMode</code> is <code>null</code>.
     */
    public void setErrorMode(ErrorMode errorMode) {
        if (errorMode != null) {
            this.errorMode = errorMode;
            return;
        }
        throw new IllegalArgumentException("errorMode must be not null.");
    }

    /**
     * @return the error management policy currently in use.
     */
    public ErrorMode getErrorMode() {
        final ErrorMode currentMode = this.errorMode;
        return currentMode;
    }

    /**
     * @return a new array holding the errors collected so far; empty if none was collected.
     */
    public MicrodataParserException[] getErrors() {
        // The errors list is final and always initialised, so no null handling is needed.
        final MicrodataParserException[] snapshot = new MicrodataParserException[errors.size()];
        return errors.toArray(snapshot);
    }

    /**
     * Reads the value of a <b>itemprop</b> node.
     * <p>
     * The value is resolved with the following precedence:
     * </p>
     * <ol>
     *   <li>a previously cached value for the same node;</li>
     *   <li>the <code>content</code> attribute, if present (plain value);</li>
     *   <li>the <code>src</code> attribute for tags in {@link #SRC_TAGS} (link);</li>
     *   <li>the <code>href</code> attribute for tags in {@link #HREF_TAGS} (link);</li>
     *   <li>the <code>data</code> attribute for <code>object</code> (link);</li>
     *   <li>the parsed <code>datetime</code> attribute for <code>time</code> (date);</li>
     *   <li>the nested item scope, if the node is an <i>itemscope</i>;</li>
     *   <li>the text content of the node (plain value); only this case is cached.</li>
     * </ol>
     *
     * @param node itemprop node.
     * @return value detected within the given <code>node</code>.
     * @throws MicrodataParserException if the <code>datetime</code> of a <code>time</code> element
     *         cannot be parsed or an error occurs while extracting a nested item scope.
     */
    public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException {
        final ItemPropValue itemPropValue = itemPropValues.get(node);
        if (itemPropValue != null) {
            return itemPropValue;
        }

        // Tag names are compared against ASCII lower-case constants, so the conversion must be
        // locale independent: with the default locale set to Turkish, "IMG" or "LINK" would
        // lower-case to a dotless 'i' and never match.
        final String nodeName = node.getNodeName().toLowerCase(Locale.ROOT);
        if (DomUtils.hasAttribute(node, "content")) {
            return new ItemPropValue(DomUtils.readAttribute(node, "content"), ItemPropValue.Type.Plain);
        }

        if (SRC_TAGS.contains(nodeName)) {
            return new ItemPropValue(DomUtils.readAttribute(node, "src"), ItemPropValue.Type.Link);
        }
        if (HREF_TAGS.contains(nodeName)) {
            return new ItemPropValue(DomUtils.readAttribute(node, "href"), ItemPropValue.Type.Link);
        }

        if ("object".equals(nodeName)) {
            return new ItemPropValue(DomUtils.readAttribute(node, "data"), ItemPropValue.Type.Link);
        }
        if ("time".equals(nodeName)) {
            final String dateTimeStr = DomUtils.readAttribute(node, "datetime");
            final Date dateTime;
            try {
                dateTime = ItemPropValue.parseDateTime(dateTimeStr);
            } catch (ParseException pe) {
                throw new MicrodataParserException(String.format("Invalid format for datetime '%s'", dateTimeStr),
                        node);
            }
            return new ItemPropValue(dateTime, ItemPropValue.Type.Date);
        }

        if (isItemScope(node)) {
            return new ItemPropValue(getItemScope(node), ItemPropValue.Type.Nested);
        }

        // Fallback: the textual content of the node. This is the only value kept in the cache.
        final ItemPropValue newItemPropValue = new ItemPropValue(node.getTextContent(), ItemPropValue.Type.Plain);
        itemPropValues.put(node, newItemPropValue);
        return newItemPropValue;
    }

    /**
     * Returns all the <b>itemprop</b>s for the given <b>itemscope</b> node.
     * <p>
     * The subtree rooted at <code>scopeNode</code> is walked; descendants of elements that
     * declare their own <b>itemscope</b> are not visited, although such an element is itself
     * still collected if it carries an <b>itemprop</b>. An <b>itemprop</b> attribute listing
     * several whitespace separated names produces one {@link ItemProp} per name, all sharing
     * the same value. Errors are routed through the current error mode.
     * </p>
     *
     * @param scopeNode node representing the <b>itemscope</b>
     * @param skipRoot if <code>true</code> the given root <code>node</code>
     *        will be not read as a property, even if it contains the <b>itemprop</b> attribute.
     * @return the list of <b>itemprop</b>s detected within the given <b>itemscope</b>.
     * @throws MicrodataParserException if an error occurs while retrieving an property value
     *         and the error mode is {@link ErrorMode#STOP_AT_FIRST_ERROR}.
     */
    public List<ItemProp> getItemProps(final Node scopeNode, boolean skipRoot) throws MicrodataParserException {
        // Insertion ordered and duplicate free: the walker may report a node more than once.
        final Set<Node> accepted = new LinkedHashSet<>();

        if (!skipRoot) {
            NamedNodeMap attributes = scopeNode.getAttributes();
            if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null) {
                accepted.add(scopeNode);
            }
        }

        // TreeWalker to walk DOM tree starting with the scopeNode. Nodes maybe visited multiple times.
        TreeWalker treeWalker = ((DocumentTraversal) scopeNode.getOwnerDocument()).createTreeWalker(scopeNode,
                NodeFilter.SHOW_ELEMENT, new NodeFilter() {
                    @Override
                    public short acceptNode(Node node) {
                        if (node.getNodeType() == Node.ELEMENT_NODE) {
                            NamedNodeMap attributes = node.getAttributes();
                            if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null && !scopeNode.equals(node)) {
                                accepted.add(node);
                            }

                            if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
                                // Don't visit descendants of nodes that define a new scope
                                return FILTER_REJECT;
                            }
                        }
                        return FILTER_ACCEPT;
                    }
                }, false);

        // To populate accepted we only need to walk the tree.
        while (treeWalker.nextNode() != null) {
            // The filter collects the nodes as a side effect; nothing to do here.
        }

        final List<ItemProp> result = new ArrayList<>();
        for (Node itemPropNode : accepted) {
            final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null);

            if (StringUtils.isBlank(itemProp)) {
                manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode));
                continue;
            }

            // The value depends only on the node, not on the property name: read it once so that
            // a failing node is parsed and reported a single time, however many names it declares.
            final ItemPropValue itemPropValue;
            try {
                itemPropValue = getPropertyValue(itemPropNode);
            } catch (MicrodataParserException mpe) {
                manageError(mpe);
                continue;
            }

            for (String propertyName : itemProp.trim().split("\\s+")) {
                result.add(new ItemProp(DomUtils.getXPathForNode(itemPropNode), propertyName, itemPropValue));
            }
        }
        return result;
    }

    /**
     * Given a node and a list of element ids (the tokens of an <b>itemref</b> attribute) this
     * method returns the <b>itemprops</b> declared by the referenced elements.
     * <p>
     * Each reference is resolved as an element id within the owner document of <code>node</code>
     * (or within the document this parser was created with, when the node has no owner document).
     * Blank references are ignored. A reference that was already requested during the current
     * (possibly recursive) dereferencing is reported as a loop.
     * </p>
     *
     * @param node a {@link org.w3c.dom.Node} to which the refs belong
     * @param refs list of references.
     * @return list of retrieved <b>itemprop</b>s, without duplicates.
     * @throws MicrodataParserException if a loop is detected or a referenced id is unknown and the
     *         error cannot be absorbed by the current error mode.
     */
    public ItemProp[] deferProperties(Node node, String... refs) throws MicrodataParserException {
        dereferenceRecursionCounter++;
        final List<ItemProp> result = new ArrayList<>();
        try {
            Document lookupDocument = node == null ? null : node.getOwnerDocument();
            if (lookupDocument == null) {
                lookupDocument = document;
            }
            for (String ref : refs) {
                if (StringUtils.isBlank(ref)) {
                    // Artefact of tokenising the itemref attribute, not a real reference.
                    continue;
                }
                // Set#add returns false when the reference is already being dereferenced.
                if (!loopDetectorSet.add(ref)) {
                    throw new MicrodataParserException(
                            String.format("Loop detected with depth %d while dereferencing itemProp '%s' .",
                                    dereferenceRecursionCounter - 1, ref),
                            null);
                }
                final Element element = findElementById(lookupDocument, ref);
                if (element == null) {
                    manageError(new MicrodataParserException(String.format("Unknown itemProp id '%s'", ref), null));
                    continue;
                }
                // Add property by property so that partially overlapping references
                // do not introduce duplicates.
                for (ItemProp referencedProp : getItemProps(element, false)) {
                    if (!result.contains(referencedProp)) {
                        result.add(referencedProp);
                    }
                }
            }
        } catch (MicrodataParserException mpe) {
            if (dereferenceRecursionCounter == 1) {
                // Top call: let the error mode decide whether to collect or rethrow.
                manageError(mpe);
            } else {
                // Nested call: propagate to the enclosing dereferencing.
                throw mpe;
            }
        } finally {
            dereferenceRecursionCounter--;
            if (dereferenceRecursionCounter == 0) { // Recursion end, this is the top call.
                loopDetectorSet.clear();
            }
        }
        return result.toArray(new ItemProp[result.size()]);
    }

    /**
     * Finds the element with the given id within a document.
     * {@link Document#getElementById(String)} is tried first; since it only sees attributes
     * the DOM implementation has typed as ID, a scan of plain <code>id</code> attributes is
     * used as fallback.
     *
     * @param doc document to search in.
     * @param id id to look for.
     * @return the first matching element or <code>null</code> if none is found.
     */
    private static Element findElementById(Document doc, String id) {
        final Element direct = doc.getElementById(id);
        if (direct != null) {
            return direct;
        }
        final NodeList allElements = doc.getElementsByTagName("*");
        for (int i = 0; i < allElements.getLength(); i++) {
            final Element candidate = (Element) allElements.item(i);
            if (id.equals(candidate.getAttribute("id"))) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Returns the {@link ItemScope} instance described within the specified <code>node</code>.
     * <p>
     * The scope is built from the <code>id</code>, <code>itemref</code>, <code>itemtype</code> and
     * <code>itemid</code> attributes of the node, from the <b>itemprop</b>s found below it and from
     * the properties obtained by dereferencing the <code>itemref</code> tokens. Built scopes are
     * cached per node.
     * </p>
     *
     * @param node node describing an <i>itemscope</i>.
     * @return instance of ItemScope object.
     * @throws MicrodataParserException if an error occurs while dereferencing properties.
     */
    public ItemScope getItemScope(Node node) throws MicrodataParserException {
        final ItemScope itemScope = itemScopes.get(node);
        if (itemScope != null) {
            return itemScope;
        }

        final String id = DomUtils.readAttribute(node, "id", null);
        final String itemref = DomUtils.readAttribute(node, "itemref", null);
        final String itemType = DomUtils.readAttribute(node, "itemtype", null);
        final String itemId = DomUtils.readAttribute(node, "itemid", null);

        final List<ItemProp> itemProps = getItemProps(node, true);
        // itemref is a whitespace separated list of ids: tokenise it like the itemprop attribute,
        // so that blank values and repeated or non-space separators never yield empty references.
        final String[] itemrefIDs = StringUtils.isBlank(itemref) ? new String[0] : itemref.trim().split("\\s+");
        final ItemProp[] deferredProperties;
        try {
            deferredProperties = deferProperties(node, itemrefIDs);
        } catch (MicrodataParserException mpe) {
            // Attach the scope node to the error before propagating it.
            mpe.setErrorNode(node);
            throw mpe;
        }
        for (ItemProp deferredProperty : deferredProperties) {
            if (itemProps.contains(deferredProperty)) {
                manageError(new MicrodataParserException(
                        String.format("Duplicated deferred itemProp '%s'.", deferredProperty.getName()), node));
                continue;
            }
            itemProps.add(deferredProperty);
        }

        URL type;
        try {
            type = ItemScope.stringToUrl(itemType);
        } catch (IllegalArgumentException e) {
            // An invalid item type is reported and the scope is built without a type.
            manageError(new MicrodataParserException(e.getMessage(), node));
            type = null;
        }

        final ItemScope newItemScope = new ItemScope(DomUtils.getXPathForNode(node),
                itemProps.toArray(new ItemProp[itemProps.size()]), id, itemrefIDs, type, itemId);
        itemScopes.put(node, newItemScope);
        return newItemScope;
    }

    /**
     * Applies the current {@link ErrorMode} to the given error: the error is either
     * collected into the error list or rethrown.
     *
     * @param mpe error to be managed.
     * @throws MicrodataParserException the given error, when the mode is
     *         {@link ErrorMode#STOP_AT_FIRST_ERROR}.
     * @throws IllegalStateException if the current mode is not supported.
     */
    private void manageError(MicrodataParserException mpe) throws MicrodataParserException {
        switch (errorMode) {
        case STOP_AT_FIRST_ERROR:
            throw mpe;
        case FULL_REPORT:
            errors.add(mpe);
            return;
        default:
            throw new IllegalStateException("Unsupported mode " + errorMode);
        }
    }

}