org.chromium.distiller.PageParameterParser.java Source code

Introduction

Here is the source code for org.chromium.distiller.PageParameterParser.java
Source

// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller;

import org.chromium.distiller.proto.DomDistillerProtos;
import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;

import com.google.gwt.dom.client.AnchorElement;
import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.NodeList;
import com.google.gwt.dom.client.Style;
import com.google.gwt.regexp.shared.MatchResult;
import com.google.gwt.regexp.shared.RegExp;

/**
 * Background:
 *   The long article/news/forum thread/blog document may be partitioned into several partial pages
 *   by webmaster.  Each partial page has outlinks pointing to the adjacent partial pages.  The
 *   anchor text of those outlinks is numeric.
 *
 * This class parses the document to collect groups of adjacent plain text numbers and outlinks with
 * digital anchor text.  These are then passed to PageParameterParser which would spit out the
 * pagination URLs if available.
 */
public class PageParameterParser {
    // If the numeric value of a link's anchor text is greater than this number, we don't think it
    // represents the page number of the link.
    private static final int MAX_NUM_FOR_PAGE_PARAM = 100;

    /**
     * Stores PageParamInfo.PageInfo and the anchor's text, specifically returned by
     * getPageInfoAndText().
     */
    private static class PageInfoAndText {
        private final PageParamInfo.PageInfo mPageInfo;
        private final String mText;

        PageInfoAndText(int number, String url, String text) {
            mPageInfo = new PageParamInfo.PageInfo(number, url);
            mText = text;
        }
    }

    /**
     * Entry point for PageParameterParser.
     * Parses the document to collect outlinks with numeric anchor text and numeric text around
     * them.  These are then passed to PageParameterParser to detect pagination URLs.
     *
     * @return PageParamInfo (see PageParamInfo.java), always.  If no page parameter is detected or
     * determined to be best, its mType is PageParamInfo.Type.UNSET.
     *
     * @param originalUrl the original URL of the document to be parsed.
     * @param timingInfo for tracking performance.
     */
    public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
        PageParameterParser parser = new PageParameterParser(timingInfo);
        return parser.parseDocument(Document.get().getDocumentElement(), originalUrl);
    }

    private final TimingInfo mTimingInfo;
    private String mDocUrl = "";
    private ParsedUrl mParsedUrl = null;
    private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();
    private int mNumForwardLinksProcessed = 0;

    private static RegExp sHrefCleaner = RegExp.compile("\\/$");
    private static RegExp sInvalidParentWrapper = null;

    private PageParameterParser(TimingInfo timingInfo) {
        mTimingInfo = timingInfo;
    }

    /**
     * Actually implements PageParameterParser.parse(), see above description for parse().
     */
    private PageParamInfo parseDocument(Element root, String originalUrl) {
        double startTime = DomUtil.getTime();

        mDocUrl = sHrefCleaner.replace(originalUrl, "");
        mParsedUrl = ParsedUrl.create(mDocUrl);
        if (mParsedUrl == null)
            return new PageParamInfo(); // Invalid document URL.

        AnchorElement baseAnchor = PagingLinksFinder
                .createAnchorWithBase(PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));

        NodeList<Element> allLinks = root.getElementsByTagName("A");
        int idx = 0;
        while (idx < allLinks.getLength()) {
            final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));
            PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
            if (pageInfoAndText == null) {
                idx++;
                continue;
            }

            // This link is a good candidate for pagination.

            // Close current group of adjacent numbers, add a new group if necessary.
            mAdjacentNumbersGroups.addGroup();

            // Before we append the link to the new group of adjacent numbers, check if it's
            // preceded by a text node with numeric text; if so, add it before the link.
            findAndAddClosestValidLeafNodes(link, false, true, null);

            // Add the link to the current group of adjacent numbers.
            mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);

            // Add all following text nodes and links with numeric text.
            mNumForwardLinksProcessed = 0;
            findAndAddClosestValidLeafNodes(link, false, false, baseAnchor);

            // Skip the current link and links already processed in the forward
            // findandAddClosestValidLeafNodes().
            idx += 1 + mNumForwardLinksProcessed;
        } // while there're links.

        mAdjacentNumbersGroups.cleanup();

        LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");

        startTime = DomUtil.getTime();
        PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);
        LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
        return info;
    }

    /**
     * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups.
     * Otherwise, returns null if link is to be ignored.
     * "javascript:" links with numeric text are considered valid links to be added.
     *
     * @param link to process.
     * @param baseAnchor created for the current document.
     */
    private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) {
        // Ignore invisible links.
        if (!DomUtil.isVisible(link))
            return null;

        // Use javascript innerText (instead of javascript textContent) to only get visible text.
        String linkText = StringUtil.jsTrim(DomUtil.getInnerText(link));
        int number = linkTextToNumber(linkText);
        if (!isPlainPageNumber(number))
            return null;

        String linkHref = resolveLinkHref(link, baseAnchor);
        final boolean isEmptyHref = linkHref.isEmpty();
        boolean isJavascriptLink = false;
        ParsedUrl url = null;
        if (!isEmptyHref) {
            isJavascriptLink = isJavascriptHref(linkHref);
            url = ParsedUrl.create(linkHref);
            if (url == null || (!isJavascriptLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) {
                return null;
            }
            url.setHash("");
        }

        if (isEmptyHref || isJavascriptLink || isDisabledLink(link)) {
            return new PageInfoAndText(number, "", linkText);
        }

        return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText);
    }

    /**
     * Finds and adds the leaf node(s) closest to the given start node.
     * This recurses and keeps finding and, if necessary, adding the numeric text of valid nodes,
     * collecting the PageParamInfo.PageInfo's for the current adjacency group.
     * For backward search, i.e. nodes before start node, search terminates (i.e. recursion stops)
     * once a text node or anchor is encountered.  If the text node contains numeric text, it's
     * added to the current adjacency group.  Otherwise, a new group is created to break the
     * adjacency.
     * For forward search, i.e. nodes after start node, search continues (i.e. recursion continues)
     * until a text node or anchor with non-numeric text is encountered.  In the process, text nodes
     * and anchors with numeric text are added to the current adjacency group.  When a non-numeric
     * text node or anchor is encountered, a new group is started to break the adjacency, and search
     * ends.
     *
     * @return true to continue search, false to stop.
     *
     * @param start node to work on.
     * @param checkStart true to check start node.  Otherwise, the previous or next sibling of the
     * start node is checked.
     * @param backward true to search backward (i.e. nodes before start node), false to search
     * forward (i.e. nodes after start node).
     * @param baseAnchor created for the current document, only needed for forward search.
     */
    private boolean findAndAddClosestValidLeafNodes(Node start, boolean checkStart, boolean backward,
            AnchorElement baseAnchor) {
        Node node = checkStart ? start : (backward ? start.getPreviousSibling() : start.getNextSibling());
        if (node == null) { // No sibling, try parent.
            node = start.getParentNode();
            if (sInvalidParentWrapper == null) {
                sInvalidParentWrapper = RegExp.compile("(BODY)|(HTML)");
            }
            if (sInvalidParentWrapper.test(node.getNodeName()))
                return false;
            return findAndAddClosestValidLeafNodes(node, false, backward, baseAnchor);
        }

        checkStart = false;
        switch (node.getNodeType()) {
        case Node.TEXT_NODE:
            String text = node.getNodeValue();
            // Text must contain words.
            if (text.isEmpty() || StringUtil.countWords(text) == 0)
                break;
            boolean added = addNonLinkTextIfValid(node.getNodeValue());
            // For backward search, we're done regardless if text was added.
            // For forward search, we're done only if text was invalid, otherwise continue.
            if (backward || !added)
                return false;
            break;

        case Node.ELEMENT_NODE:
            Element e = Element.as(node);
            if (e.hasTagName("A")) {
                // For backward search, we're done because we've already processed the anchor.
                if (backward)
                    return false;
                // For forward search, we're done only if link was invalid, otherwise continue.
                mNumForwardLinksProcessed++;
                if (!addLinkIfValid(AnchorElement.as(e), baseAnchor))
                    return false;
                break;
            }
            // Intentionally fall through.

        default:
            // Check children nodes.
            if (!node.hasChildNodes())
                break;
            checkStart = true; // We want to check the child node.
            if (backward) {
                // Start the backward search with the rightmost child i.e. last and closest to
                // given node.
                node = node.getLastChild();
            } else {
                // Start the forward search with the leftmost child i.e. first and closest to
                // given node.
                node = node.getFirstChild();
            }
            break;
        }

        return findAndAddClosestValidLeafNodes(node, checkStart, backward, baseAnchor);
    }

    private static RegExp sTermsRegExp = null; // Match terms i.e. words.
    private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits.

    /**
     * Handle the text for a non-link node.  Each numeric term in the text that is a valid plain
     * page number adds a PageParamInfo.PageInfo into the current adjacent group.  All other terms
     * break the adjacency in the current group, adding a new group instead.
     *
     * @Return true if text was added to current group of adjacent numbers.  Otherwise, false with
     * a new group created to break the current adjacency.
     */
    private boolean addNonLinkTextIfValid(String text) {
        if (!StringUtil.containsDigit(text)) {
            // The text does not contain valid number(s); if necessary, current group of adjacent
            // numbers should be closed, adding a new group if possible.
            mAdjacentNumbersGroups.addGroup();
            return false;
        }

        if (sTermsRegExp == null) {
            sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi");
        } else {
            sTermsRegExp.setLastIndex(0);
        }
        if (sSurroundingDigitsRegExp == null) {
            sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");
        }

        // Extract terms from the text, differentiating between those that contain only digits and
        // those that contain non-digits.
        boolean added = false;
        while (true) {
            MatchResult match = sTermsRegExp.exec(text);
            if (match == null)
                break;
            if (match.getGroupCount() <= 1)
                continue;

            String term = match.getGroup(1);
            MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);
            int number = -1;
            if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {
                number = StringUtil.toNumber(termWithDigits.getGroup(1));
            }
            if (isPlainPageNumber(number)) {
                // This text is a valid candidate of plain text page number, add it to last group of
                // adjacent numbers.
                mAdjacentNumbersGroups.addNumber(number, "");
                added = true;
            } else {
                // The text is not a valid number, so current group of adjacent numbers should be
                // closed, adding a new group if possible.
                mAdjacentNumbersGroups.addGroup();
            }
        } // while there're matches

        return added;
    }

    /**
     * Adds PageParamInfo.PageInfo to the current adjacent group for a link if its text is numeric.
     * Otherwise, add a new group to break the adjacency.
     *
     * @Return true if link was added, false otherwise.
     */
    private boolean addLinkIfValid(AnchorElement link, AnchorElement baseAnchor) {
        PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
        if (pageInfoAndText != null) {
            mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
            return true;
        }
        mAdjacentNumbersGroups.addGroup();
        return false;
    }

    /**
     * @return true if link is disabled i.e. not clickable because it has a text cursor.
     */
    private static boolean isDisabledLink(AnchorElement link) {
        Style style = DomUtil.getComputedStyle(link);
        return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cursor.TEXT;
    }

    /**
     * @return true if href starts with "javascript:".
     */
    private static boolean isJavascriptHref(String href) {
        return href.startsWith("javascript:");
    }

    private static String resolveLinkHref(AnchorElement link, AnchorElement baseAnchor) {
        // Anchors without "href" attribute are not considered potential pagination links.
        String linkHref = link.getAttribute("href");
        if (linkHref.isEmpty())
            return "";
        baseAnchor.setAttribute("href", linkHref);
        return baseAnchor.getHref();
    }

    private static int linkTextToNumber(String linkText) {
        linkText = linkText.replaceAll("[()\\[\\]{}]", "");
        linkText = linkText.trim(); // Remove leading and trailing white spaces.
        return StringUtil.toNumber(linkText);
    }

    /**
     * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.
     */
    private static boolean isPlainPageNumber(int number) {
        return number >= 0 && number <= MAX_NUM_FOR_PAGE_PARAM;
    }

}