org.chromium.distiller.ContentExtractor.java Source code

Introduction

Here is the source code for org.chromium.distiller.ContentExtractor.java
Source

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller;

import org.chromium.distiller.document.TextDocument;
import org.chromium.distiller.document.TextDocumentStatistics;
import org.chromium.distiller.extractors.ArticleExtractor;
import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry;
import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
import org.chromium.distiller.webdocument.DomConverter;
import org.chromium.distiller.webdocument.WebDocument;
import org.chromium.distiller.webdocument.WebDocumentBuilder;
import org.chromium.distiller.webdocument.WebImage;
import org.chromium.distiller.webdocument.filters.RelevantElements;
import org.chromium.distiller.webdocument.filters.LeadImageFinder;
import org.chromium.distiller.webdocument.filters.NestedElementRetainer;

import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.NodeList;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

/**
 * Drives the distillation of a page: parses markup metadata, converts the DOM into a
 * {@link WebDocument}, runs the article-extraction filters over it and produces the output
 * string, while recording timing and statistics for the run.
 */
public class ContentExtractor {
    private final Element documentElement;
    private final List<String> candidateTitles;
    private final TimingInfo mTimingInfo;
    private final StatisticsInfo mStatisticsInfo;
    private final MarkupParser parser;
    private final List<String> imageUrls;
    private String textDirection;

    /**
     * Plain holder for the results of walking the DOM. Declared static because it never reads
     * state from the enclosing extractor, so it should not carry a hidden reference to it.
     */
    private static class WebDocumentInfo {
        WebDocument document;
        Set<Node> hiddenElements;
    }

    /**
     * Creates an extractor for the given root and immediately runs the markup parser on it,
     * recording how long that parsing took.
     *
     * @param root the element the extraction is based on.
     */
    public ContentExtractor(Element root) {
        documentElement = root;
        candidateTitles = new LinkedList<String>();
        mTimingInfo = TimingInfo.create();
        mStatisticsInfo = StatisticsInfo.create();
        imageUrls = new ArrayList<String>();

        double startTime = DomUtil.getTime();
        parser = new MarkupParser(root, mTimingInfo);
        mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);
        textDirection = "";
    }

    // Grabs a list of candidate titles in descending priority order:
    // 1) meta-information
    // 2) The document's title element, modified based on some readability heuristics
    // 3) The document's title element, if it's a string
    // The list is only populated once; later calls return immediately.
    private void ensureTitleInitialized() {
        if (!candidateTitles.isEmpty()) {
            return;
        }

        String parsedTitle = parser.getTitle();
        if (!parsedTitle.isEmpty()) {
            candidateTitles.add(parsedTitle);
        }

        String documentTitle = Document.get().getTitle();
        candidateTitles.add(DocumentTitleGetter.getDocumentTitle(documentTitle,
                Document.get().getDocumentElement()));
        // NOTE(review): presumably guards against a non-string title value once compiled to
        // JavaScript; confirm before simplifying.
        if (documentTitle.getClass() == String.class) {
            candidateTitles.add(documentTitle);
        }
    }

    /**
     * @return the markup parser created for the root element in the constructor.
     */
    public MarkupParser getMarkupParser() {
        return parser;
    }

    /**
     * Returns the highest-priority candidate title, computing the candidates on first use.
     * @return the first entry of the candidate title list.
     */
    public String extractTitle() {
        ensureTitleInitialized();
        assert candidateTitles.size() > 0;
        return candidateTitles.get(0);
    }

    /**
     * Extracts the content with {@code textOnly} set to false.
     * @return the generated output of the processed document.
     */
    public String extractContent() {
        return extractContent(false);
    }

    /**
     * Builds a WebDocument from the page, runs the content filters on it, collects the content
     * image URLs and generates the output. Timing for each phase is stored in the TimingInfo.
     *
     * @param textOnly forwarded to {@code WebDocument.generateOutput}.
     * @return the generated output of the processed document.
     */
    public String extractContent(boolean textOnly) {
        double now = DomUtil.getTime();
        WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();
        mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

        now = DomUtil.getTime();
        processDocument(documentInfo.document);
        RelevantElements.process(documentInfo.document);
        LeadImageFinder.process(documentInfo.document);
        NestedElementRetainer.process(documentInfo.document);

        // Start from an empty list so that running the extraction more than once on the same
        // extractor does not leave duplicated URLs from earlier runs.
        imageUrls.clear();
        List<WebImage> images = documentInfo.document.getContentImages();
        for (WebImage image : images) {
            imageUrls.add(image.getSrc());
        }
        mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);

        now = DomUtil.getTime();
        String html = documentInfo.document.generateOutput(textOnly);
        mTimingInfo.setFormattingTime(DomUtil.getTime() - now);

        if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) {
            for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {
                TimingEntry entry = mTimingInfo.getOtherTimes(i);
                LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime());
            }

            LogUtil.logToConsole("Timing: MarkupParsingTime = " + mTimingInfo.getMarkupParsingTime()
                    + "\nTiming: DocumentConstructionTime = " + mTimingInfo.getDocumentConstructionTime()
                    + "\nTiming: ArticleProcessingTime = " + mTimingInfo.getArticleProcessingTime()
                    + "\nTiming: FormattingTime = " + mTimingInfo.getFormattingTime());
        }
        return html;
    }

    /**
     * Returns timing information about the most recent extraction run.
     * @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics.
     */
    public TimingInfo getTimingInfo() {
        return mTimingInfo;
    }

    /**
     * Returns statistical information about the most recent extraction run.
     * @return an instance of DomDistillerProtos.StatisticsInfo with detailed statistics.
     */
    public StatisticsInfo getStatisticsInfo() {
        return mStatisticsInfo;
    }

    /**
     * Get the page's text directionality ("ltr", "rtl", or "auto").
     * @return The page's text direction (default is "auto").
     */
    public String getTextDirection() {
        if (textDirection == null || textDirection.isEmpty()) {
            textDirection = "auto";
        }
        return textDirection;
    }

    /**
     * Get a list of the content image URLs collected by the most recent call to
     * {@link #extractContent(boolean)}.
     * @return A list of image URLs (the extractor's own list, not a copy).
     */
    public List<String> getImageUrls() {
        return imageUrls;
    }

    /**
     * Get the element of the main article, if any.
     * @param root the element to search under.
     * @return An element of article (not necessarily the html5 article element), or null when
     *         no candidate is found.
     */
    private Element getArticleElement(Element root) {
        NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
        // Having multiple article elements usually indicates a bad case for this shortcut.
        // TODO(wychen): some sites exclude things like title and author in article element.
        if (allArticles.getLength() == 1) {
            return allArticles.getItem(0);
        }
        // Note that the CSS property matching is case sensitive, and "Article" is the correct
        // capitalization.
        String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
        allArticles = DomUtil.querySelectorAll(root, query);
        // It is commonly seen that the article is wrapped separately or in multiple layers.
        if (allArticles.getLength() > 0) {
            return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
        }
        return null;
    }

    /**
     * Converts the original HTML page into a WebDocument for analysis. The walk starts at the
     * article element when one is found and at the root element otherwise.
     *
     * @return the built document together with the hidden elements reported by the converter.
     */
    private WebDocumentInfo createWebDocumentInfoFromPage() {
        WebDocumentInfo info = new WebDocumentInfo();
        WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
        DomConverter converter = new DomConverter(documentBuilder);
        Element walkerRoot = getArticleElement(documentElement);
        if (walkerRoot == null) {
            walkerRoot = documentElement;
        }
        new DomWalker(converter).walk(walkerRoot);
        info.document = documentBuilder.toWebDocument();
        ensureTitleInitialized();
        info.hiddenElements = converter.getHiddenElements();

        return info;
    }

    /**
     * Implements the actual analysis of the page content, identifying the core elements of the
     * page. Also records the content word count in the statistics before the text view is
     * applied back to the document.
     *
     * @param document the WebDocument representation of the page extracted from the DOM.
     */
    private void processDocument(WebDocument document) {
        TextDocument textDocument = document.createTextDocumentView();
        ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
        mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));
        textDocument.applyToModel();
    }
}