com.dom_distiller.client.IEReadingViewParser.java Source code

Introduction

Here is the source code for com.dom_distiller.client.IEReadingViewParser.java
Source

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package com.dom_distiller.client;

import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;

import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.ImageElement;
import com.google.gwt.dom.client.MetaElement;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.NodeList;
import com.google.gwt.dom.client.Text;

/**
 * This class recognizes and parses the IE Reading View markup tags according to
 * http://ie.microsoft.com/testdrive/browser/readingview, and returns the properties that matter to
 * distilled content - title, date, author, publisher, dominant image, inline images, caption,
 * copyright, and opt-out.  Some properties require the scanning and parsing of a lot of nodes, so
 * each property is scanned for and verified for legitimacy lazily, i.e. only upon request, and the
 * result is cached in the corresponding data member.
 */
public class IEReadingViewParser implements MarkupParser.Accessor {
    // Thresholds used by isImageRelevantBySize().
    private static final int MIN_RELEVANT_IMAGE_WIDTH = 400;
    private static final double MIN_RELEVANT_ASPECT_RATIO = 1.3;
    private static final double MAX_RELEVANT_ASPECT_RATIO = 3.0;

    private final Element mRoot;
    // All <meta> elements under |mRoot|, collected once at construction.
    private final NodeList<Element> mAllMeta;
    // The following data members are initialized to null to indicate that they haven't been
    // determined.  Once determined, they'll either have legitimate or empty values.
    private String mTitle = null;
    private String mDate = null;
    private String mAuthor = null;
    private String mPublisher = null;
    private String mCopyright = null;
    private boolean mOptOut = false;
    private boolean mDoneOptOut = false; // Set to true if mOptOut has been determined.
    private List<MarkupParser.Image> mImages = null;

    /**
     * The object that will extract and verify IE Reading View markup tags from |root| upon request.
     *
     * @param root the element whose descendants are scanned; must not be null.
     */
    public IEReadingViewParser(Element root) {
        mRoot = root;
        mAllMeta = root.getElementsByTagName("META");
    }

    /**
     * @return the content of the first meta tag named "title", or an empty string if there is no
     *         such meta tag or no <title> element under the root.
     */
    @Override
    public String getTitle() {
        if (mTitle == null) {
            findTitle();
        }
        return mTitle;
    }

    // NOTE(review): unlike its siblings, this method carries no @Override - confirm whether
    // MarkupParser.Accessor declares getType().
    public String getType() {
        return ""; // Not supported.
    }

    @Override
    public String getUrl() {
        return ""; // Not supported.
    }

    /**
     * @return a fresh array of the images that either have a caption or are relevant by size; empty
     *         if there are none.
     */
    @Override
    public MarkupParser.Image[] getImages() {
        if (mImages == null) {
            findImages();
        }
        return mImages.toArray(new MarkupParser.Image[mImages.size()]);
    }

    @Override
    public String getDescription() {
        return ""; // Not supported.
    }

    @Override
    public String getPublisher() {
        // The primary indicator is the OpenGraph Protocol "site_name" meta tag, which is handled by
        // OpenGraphProtocolParser.  The secondary indicator is any html tag with the "publisher" or
        // "source_organization" attribute.
        if (mPublisher == null) {
            findPublisher();
        }
        return mPublisher;
    }

    /**
     * @return the content of the first meta tag named "copyright", or an empty string.
     */
    @Override
    public String getCopyright() {
        if (mCopyright == null) {
            findCopyright();
        }
        return mCopyright;
    }

    /**
     * @return the text content of the first element with the "byline-name" class, or an empty
     *         string.
     */
    @Override
    public String getAuthor() {
        if (mAuthor == null) {
            findAuthor();
        }
        return mAuthor;
    }

    /**
     * @return a new article whose publishedTime is the detected date and whose authors array holds
     *         the detected author, or is empty if no author was found.
     */
    @Override
    public MarkupParser.Article getArticle() {
        MarkupParser.Article article = new MarkupParser.Article();
        if (mDate == null) {
            findDate();
        }
        article.publishedTime = mDate;
        String author = getAuthor();
        article.authors = author.isEmpty() ? new String[0] : new String[] { author };
        return article;
    }

    /**
     * @return true if the first meta tag named "IE_RM_OFF" has "true" (case-insensitive) as
     *         content.
     */
    @Override
    public boolean optOut() {
        if (!mDoneOptOut) {
            findOptOut();
        }
        return mOptOut;
    }

    /**
     * Returns the content of the first meta tag whose name matches |name| case-insensitively, or an
     * empty string if there is no such tag.  Never returns null.
     */
    private String getMetaContent(String name) {
        for (int i = 0; i < mAllMeta.getLength(); i++) {
            MetaElement meta = MetaElement.as(mAllMeta.getItem(i));
            // Comparing from |name| keeps this safe even if the element reports no name.
            if (name.equalsIgnoreCase(meta.getName())) {
                String content = meta.getContent();
                return content == null ? "" : content;
            }
        }
        return "";
    }

    private void findTitle() {
        mTitle = "";

        if (mAllMeta.getLength() == 0) {
            return;
        }

        // Make sure there's a <title> element.
        if (mRoot.getElementsByTagName("TITLE").getLength() == 0) {
            return;
        }

        // Extract title text from meta tag with "title" as name.
        mTitle = getMetaContent("title");
    }

    private void findDate() {
        // Get date from any element that includes the "dateline" class.
        Element elem = DomUtil.getFirstElementWithClassName(mRoot, "dateline");
        if (elem != null) {
            // Use javascript textContent (instead of javascript innerText) to include invisible
            // text.
            mDate = DomUtil.javascriptTextContent(elem);
        } else {
            // Otherwise, get date from meta tag with "displaydate" as name.
            mDate = getMetaContent("displaydate");
        }
    }

    private void findAuthor() {
        mAuthor = "";

        // Get author from the first element that includes the "byline-name" class.
        // Note that we ignore the order of this element for now.
        Element elem = DomUtil.getFirstElementWithClassName(mRoot, "byline-name");
        // Use javascript textContent (instead of javascript innerText) to include invisible text.
        if (elem != null) {
            mAuthor = DomUtil.javascriptTextContent(elem);
        }
    }

    private void findPublisher() {
        mPublisher = "";

        // Look for "publisher" or "source_organization" attribute in any html tag; the first
        // element carrying a non-empty value for either wins, with "publisher" taking precedence
        // on the same element.
        NodeList<Element> allElems = mRoot.getElementsByTagName("*");
        for (int i = 0; i < allElems.getLength(); i++) {
            Element e = allElems.getItem(i);
            String publisher = e.getAttribute("publisher");
            if (publisher.isEmpty()) {
                publisher = e.getAttribute("source_organization");
            }
            if (!publisher.isEmpty()) {
                mPublisher = publisher;
                return;
            }
        }
    }

    private void findCopyright() {
        // Get copyright from meta tag with "copyright" as name.
        mCopyright = getMetaContent("copyright");
    }

    private void findImages() {
        mImages = new ArrayList<MarkupParser.Image>();

        NodeList<Element> allImages = mRoot.getElementsByTagName("IMG");
        for (int i = 0; i < allImages.getLength(); i++) {
            ImageElement imgElem = ImageElement.as(allImages.getItem(i));

            // As long as the image has a caption, it's relevant regardless of size;
            // otherwise, it's relevant if its size is good.
            String caption = getCaption(imgElem);
            if (!caption.isEmpty() || isImageRelevantBySize(imgElem)) {
                // Add relevant image to list.
                MarkupParser.Image image = new MarkupParser.Image();
                image.url = imgElem.getSrc();
                image.caption = caption;
                image.width = imgElem.getWidth();
                image.height = imgElem.getHeight();
                mImages.add(image);
            }
        }
    }

    private void findOptOut() {
        mDoneOptOut = true;

        // Get optout from meta tag with "IE_RM_OFF" as name.  A missing tag yields empty content,
        // which leaves the opt-out flag false.
        mOptOut = "true".equalsIgnoreCase(getMetaContent("IE_RM_OFF"));
    }

    /**
     * Returns true if |image| is at least 400 wide and its width/height aspect ratio is between 1.3
     * and 3.0 inclusively.
     */
    private static boolean isImageRelevantBySize(ImageElement image) {
        int width = image.getWidth();
        if (width < MIN_RELEVANT_IMAGE_WIDTH) {
            return false;
        }
        int height = image.getHeight();
        if (height == 0) {
            // The ratio would be infinite, which is outside the accepted range anyway.
            return false;
        }
        double aspectRatio = (double) width / (double) height;
        return aspectRatio >= MIN_RELEVANT_ASPECT_RATIO && aspectRatio <= MAX_RELEVANT_ASPECT_RATIO;
    }

    /**
     * Returns the caption of |image|: if its parent is a <figure> containing one or two
     * <figcaption> elements, the first non-empty visible text among them; otherwise an empty
     * string.  Never returns null.
     */
    private static String getCaption(ImageElement image) {
        // If |image| is a child of <figure>, then get the <figcaption> elements.
        Element parent = image.getParentElement();
        if (parent == null || !parent.hasTagName("FIGURE")) {
            return "";
        }
        NodeList<Element> captions = parent.getElementsByTagName("FIGCAPTION");
        int numCaptions = captions.getLength();
        if (numCaptions == 0 || numCaptions > 2) {
            return "";
        }
        // Use javascript innerText (instead of javascript textContent) to get only visible
        // captions.
        for (int i = 0; i < numCaptions; i++) {
            String text = DomUtil.getInnerText(captions.getItem(i));
            // Guard against a missing innerText so the next caption is still considered.
            if (text != null && !text.isEmpty()) {
                return text;
            }
        }
        return "";
    }

    /**
     * Returns true if the text content of any <body> element under |root| contains |text|,
     * compared case-insensitively.  Currently not called from within this class.
     */
    private static boolean isTextInBody(Element root, String text) {
        String lowerText = text.toLowerCase();
        NodeList<Element> bodies = root.getElementsByTagName("BODY");
        for (int i = 0; i < bodies.getLength(); i++) {
            // Use javascript textContent (instead of javascript innerText) to include invisible
            // text.
            if (DomUtil.javascriptTextContent(bodies.getItem(i)).toLowerCase().contains(lowerText)) {
                return true;
            }
        }
        return false;
    }
}