org.chromium.distiller.DomUtil.java Source code

Introduction

Here is the source code for org.chromium.distiller.DomUtil.java
Source

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.distiller;

import com.google.gwt.core.client.JsArray;
import com.google.gwt.core.client.JsArrayString;
import com.google.gwt.dom.client.AnchorElement;
import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.ImageElement;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.NodeList;
import com.google.gwt.dom.client.Style;
import com.google.gwt.dom.client.VideoElement;
import com.google.gwt.http.client.URL;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DomUtil {
    /**
     * Returns javascript's {@code elem.attributes} for the given element.
     *
     * <p>GWT does not provide a way to get a list of all attributes that have been explicitly set
     * on a DOM element (only a way to query the value of a particular attribute). In javascript,
     * this list is accessible as elem.attributes, and this JSNI method hands that object back
     * as-is (it is not copied).
     *
     * @param elem The element whose attributes are requested.
     * @return The element's attribute list from javascript.
     */
    public static native JsArray<Node> getAttributes(Element elem) /*-{
                                                                   return elem.attributes;
                                                                   }-*/;

    /**
     * Returns the first element with |className| in the tree rooted at |root|, null if none is
     * found.
     *
     * <p>The lookup is done with {@code root.querySelector("." + className)}; |className| is
     * concatenated into the selector without any escaping.
     *
     * @param root The root of the tree to search.
     * @param className The class name to look for (without the leading dot).
     * @return Whatever {@code querySelector} returns for the selector.
     */
    public static native Element getFirstElementWithClassName(Element root, String className) /*-{
                                                                                              return root.querySelector("." + className);
                                                                                              }-*/;

    /**
     * Checks whether an element carries a given class, using javascript's
     * {@code elem.classList.contains}.
     *
     * @param elem The element to inspect.
     * @param className The class name to test for.
     * @return The result of {@code elem.classList.contains(className)}.
     */
    public static native boolean hasClassName(Element elem, String className) /*-{
                                                                              return elem.classList.contains(className);
                                                                              }-*/;

    /**
     * Returns javascript's {@code elem.classList} for the given element, exposed to Java as a
     * {@link JsArrayString}.
     *
     * @param elem The element whose class list is requested.
     * @return The element's {@code classList} object from javascript (not a copy).
     */
    public static native JsArrayString getClassList(Element elem) /*-{
                                                                  return elem.classList;
                                                                  }-*/;

    /**
     * Check to see if a provided URL has the specified root domain (ex. http://a.b.c/foo/bar has
     * root domain of b.c).
     *
     * <p>The URL is parsed by assigning it to a temporary anchor element and reading back the
     * parsed host name, so relative URLs are resolved the same way the browser resolves them.
     *
     * @param url The URL to test.
     * @param root The root domain to test against.
     * @return True if url has the specified root domain; false if either argument is null.
     */
    public static boolean hasRootDomain(String url, String root) {
        if (url == null || root == null) {
            return false;
        }
        AnchorElement anchor = Document.get().createAnchorElement();
        anchor.setHref(url);
        // Read "hostname" rather than "host": "host" carries a ":port" suffix for non-default
        // ports, which would make the suffix comparison below fail for e.g. http://a.b.c:8080/.
        String hostname = anchor.getPropertyString("hostname");
        // Prefix both sides with "." so that "b.c" matches "a.b.c" and "b.c" but not "xb.c".
        return ("." + hostname).endsWith("." + root);
    }

    /**
     * Split URL parameters into key/value pairs and return them in a map.
     *
     * <p>Each "&amp;"-separated entry is split at its first "=", so a value may itself contain
     * "=" characters. Entries without "=" or with an empty value are skipped. Values are passed
     * through {@link URL#decode(String)}; keys are stored as they appear in the query. If a key
     * occurs more than once, the last occurrence wins.
     *
     * @param query The query string after the "?".
     * @return Map of all query parameters or an empty map.
     */
    public static Map<String, String> splitUrlParams(String query) {
        Map<String, String> paramMap = new HashMap<>();
        if (query == null || query.isEmpty()) {
            return paramMap;
        }
        for (String currentParam : query.split("&")) {
            int separator = currentParam.indexOf('=');
            if (separator < 0) {
                // No "=" at all: not a key/value pair.
                continue;
            }
            String value = currentParam.substring(separator + 1);
            if (value.isEmpty()) {
                // "key=" carries no value; such entries have never been reported.
                continue;
            }
            paramMap.put(currentParam.substring(0, separator), URL.decode(value));
        }
        return paramMap;
    }

    /**
     * Returns the computed style of an element by calling javascript's
     * {@code getComputedStyle(el, null)}.
     *
     * @param el DOM element.
     * @return The CSS style of an element after applying the active stylesheets and resolving any
     *     basic computation the style's value(s) may contain.
     */
    public static native Style getComputedStyle(Element el) /*-{
                                                            return getComputedStyle(el, null);
                                                            }-*/;

    /**
     * Decides whether an element counts as visible, based on its computed style.
     *
     * @param e The element to test.
     * @return False if the computed display is "none", the computed visibility is "hidden", or
     *     the computed opacity parses to zero; true otherwise.
     */
    public static boolean isVisible(Element e) {
        final Style computed = getComputedStyle(e);
        final double opacityValue = JavaScript.parseFloat(computed.getOpacity());
        if (computed.getDisplay().equals("none")) {
            return false;
        }
        if (computed.getVisibility().equals("hidden")) {
            return false;
        }
        return opacityValue != 0.0F;
    }

    /**
     * Returns javascript's {@code node.innerText}.
     *
     * <p>We want to use jsni for direct access to javascript's innerText.  This avoids GWT's
     * implementation of Element::getInnerText(), which is intentionally different to mimic an old
     * IE behaviour, which returns text within {@code <script>} tags.
     *
     * @param node The node to read.
     * @return The value of the node's {@code innerText} property.
     */
    public static native String getInnerText(Node node) /*-{
                                                        return node.innerText;
                                                        }-*/;

    /**
     * Returns a timestamp from javascript.
     *
     * <p>Uses {@code window.performance.now()} when {@code window.performance} exists and the
     * global {@code distiller_on_ios} is undefined or falsy; otherwise falls back to
     * {@code Date.now()}. The {@code distiller_on_ios} check comes first so that
     * {@code window.performance} is not even referenced on iOS (see the comment in the body).
     *
     * <p>NOTE(review): the two sources presumably use different time origins, so values are best
     * used only as differences between two calls in the same environment - confirm with callers.
     *
     * @return The value of {@code window.performance.now()} or {@code Date.now()}.
     */
    public static native double getTime() /*-{
                                          // window.performance is unavailable in Gwt's dev environment and even referencing it on iOS
                                          // causes a crash.
                                          if ((typeof distiller_on_ios === 'undefined' || !distiller_on_ios) && window.performance) {
                                          return window.performance.now();
                                          }
                                          return Date.now();
                                          }-*/;

    /**
     * Use jsni for direct access to javascript's textContent.  textContent is different from
     * innerText (see http://www.kellegous.com/j/2013/02/27/innertext-vs-textcontent):
     * - textContent is the raw textual content, doesn't require layout, and is basically a
     *   concatenation of the values of all text nodes within a subtree.
     * - innerText is what is presented to the user, requires layout, and excludes text in invisible
     *   elements, e.g. <title> tags.
     *
     * @param node The node to read.
     * @return The value of the node's {@code textContent} property.
     */
    public static native String javascriptTextContent(Node node) /*-{
                                                                 return node.textContent;
                                                                 }-*/;

    /**
     * Collects a node together with its chain of ancestors.
     *
     * @param n The node to start from; may be null.
     * @return A new list holding n first, followed by each successive parent as reported by
     *     {@code getParentNode()}. The list is empty when n is null.
     */
    public static List<Node> getParentNodes(Node n) {
        final List<Node> chain = new ArrayList<>();
        for (Node current = n; current != null; current = current.getParentNode()) {
            chain.add(current);
        }
        return chain;
    }

    /**
     * Get the depth of the given node in the DOM tree, i.e. the number of ancestors reachable by
     * repeatedly following {@code getParentNode()}. Every ancestor node is counted, whatever its
     * node type.
     *
     * @param n The node to find the depth of.
     * @return The depth of the provided node (0 for a node without a parent); -1 if n is null.
     */
    public static int getNodeDepth(final Node n) {
        // Start at -1 so that the node itself brings the count to 0.
        int depth = -1;
        for (Node current = n; current != null; current = current.getParentNode()) {
            depth++;
        }
        return depth;
    }

    /**
     * Finds the closest node, starting at n1 and walking up through its parents, for which
     * {@code JavaScript.contains(node, n2)} holds.
     *
     * @param n1 First node; the upward walk starts here.
     * @param n2 Second node.
     * @return The first node on the walk that contains n2, or null if the walk runs out of
     *     parents (including when n1 is null).
     */
    public static Node getNearestCommonAncestor(final Node n1, final Node n2) {
        for (Node candidate = n1; candidate != null; candidate = candidate.getParentNode()) {
            if (JavaScript.contains(candidate, n2)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Finds the nearest common ancestor of all nodes in a list by folding the two-node variant
     * over the list, starting from the first item.
     *
     * @param ns The nodes to examine.
     * @return The common ancestor found, or null if the list is empty or no common ancestor
     *     exists.
     */
    public static Node getNearestCommonAncestor(final NodeList ns) {
        final int count = ns.getLength();
        if (count == 0) {
            return null;
        }
        Node ancestor = ns.getItem(0);
        int index = 1;
        while (index < count) {
            ancestor = getNearestCommonAncestor(ancestor, ns.getItem(index));
            index++;
        }
        return ancestor;
    }

    /**
     * Get all text from a tree/sub-tree.
     *
     * <p>The node is appended to the document body for the duration of the call so that style is
     * calculated for it, and is removed from the body again before returning - also when reading
     * the text fails. NOTE(review): a node that already had a parent is presumably not put back
     * under that parent afterwards; callers in this file pass a freshly cloned subtree.
     *
     * @param node The root of the tree.
     * @return The text contained in this tree, as reported by {@link #getInnerText(Node)}.
     */
    public static String getTextFromTree(Node node) {
        // Temporarily add the node to the DOM so that style is calculated.
        Document.get().getBody().appendChild(node);
        try {
            return DomUtil.getInnerText(node);
        } finally {
            // And remove it again, even if reading the text threw, so that the page is never
            // left with the temporary node attached.
            Document.get().getBody().removeChild(node);
        }
    }

    /**
     * Produces the HTML (or plain text) output for a list of relevant nodes.
     *
     * <p>The nodes are expanded into a tree, the tree is cloned, and the clone is cleaned up in
     * place: ids, font colors, table background colors and style attributes are stripped, links
     * are made absolute, and image elements are reduced to a small set of attributes.
     *
     * @param outputNodes The list of nodes in a subtree that are considered relevant.
     * @param textOnly If this function should return text only instead of HTML.
     * @return The cleaned-up content as HTML or text; the empty string if the list is empty or
     *     the cloned subtree root is not an element.
     */
    public static String generateOutputFromList(List<Node> outputNodes, boolean textOnly) {
        if (outputNodes.isEmpty()) {
            return "";
        }

        final Node subtreeCopy = NodeListExpander.expand(outputNodes).cloneSubtreeRetainDirection();
        if (subtreeCopy.getNodeType() != Node.ELEMENT_NODE) {
            return "";
        }

        // Clean-up passes; all of them operate on the clone, in this order.
        stripIds(subtreeCopy);
        makeAllLinksAbsolute(subtreeCopy);
        stripFontColorAttributes(subtreeCopy);
        stripTableBackgroundColorAttributes(subtreeCopy);
        stripStyleAttributes(subtreeCopy);
        stripImageElements(subtreeCopy);

        return textOnly
                ? DomUtil.getTextFromTree(subtreeCopy)
                : Element.as(subtreeCopy).getString();
    }

    /**
     * Rewrites anchors' href and videos' poster so that they hold absolute URLs, for the root
     * itself and for all of its descendants, then delegates to
     * {@link #makeAllSrcAttributesAbsolute(Element)} and the srcset handling.
     *
     * <p>The getters used here (getHref(), getPoster()) report the absolute URI, so writing the
     * value that was just read back into the same property is what makes it absolute. Empty
     * values are left untouched.
     *
     * @param rootNode The root Node to look through.
     */
    public static void makeAllLinksAbsolute(Node rootNode) {
        final Element rootElement = Element.as(rootNode);

        // Anchors: the root first (if it is one), then every descendant anchor.
        if ("A".equals(rootElement.getTagName())) {
            final AnchorElement rootAnchor = AnchorElement.as(rootElement);
            final String rootHref = rootAnchor.getHref();
            if (!rootHref.isEmpty()) {
                rootAnchor.setHref(rootHref);
            }
        }
        final NodeList<Element> anchors = rootElement.getElementsByTagName("A");
        for (int a = 0; a < anchors.getLength(); a++) {
            final AnchorElement anchor = AnchorElement.as(anchors.getItem(a));
            final String href = anchor.getHref();
            if (!href.isEmpty()) {
                anchor.setHref(href);
            }
        }

        // Video posters: same pattern, root first and then descendants.
        if (rootElement.getTagName().equals("VIDEO")) {
            final VideoElement rootVideo = (VideoElement) rootElement;
            final String rootPoster = rootVideo.getPoster();
            if (!rootPoster.isEmpty()) {
                rootVideo.setPoster(rootPoster);
            }
        }
        final NodeList<Element> videos = rootElement.getElementsByTagName("VIDEO");
        for (int v = 0; v < videos.getLength(); v++) {
            final VideoElement video = (VideoElement) videos.getItem(v);
            final String poster = video.getPoster();
            if (!poster.isEmpty()) {
                video.setPoster(poster);
            }
        }

        makeAllSrcAttributesAbsolute(rootElement);
        makeSrcSetAbsolute(rootElement);
    }

    /**
     * Makes the srcset of the root (when the root is an IMG) and of every descendant IMG that
     * has a srcset attribute absolute.
     *
     * @param root The root element to look through.
     */
    private static void makeSrcSetAbsolute(Element root) {
        final boolean rootIsImage = root.getTagName().equals("IMG");
        if (rootIsImage) {
            makeSrcSetAbsolute(ImageElement.as(root));
        }
        final NodeList<Element> images = DomUtil.querySelectorAll(root, "IMG[SRCSET]");
        int index = 0;
        while (index < images.getLength()) {
            makeSrcSetAbsolute(ImageElement.as(images.getItem(index)));
            index++;
        }
    }

    /**
     * Rewrites every URL in an image's srcset attribute as an absolute URL.
     *
     * <p>An empty srcset is removed from the element altogether. Otherwise the attribute is
     * split on commas; within each non-empty entry the part before the first space is treated as
     * the URL. To resolve a URL, it is temporarily assigned to the image's src and read back
     * (getSrc() reports the absolute form); the src that was read at the start is assigned back
     * at the end.
     *
     * @param ie The image element to update in place.
     */
    public static void makeSrcSetAbsolute(ImageElement ie) {
        String srcset = ie.getAttribute("srcset");
        // Compare by content: "==" on Strings only compares references in Java, so the empty
        // check must not rely on it.
        if (srcset == null || srcset.isEmpty()) {
            ie.removeAttribute("srcset");
            return;
        }

        String oldsrc = ie.getSrc();
        String[] sizes = StringUtil.jsSplit(srcset, ",");
        for (int i = 0; i < sizes.length; i++) {
            String size = StringUtil.jsTrim(sizes[i]);
            if (size.isEmpty()) {
                continue;
            }
            String[] comp = size.split(" ");
            // Round-trip the URL through src to let the browser resolve it.
            ie.setSrc(comp[0]);
            comp[0] = ie.getSrc();
            sizes[i] = StringUtil.join(comp, " ");
        }
        ie.setAttribute("srcset", StringUtil.join(sizes, ", "));
        ie.setSrc(oldsrc);
    }

    /**
     * Applies {@link #stripImageElement(ImageElement)} to the root (when it is an IMG element)
     * and to every IMG found below it.
     *
     * @param root The root of the tree to process in place.
     */
    public static void stripImageElements(Node root) {
        final boolean rootIsElement = root.getNodeType() == Node.ELEMENT_NODE;
        if (rootIsElement && Element.as(root).getTagName().equals("IMG")) {
            stripImageElement(ImageElement.as(Element.as(root)));
        }
        final NodeList<Element> images = DomUtil.querySelectorAll(root, "IMG");
        int index = 0;
        while (index < images.getLength()) {
            stripImageElement(ImageElement.as(images.getItem(index)));
            index++;
        }
    }

    /**
     * Only keep some attributes for image elements: src, alt, srcset, dir and title. Every other
     * attribute is removed.
     *
     * @param imgElement The image element to strip in-place.
     */
    public static void stripImageElement(ImageElement imgElement) {
        final JsArray<Node> attributes = getAttributes(imgElement);
        int index = 0;
        // The index only advances past attributes that are kept: after a removal the loop
        // re-reads the same position and the current length of the list.
        while (index < attributes.length()) {
            final String attrName = attributes.get(index).getNodeName();
            final boolean keep = "src".equals(attrName)
                    || "alt".equals(attrName)
                    || "srcset".equals(attrName)
                    || "dir".equals(attrName)
                    || "title".equals(attrName);
            if (keep) {
                index++;
            } else {
                imgElement.removeAttribute(attrName);
            }
        }
    }

    /**
     * Makes all "img", "source", "track", and "video" tags have an absolute "src" attribute.
     *
     * <p>Reading the {@code src} property yields the resolved URL, so assigning the property to
     * itself stores the absolute form. Elements whose {@code src} is empty are left untouched.
     * The root itself is handled as well as its descendants.
     *
     * @param root The root element to look through.
     */
    public static native void makeAllSrcAttributesAbsolute(Element root) /*-{
        if (root.tagName == "IMG" || root.tagName == "SOURCE" || root.tagName == "TRACK" ||
                root.tagName == "VIDEO") {
            if (root.src) {
                root.src = root.src;
            }
        }
        var elementsWithSrc = root.querySelectorAll('img,source,track,video');
        // Iterate by index: for-in over a NodeList also visits non-element properties such as
        // "length" and "item".
        for (var i = 0; i < elementsWithSrc.length; i++) {
            var element = elementsWithSrc[i];
            if (element.src) {
                element.src = element.src;
            }
        }
    }-*/;

    /**
     * Strips some attribute from certain tags in the tree rooted at |rootNode|, including root.
     *
     * <p>Descendants are located with one selector per tag name of the form
     * {@code TAG[attribute]}, so only elements that actually carry the attribute are touched.
     * The caller's |tagNames| array is not modified.
     *
     * @param rootNode The root of the tree to process in place; must be an element.
     * @param attribute The name of the attribute to remove.
     * @param tagNames The tag names to be processed. ["*"] means all.
     */
    public static void stripAttributeFromTags(Node rootNode, String attribute, String[] tagNames) {
        if (tagNames.length == 0) {
            // Nothing to match; also avoids querying with an empty selector.
            return;
        }

        Element root = Element.as(rootNode);
        String rootTag = root.getTagName();
        for (String tag : tagNames) {
            if (tag.equals("*") || rootTag.equals(tag)) {
                root.removeAttribute(attribute);
                break;
            }
        }

        // Build the selectors in a fresh array: reassigning a for-each variable would not change
        // the array, and the caller's array should stay as it was passed in.
        String[] selectors = new String[tagNames.length];
        for (int i = 0; i < tagNames.length; i++) {
            selectors[i] = tagNames[i] + "[" + attribute + "]";
        }
        String query = StringUtil.join(selectors, ", ");
        NodeList<Element> tags = DomUtil.querySelectorAll(root, query);
        for (int i = 0; i < tags.getLength(); i++) {
            tags.getItem(i).removeAttribute(attribute);
        }
    }

    /**
     * Removes the "id" attribute from every node in the tree rooted at |node|, the root included.
     *
     * @param node The root of the tree to process in place.
     */
    public static void stripIds(Node node) {
        final String[] everyTag = { "*" };
        stripAttributeFromTags(node, "ID", everyTag);
    }

    /**
     * Removes the "color" attribute from "font" nodes in the tree rooted at |rootNode|.
     *
     * @param rootNode The root of the tree to process in place.
     */
    public static void stripFontColorAttributes(Node rootNode) {
        final String[] fontTags = { "FONT" };
        stripAttributeFromTags(rootNode, "COLOR", fontTags);
    }

    /**
     * Removes the "bgcolor" attribute from table, tr, td and th nodes in the tree rooted at
     * |rootNode|.
     *
     * @param rootNode The root of the tree to process in place.
     */
    public static void stripTableBackgroundColorAttributes(Node rootNode) {
        final String[] tableTags = { "TABLE", "TR", "TD", "TH" };
        stripAttributeFromTags(rootNode, "BGCOLOR", tableTags);
    }

    /**
     * Removes the "style" attribute from every node in the tree rooted at |rootNode|, the root
     * included.
     *
     * @param rootNode The root of the tree to process in place.
     */
    public static void stripStyleAttributes(Node rootNode) {
        final String[] everyTag = { "*" };
        stripAttributeFromTags(rootNode, "STYLE", everyTag);
    }

    /**
     * Walks a subtree with a {@link DomWalker} and collects the nodes considered relevant:
     * every text node that is visited, and every element for which {@link #isVisible(Element)}
     * is true.
     *
     * @param root The root of the subtree.
     * @return A list of relevant nodes, in the order the walker visits them.
     */
    public static List<Node> getOutputNodes(Node root) {
        final List<Node> collected = new ArrayList<>();
        final DomWalker.Visitor collector = new DomWalker.Visitor() {
            @Override
            public boolean visit(Node n) {
                final int type = n.getNodeType();
                if (type == Node.TEXT_NODE) {
                    collected.add(n);
                    return false;
                }
                if (type == Node.ELEMENT_NODE && DomUtil.isVisible(Element.as(n))) {
                    collected.add(n);
                    return true;
                }
                // Invisible elements, document nodes and any other node type are not collected,
                // and visit() reports false for them.
                return false;
            }

            @Override
            public void exit(Node n) {
            }

            @Override
            public void skip(Element e) {
            }
        };
        new DomWalker(collector).walk(root);
        return collected;
    }

    /**
     * Produces HTML or text output for a node tree/subtree by first collecting its relevant
     * nodes (hidden elements are left out) and then rendering that list.
     *
     * @param subtree The root of the subtree.
     * @param textOnly If this function should return text only and not HTML.
     * @return The output for the provided subtree.
     */
    public static String generateOutputFromTree(Node subtree, boolean textOnly) {
        final List<Node> relevantNodes = getOutputNodes(subtree);
        return generateOutputFromList(relevantNodes, textOnly);
    }

    /**
     * Returns whether querySelectorAll is available.
     *
     * @param root The element to probe.
     * @return True if {@code root.querySelectorAll} is a javascript function.
     */
    public static native boolean supportQuerySelectorAll(Element root) /*-{
                                                                       return (typeof(root.querySelectorAll) == 'function');
                                                                       }-*/;

    /**
     * Calls javascript's {@code querySelectorAll} on the given node.
     *
     * <p>GWT doesn't support querySelectorAll, so testing the caller could be harder.
     *
     * @param l The node on which {@code querySelectorAll} is invoked.
     * @param selectors The selector string passed through unchanged.
     * @return The list returned by {@code l.querySelectorAll(selectors)}.
     */
    public static native NodeList<Element> querySelectorAll(Node l, String selectors) /*-{
                                                                                      return l.querySelectorAll(selectors);
                                                                                      }-*/;

    /**
     * Creates a new HTML document through {@code doc.implementation.createHTMLDocument()},
     * called without arguments.
     *
     * @param doc The document whose {@code implementation} object is used.
     * @return The newly created document.
     */
    public static native Document createHTMLDocument(Document doc) /*-{
                                                                   return doc.implementation.createHTMLDocument();
                                                                   }-*/;

    /**
     * Returns javascript's {@code document.firstElementChild}.
     *
     * @param document The document to read.
     * @return The value of the document's {@code firstElementChild} property.
     */
    public static native Element getFirstElementChild(Document document) /*-{
                                                                         return document.firstElementChild;
                                                                         }-*/;
}