List of usage examples for com.google.gwt.dom.client NodeList getLength
public int getLength()
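NodeList is GWT's typed wrapper around a DOM node collection: getLength() returns the number of nodes it holds and getItem(int) retrieves each one by index, so the usual pattern is an index-based loop bounded by getLength(). Below is a minimal illustrative sketch (not taken from the examples that follow; the class name, method name, and CSS class are hypothetical):

import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.NodeList;

public class NodeListLengthExample {
    // Counts how many <span> elements in the current document carry the given class name.
    // getLength() bounds the loop; getItem(i) returns the i-th element of the NodeList.
    public static int countSpansWithClass(String className) {
        NodeList<Element> spans = Document.get().getElementsByTagName("span");
        int count = 0;
        for (int i = 0; i < spans.getLength(); i++) {
            if (spans.getItem(i).getClassName().contains(className)) {
                count++;
            }
        }
        return count;
    }
}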
From source file:org.chromium.distiller.DomUtil.java
License:Open Source License
public static void stripImageElements(Node root) {
    if (root.getNodeType() == Node.ELEMENT_NODE) {
        Element element = Element.as(root);
        if (element.getTagName().equals("IMG")) {
            stripImageElement(ImageElement.as(element));
        }
    }
    NodeList<Element> imgs = DomUtil.querySelectorAll(root, "IMG");
    for (int i = 0; i < imgs.getLength(); i++) {
        stripImageElement(ImageElement.as(imgs.getItem(i)));
    }
}
From source file:org.chromium.distiller.DomUtil.java
License:Open Source License
/**
 * Strips some attribute from certain tags in the tree rooted at |rootNode|, including root.
 * @param tagNames The tag names to be processed. ["*"] means all.
 */
public static void stripAttributeFromTags(Node rootNode, String attribute, String[] tagNames) {
    Element root = Element.as(rootNode);
    for (String tag : tagNames) {
        if (root.getTagName().equals(tag) || tag.equals("*")) {
            root.removeAttribute(attribute);
        }
    }

    // Append the attribute selector to each tag name. An indexed loop is needed here so the
    // array entries are actually updated; assigning to an enhanced for-loop variable would
    // leave the array unchanged.
    for (int i = 0; i < tagNames.length; i++) {
        tagNames[i] += "[" + attribute + "]";
    }
    String query = StringUtil.join(tagNames, ", ");
    NodeList<Element> tags = DomUtil.querySelectorAll(root, query);
    for (int i = 0; i < tags.getLength(); i++) {
        tags.getItem(i).removeAttribute(attribute);
    }
}
From source file:org.chromium.distiller.extractors.embeds.TwitterExtractor.java
License:Open Source License
/**
 * Handle a Twitter embed that has not yet been rendered.
 * @param e The root element of the embed (should be a "blockquote").
 * @return EmbeddedElement object representing the embed or null.
 */
private WebEmbed handleNotRendered(Element e) {
    // Make sure the characteristic class name for Twitter exists.
    if (!e.getClassName().contains("twitter-tweet")) {
        return null;
    }

    // Get the last anchor element in this section; it should contain the tweet id.
    NodeList<Element> anchors = e.getElementsByTagName("a");
    if (anchors.getLength() == 0) {
        return null;
    }
    AnchorElement tweetAnchor = AnchorElement.as(anchors.getItem(anchors.getLength() - 1));

    if (!DomUtil.hasRootDomain(tweetAnchor.getHref(), "twitter.com")) {
        return null;
    }

    // Get specific attributes about the Twitter embed.
    String path = tweetAnchor.getPropertyString("pathname");
    String id = getTweetIdFromPath(path);
    if (id == null) {
        return null;
    }

    return new WebEmbed(e, "twitter", id, null);
}
From source file:org.chromium.distiller.extractors.embeds.TwitterExtractor.java
License:Open Source License
/**
 * Handle a Twitter embed that has already been rendered.
 * @param e The root element of the embed (should be an "iframe").
 * @return EmbeddedElement object representing the embed or null.
 */
private WebEmbed handleRendered(Element e) {
    // Twitter embeds are blockquote tags operated on by some javascript.
    if (!"IFRAME".equals(e.getTagName())) {
        return null;
    }
    IFrameElement iframe = IFrameElement.as(e);

    // If the iframe has no "src" attribute, explore further.
    if (!iframe.getSrc().isEmpty()) {
        return null;
    }
    Document iframeDoc = iframe.getContentDocument();
    if (iframeDoc == null) {
        return null;
    }

    // The iframe will contain a blockquote element that has information including tweet id.
    NodeList<Element> blocks = iframeDoc.getElementsByTagName("blockquote");
    if (blocks.getLength() < 1) {
        return null;
    }
    Element tweetBlock = Element.as(blocks.getItem(0));

    String id = tweetBlock.getAttribute("data-tweet-id");
    if (id.isEmpty()) {
        return null;
    }

    return new WebEmbed(e, "twitter", id, null);
}
From source file:org.chromium.distiller.OpenGraphProtocolParser.java
License:Open Source License
private void findPrefixes(Element root) {
    String prefixes = "";

    // See if HTML tag has "prefix" attribute.
    if (root.hasTagName("HTML")) prefixes = root.getAttribute("prefix");

    // Otherwise, see if HEAD tag has "prefix" attribute.
    if (prefixes.isEmpty()) {
        NodeList<Element> heads = root.getElementsByTagName("HEAD");
        if (heads.getLength() == 1) prefixes = heads.getItem(0).getAttribute("prefix");
    }

    // If there's "prefix" attribute, its value is something like
    // "og: http://ogp.me/ns# profile: http://ogp.me/ns/profile# article:
    // http://ogp.me/ns/article#".
    if (!prefixes.isEmpty()) {
        sOgpNsPrefixRegExp.setLastIndex(0);
        while (true) {
            MatchResult match = sOgpNsPrefixRegExp.exec(prefixes);
            if (match == null) break;
            setPrefixForObjectType(match.getGroup(2), match.getGroup(4));
        }
    } else {
        // Still no "prefix" attribute, see if HTML tag has "xmlns" attributes e.g.:
        // - "xmlns:og="http://ogp.me/ns#"
        // - "xmlns:profile="http://ogp.me/ns/profile#"
        // - "xmlns:article="http://ogp.me/ns/article#".
        final JsArray<Node> attributes = DomUtil.getAttributes(root);
        for (int i = 0; i < attributes.length(); i++) {
            final Node node = attributes.get(i);
            // Look for attribute name that starts with "xmlns:".
            String attributeName = node.getNodeName().toLowerCase();
            MatchResult nameMatch = sOgpNsNonPrefixNameRegExp.exec(attributeName);
            if (nameMatch == null) continue;
            // Extract OGP namespace URI from attribute value, if available.
            String attributeValue = node.getNodeValue();
            MatchResult valueMatch = sOgpNsNonPrefixValueRegExp.exec(attributeValue);
            if (valueMatch != null) {
                setPrefixForObjectType(nameMatch.getGroup(1), valueMatch.getGroup(1));
            }
        }
    }

    setDefaultPrefixes();
}
From source file:org.chromium.distiller.OpenGraphProtocolParser.java
License:Open Source License
private void parseMetaTags(Element root) {
    NodeList<Element> allMeta = null;
    if (doPrefixFiltering) {
        // Attribute selectors with prefix
        // https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
        String query = "";
        for (Map.Entry<Prefix, String> entry : mPrefixes.entrySet()) {
            query += "meta[property^=\"" + entry.getValue() + "\"],";
        }
        query = query.substring(0, query.length() - 1);
        allMeta = DomUtil.querySelectorAll(root, query);
    } else {
        allMeta = DomUtil.querySelectorAll(root, "meta[property]");
    }

    for (int i = 0; i < allMeta.getLength(); i++) {
        MetaElement meta = MetaElement.as(allMeta.getItem(i));
        String property = meta.getAttribute("property").toLowerCase();

        // Only store properties that we care about for distillation.
        for (int j = 0; j < mProperties.length; j++) {
            String prefixWithColon = mPrefixes.get(mProperties[j].mPrefix) + ":";
            // Note that property.equals() won't work here because |mProperties| uses "image:"
            // (IMAGE_STRUCT_PROP_PFX) for all image structured properties, so as to prevent
            // repetitive property name comparison - here and then again in ImageParser.
            if (!property.startsWith(prefixWithColon + mProperties[j].mName)) continue;
            property = property.substring(prefixWithColon.length());

            boolean addProperty = true;
            if (mProperties[j].mParser != null) {
                addProperty = mProperties[j].mParser.parse(property, meta.getContent(), mPropertyTable);
            }
            if (addProperty) mPropertyTable.put(mProperties[j].mName, meta.getContent());
        }
    }
}
From source file:org.chromium.distiller.PageParameterParser.java
License:Open Source License
/**
 * Actually implements PageParameterParser.parse(), see above description for parse().
 */
private PageParamInfo parseDocument(Element root, String originalUrl) {
    double startTime = DomUtil.getTime();

    mDocUrl = sHrefCleaner.replace(originalUrl, "");
    mParsedUrl = ParsedUrl.create(mDocUrl);
    if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.

    AnchorElement baseAnchor = PagingLinksFinder
            .createAnchorWithBase(PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));

    NodeList<Element> allLinks = root.getElementsByTagName("A");
    int idx = 0;
    while (idx < allLinks.getLength()) {
        final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));
        PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
        if (pageInfoAndText == null) {
            idx++;
            continue;
        }

        // This link is a good candidate for pagination.

        // Close current group of adjacent numbers, add a new group if necessary.
        mAdjacentNumbersGroups.addGroup();

        // Before we append the link to the new group of adjacent numbers, check if it's
        // preceded by a text node with numeric text; if so, add it before the link.
        findAndAddClosestValidLeafNodes(link, false, true, null);

        // Add the link to the current group of adjacent numbers.
        mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);

        // Add all following text nodes and links with numeric text.
        mNumForwardLinksProcessed = 0;
        findAndAddClosestValidLeafNodes(link, false, false, baseAnchor);

        // Skip the current link and links already processed in the forward
        // findAndAddClosestValidLeafNodes().
        idx += 1 + mNumForwardLinksProcessed;
    }  // while there're links.

    mAdjacentNumbersGroups.cleanup();

    LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");

    startTime = DomUtil.getTime();
    PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);
    LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
    return info;
}
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
private static String findPagingLink(Element root, String original_url, PageLink pageLink) {
    // findPagingLink() is static, so clear mLinkDebugInfo before processing the links.
    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        mLinkDebugInfo.clear();
    }

    String folderUrl = StringUtil.findAndReplace(original_url, "\\/[^/]*$", "");

    // Remove trailing '/' from window location href, because it'll be used to compare with
    // other href's whose trailing '/' are also removed.
    String wndLocationHref = StringUtil.findAndReplace(original_url, "\\/$", "");

    NodeList<Element> allLinks = root.getElementsByTagName("A");
    Set<PagingLinkObj> possiblePages = new HashSet<PagingLinkObj>();
    Set<String> bannedUrls = new HashSet<String>();

    AnchorElement baseAnchor = createAnchorWithBase(getBaseUrlForRelative(root, original_url));

    // The trailing "/" is essential to ensure the whole hostname is matched, and not just the
    // prefix of the hostname. It also maintains the requirement of having a "path" in the URL.
    String allowedPrefix = getScheme(original_url) + "://" + getHostname(original_url) + "/";
    RegExp regPrefixNum = RegExp.compile("^" + StringUtil.regexEscape(allowedPrefix) + ".*\\d", "i");

    // Loop through all links, looking for hints that they may be next- or previous- page links.
    // Things like having "page" in their textContent, className or id, or being a child of a
    // node with a page-y className or id.
    // Also possible: levenshtein distance? longest common subsequence?
    // After we do that, assign each page a score.
    for (int i = 0; i < allLinks.getLength(); i++) {
        AnchorElement link = AnchorElement.as(allLinks.getItem(i));

        // Note that AnchorElement.getHref() returns the absolute URI, so there's no need to
        // worry about relative links.
        String linkHref = resolveLinkHref(link, baseAnchor);

        if (pageLink == PageLink.NEXT) {
            if (!regPrefixNum.test(linkHref)) {
                appendDbgStrForLink(link, "ignored: not prefix + num");
                continue;
            }
        } else if (pageLink == PageLink.PREV) {
            if (!linkHref.substring(0, allowedPrefix.length()).equalsIgnoreCase(allowedPrefix)) {
                appendDbgStrForLink(link, "ignored: prefix");
                continue;
            }
        }

        int width = link.getOffsetWidth();
        int height = link.getOffsetHeight();
        if (width == 0 || height == 0) {
            appendDbgStrForLink(link, "ignored: sz=" + width + "x" + height);
            continue;
        }

        if (!DomUtil.isVisible(link)) {
            appendDbgStrForLink(link, "ignored: invisible");
            continue;
        }

        // Remove url anchor and then trailing '/' from link's href.
        linkHref = REG_HREF_CLEANER.replace(linkHref, "");
        appendDbgStrForLink(link, "-> " + linkHref);

        // Ignore page link that is the same as current window location.
        // If the page link is same as the folder URL:
        // - next page link: ignore it, since we would already have seen it.
        // - previous page link: don't ignore it, since some sites will simply have the same
        //   folder URL for the first page.
        if (linkHref.equalsIgnoreCase(wndLocationHref)
                || (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(folderUrl))) {
            appendDbgStrForLink(link, "ignored: same as current or folder url " + folderUrl);
            continue;
        }

        // Use javascript innerText (instead of javascript textContent) to only get visible
        // text.
        String linkText = DomUtil.getInnerText(link);

        // If the linkText looks like it's not the next or previous page, skip it.
        if (linkText.length() > 25) {
            appendDbgStrForLink(link, "ignored: link text too long");
            continue;
        }

        // If the linkText contains banned text, skip it, and also ban other anchors with the
        // same link URL.
        if (REG_EXTRANEOUS.test(linkText)) {
            appendDbgStrForLink(link, "ignored: one of extra");
            bannedUrls.add(linkHref);
            continue;
        }

        // For next page link, if the initial part of the URL is identical to the folder URL, but
        // the rest of it doesn't contain any digits, it's certainly not a next page link.
        // However, this doesn't apply to previous page link, because most sites will just have
        // the folder URL for the first page.
        // TODO(kuan): do we need to apply this heuristic to previous page links if current page
        // number is not 2?
        if (pageLink == PageLink.NEXT) {
            String linkHrefRemaining = linkHref;
            if (linkHref.startsWith(folderUrl)) {
                linkHrefRemaining = linkHref.substring(folderUrl.length());
            }
            if (!REG_NUMBER.test(linkHrefRemaining)) {
                appendDbgStrForLink(link, "ignored: no number beyond folder url " + folderUrl);
                continue;
            }
        }

        PagingLinkObj linkObj = null;
        linkObj = new PagingLinkObj(i, 0, linkText, linkHref);
        possiblePages.add(linkObj);

        // If the folder URL isn't part of this URL, penalize this link. It could still be the
        // link, but the odds are lower.
        // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html.
        if (linkHref.indexOf(folderUrl) != 0) {
            linkObj.mScore -= 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": not part of folder url " + folderUrl);
        }

        // Concatenate the link text with class name and id, and determine the score based on
        // existence of various paging-related words.
        String linkData = linkText + " " + link.getClassName() + " " + link.getId();
        appendDbgStrForLink(link, "txt+class+id=" + linkData);
        if (pageLink == PageLink.NEXT ? REG_NEXT_LINK.test(linkData) : REG_PREV_LINK.test(linkData)) {
            linkObj.mScore += 50;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has " +
                    (pageLink == PageLink.NEXT ? "next" : "prev" + " regex"));
        }
        if (REG_PAGINATION.test(linkData)) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has pag* word");
        }
        if (REG_FIRST_LAST.test(linkData)) {
            // -65 is enough to negate any bonuses gotten from a > or in the text.
            // If we already matched on "next", last is probably fine.
            // If we didn't, then it's bad. Penalize.
            // Same for "prev".
            if ((pageLink == PageLink.NEXT && !REG_NEXT_LINK.test(linkObj.mLinkText)) ||
                    (pageLink == PageLink.PREV && !REG_PREV_LINK.test(linkObj.mLinkText))) {
                linkObj.mScore -= 65;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has first|last but no " +
                        (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
            }
        }
        if (REG_NEGATIVE.test(linkData) || REG_EXTRANEOUS.test(linkData)) {
            linkObj.mScore -= 50;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has neg or extra regex");
        }
        if (pageLink == PageLink.NEXT ? REG_PREV_LINK.test(linkData) : REG_NEXT_LINK.test(linkData)) {
            linkObj.mScore -= 200;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has opp of " +
                    (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
        }

        // Check if a parent element contains page or paging or paginate.
        boolean positiveMatch = false, negativeMatch = false;
        Element parent = link.getParentElement();
        while (parent != null && (positiveMatch == false || negativeMatch == false)) {
            String parentClassAndId = parent.getClassName() + " " + parent.getId();
            if (!positiveMatch && REG_PAGINATION.test(parentClassAndId)) {
                linkObj.mScore += 25;
                positiveMatch = true;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": posParent - " + parentClassAndId);
            }
            // TODO(kuan): to get 1st page for prev page link, this can't be applied; however,
            // the non-application might be the cause of recursive prev page being returned,
            // i.e. for page 1, it may incorrectly return page 3 for prev page link.
            if (!negativeMatch && REG_NEGATIVE.test(parentClassAndId)) {
                // If this is just something like "footer", give it a negative.
                // If it's something like "body-and-footer", leave it be.
                if (!REG_POSITIVE.test(parentClassAndId)) {
                    linkObj.mScore -= 25;
                    negativeMatch = true;
                    appendDbgStrForLink(link, "score=" + linkObj.mScore + ": negParent - " + parentClassAndId);
                }
            }
            parent = parent.getParentElement();
        }

        // If the URL looks like it has paging in it, add to the score.
        // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34.
        if (REG_LINK_PAGINATION.test(linkHref) || REG_PAGINATION.test(linkHref)) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has paging info");
        }

        // If the URL contains negative values, give a slight decrease.
        if (REG_EXTRANEOUS.test(linkHref)) {
            linkObj.mScore -= 15;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has extra regex");
        }

        // If the link text is too long, penalize the link.
        if (linkText.length() > 10) {
            linkObj.mScore -= linkText.length();
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": text too long");
        }

        // If the link text can be parsed as a number, give it a minor bonus, with a slight bias
        // towards lower numbered pages. This is so that pages that might not have 'next' in
        // their text can still get scored, and sorted properly by score.
        // TODO(kuan): it might be wrong to assume that it knows about other pages in the
        // document and that it starts on the first page.
        int linkTextAsNumber = JavaScript.parseInt(linkText, 10);
        if (linkTextAsNumber > 0) {
            // Punish 1 since we're either already there, or it's probably before what we
            // want anyway.
            if (linkTextAsNumber == 1) {
                linkObj.mScore -= 10;
            } else {
                linkObj.mScore += Math.max(0, 10 - linkTextAsNumber);
            }
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": linktxt is a num (" +
                    linkTextAsNumber + ")");
        }

        Integer diff = pageDiff(original_url, linkHref, link, allowedPrefix.length());
        if (diff != null) {
            if (((pageLink == PageLink.NEXT) && (diff == 1)) ||
                    ((pageLink == PageLink.PREV) && (diff == -1))) {
                linkObj.mScore += 25;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": diff = " + diff);
            }
        }
    }  // for all links

    // Loop through all of the possible pages from above and find the top candidate for the next
    // page URL. Require at least a score of 50, which is a relatively high confidence that
    // this page is the next link.
    PagingLinkObj topPage = null;
    if (!possiblePages.isEmpty()) {
        for (PagingLinkObj pageObj : possiblePages) {
            if (bannedUrls.contains(pageObj.mLinkHref)) {
                continue;
            }
            if (pageObj.mScore >= 50 && (topPage == null || topPage.mScore < pageObj.mScore)) {
                topPage = pageObj;
            }
        }
    }

    String pagingHref = null;
    if (topPage != null) {
        pagingHref = StringUtil.findAndReplace(topPage.mLinkHref, "\\/$", "");
        appendDbgStrForLink(allLinks.getItem(topPage.mLinkIndex), "found: score=" + topPage.mScore +
                ", txt=[" + topPage.mLinkText + "], " + pagingHref);
    }

    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        logDbgInfoToConsole(pageLink, pagingHref, allLinks);
    }

    return pagingHref;
}
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
public static String getBaseUrlForRelative(Element root, String original_url) {
    NodeList<Element> bases = root.getElementsByTagName("BASE");
    if (bases.getLength() == 0) {
        return original_url;
    }
    // Note that base.href can also be relative.
    // If multiple <base> elements are specified, only the first href and
    // first target value are used; all others are ignored.
    // Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
    AnchorElement baseAnchor = createAnchorWithBase(original_url);
    return resolveLinkHref(BaseElement.as(bases.getItem(0)).getAttribute("href"), baseAnchor);
}
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
private static void logDbgInfoToConsole(PageLink pageLink, String pagingHref, NodeList<Element> allLinks) {
    // This logs the following to the console:
    // - number of links processed
    // - the next or previous page link found
    // - for each link: its href, text, concatenated debug string.
    // Location of logging output is different when running in different modes:
    // - "ant test.dev": test output file.
    // - chrome browser distiller viewer: chrome logfile.
    // TODO(kuan): investigate how to get logging when running "ant test.prod" - currently,
    // nothing appears. In the meantime, throwing an exception with a log message at suspicious
    // codepoints can produce a call stack and help debugging, albeit tediously.
    LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " +
            (pageLink == PageLink.NEXT ? "next: " : "prev: ") +
            (pagingHref != null ? pagingHref : "null"));
    for (int i = 0; i < allLinks.getLength(); i++) {
        AnchorElement link = AnchorElement.as(allLinks.getItem(i));
        // Use javascript innerText (instead of javascript textContent) to get only visible
        // text.
        String text = DomUtil.getInnerText(link);
        // Trim unnecessary white spaces from text.
        String[] words = StringUtil.split(text, "\\s+");
        text = "";
        for (int w = 0; w < words.length; w++) {
            text += words[w];
            if (w < words.length - 1) text += " ";
        }
        LogUtil.logToConsole(
                i + ")" + link.getHref() + ", txt=[" + text + "], dbg=[" + mLinkDebugInfo.get(link) + "]");
    }
}