Example usage for com.google.gwt.dom.client NodeList getLength

List of usage examples for com.google.gwt.dom.client NodeList getLength

Introduction

This page collects usage examples of the getLength() method of com.google.gwt.dom.client.NodeList.

Prototype

public int getLength() 

Source Link

Usage

From source file:org.chromium.distiller.DomUtil.java

License:Open Source License

/**
 * Applies stripImageElement() to every IMG element found in the tree rooted at |root|.
 * |root| itself is handled first when it is an IMG element, then every element returned by
 * an "IMG" selector query on |root|.
 * @param root The root node of the tree to process.
 */
public static void stripImageElements(Node root) {
    // Handle the root separately from the elements returned by the selector query below.
    boolean rootIsImage = root.getNodeType() == Node.ELEMENT_NODE
            && Element.as(root).getTagName().equals("IMG");
    if (rootIsImage) {
        stripImageElement(ImageElement.as(Element.as(root)));
    }

    NodeList<Element> images = DomUtil.querySelectorAll(root, "IMG");
    for (int idx = 0; idx < images.getLength(); idx++) {
        Element image = images.getItem(idx);
        stripImageElement(ImageElement.as(image));
    }
}

From source file:org.chromium.distiller.DomUtil.java

License:Open Source License

/**
 * Strips some attribute from certain tags in the tree rooted at |rootNode|, including root.
 * @param rootNode The root of the tree to process; it is converted with Element.as().
 * @param attribute The name of the attribute to remove.
 * @param tagNames The tag names to be processed. ["*"] means all.
 */
public static void stripAttributeFromTags(Node rootNode, String attribute, String[] tagNames) {
    if (tagNames.length == 0) {
        // No tag names means nothing to strip; this also avoids running an empty selector.
        return;
    }

    Element root = Element.as(rootNode);
    for (String tag : tagNames) {
        if (root.getTagName().equals(tag) || tag.equals("*")) {
            root.removeAttribute(attribute);
        }
    }

    // Build one "tag[attribute]" selector per tag name so that only elements which actually
    // carry the attribute are selected.  The selectors go into a new array: appending to a
    // for-each loop variable would be silently discarded, and |tagNames| belongs to the caller.
    String[] selectors = new String[tagNames.length];
    for (int i = 0; i < tagNames.length; i++) {
        selectors[i] = tagNames[i] + "[" + attribute + "]";
    }
    String query = StringUtil.join(selectors, ", ");

    NodeList<Element> tags = DomUtil.querySelectorAll(root, query);
    for (int i = 0; i < tags.getLength(); i++) {
        tags.getItem(i).removeAttribute(attribute);
    }
}

From source file:org.chromium.distiller.extractors.embeds.TwitterExtractor.java

License:Open Source License

/**
 * Handle a Twitter embed that has not yet been rendered.
 * @param e The root element of the embed (should be a "blockquote").
 * @return A WebEmbed of type "twitter" holding the tweet id, or null when |e| lacks the
 *         "twitter-tweet" class name, contains no anchors, its last anchor is not on
 *         twitter.com, or no tweet id can be extracted from that anchor's path.
 */
private WebEmbed handleNotRendered(Element e) {
    // The characteristic class name for Twitter must be present.
    boolean hasTweetClass = e.getClassName().contains("twitter-tweet");
    if (!hasTweetClass) {
        return null;
    }

    // The last anchor element in this section should contain the tweet id.
    NodeList<Element> links = e.getElementsByTagName("a");
    int linkCount = links.getLength();
    if (linkCount == 0) {
        return null;
    }
    AnchorElement lastLink = AnchorElement.as(links.getItem(linkCount - 1));

    boolean pointsToTwitter = DomUtil.hasRootDomain(lastLink.getHref(), "twitter.com");
    if (!pointsToTwitter) {
        return null;
    }

    // The tweet id is derived from the path component of the anchor.
    String tweetId = getTweetIdFromPath(lastLink.getPropertyString("pathname"));
    return tweetId == null ? null : new WebEmbed(e, "twitter", tweetId, null);
}

From source file:org.chromium.distiller.extractors.embeds.TwitterExtractor.java

License:Open Source License

/**
 * Handle a Twitter embed that has already been rendered.
 * @param e The root element of the embed (should be an "iframe").
 * @return A WebEmbed of type "twitter" holding the tweet id, or null when |e| is not an
 *         IFRAME, the iframe has a non-empty "src", its content document is unavailable,
 *         it contains no blockquote, or the blockquote has no "data-tweet-id" value.
 */
private WebEmbed handleRendered(Element e) {
    // Twitter embeds are blockquote tags operated on by some javascript.
    if (!"IFRAME".equals(e.getTagName())) {
        return null;
    }
    IFrameElement iframe = IFrameElement.as(e);

    // Only iframes with an empty "src" attribute are explored further.
    if (!iframe.getSrc().isEmpty()) {
        return null;
    }
    Document iframeDoc = iframe.getContentDocument();
    if (iframeDoc == null) {
        return null;
    }

    // The iframe will contain a blockquote element that has information including tweet id.
    // The list is declared with its element type rather than as a raw NodeList.
    NodeList<Element> blocks = iframeDoc.getElementsByTagName("blockquote");
    if (blocks.getLength() < 1) {
        return null;
    }
    Element tweetBlock = Element.as(blocks.getItem(0));

    String id = tweetBlock.getAttribute("data-tweet-id");
    if (id.isEmpty()) {
        return null;
    }

    return new WebEmbed(e, "twitter", id, null);
}

From source file:org.chromium.distiller.OpenGraphProtocolParser.java

License:Open Source License

/**
 * Registers the OGP namespace prefixes declared on |root| via setPrefixForObjectType(), then
 * calls setDefaultPrefixes().  A "prefix" attribute (on the HTML tag, else on a single HEAD
 * tag) takes precedence; when none is found, the attributes of |root| are scanned for
 * "xmlns:"-style declarations instead.
 * @param root The element whose prefix declarations are inspected.
 */
private void findPrefixes(Element root) {
    String prefixAttr = "";

    // First choice: the "prefix" attribute of the HTML tag.
    if (root.hasTagName("HTML")) {
        prefixAttr = root.getAttribute("prefix");
    }

    // Second choice: the "prefix" attribute of the HEAD tag, if there is exactly one.
    if (prefixAttr.isEmpty()) {
        NodeList<Element> headElements = root.getElementsByTagName("HEAD");
        if (headElements.getLength() == 1) {
            prefixAttr = headElements.getItem(0).getAttribute("prefix");
        }
    }

    if (prefixAttr.isEmpty()) {
        // Still no "prefix" attribute, see if the HTML tag has "xmlns" attributes, e.g.:
        // - "xmlns:og="http://ogp.me/ns#"
        // - "xmlns:profile="http://ogp.me/ns/profile#"
        // - "xmlns:article="http://ogp.me/ns/article#".
        final JsArray<Node> attrs = DomUtil.getAttributes(root);
        for (int idx = 0; idx < attrs.length(); idx++) {
            final Node attr = attrs.get(idx);

            // Only attribute names that start with "xmlns:" are of interest.
            String loweredName = attr.getNodeName().toLowerCase();
            MatchResult nameMatch = sOgpNsNonPrefixNameRegExp.exec(loweredName);
            if (nameMatch != null) {
                // Extract the OGP namespace URI from the attribute value, if available.
                MatchResult valueMatch = sOgpNsNonPrefixValueRegExp.exec(attr.getNodeValue());
                if (valueMatch != null) {
                    setPrefixForObjectType(nameMatch.getGroup(1), valueMatch.getGroup(1));
                }
            }
        }
    } else {
        // The "prefix" attribute value is something like
        // "og: http://ogp.me/ns# profile: http://og.me/ns/profile# article:
        // http://ogp.me/ns/article#"; consume it one match at a time from the start.
        sOgpNsPrefixRegExp.setLastIndex(0);
        for (MatchResult match = sOgpNsPrefixRegExp.exec(prefixAttr); match != null;
                match = sOgpNsPrefixRegExp.exec(prefixAttr)) {
            setPrefixForObjectType(match.getGroup(2), match.getGroup(4));
        }
    }

    setDefaultPrefixes();
}

From source file:org.chromium.distiller.OpenGraphProtocolParser.java

License:Open Source License

/**
 * Collects the OGP meta tags under |root| into |mPropertyTable|.  With |doPrefixFiltering|
 * set, only meta tags whose "property" attribute starts with one of the values in
 * |mPrefixes| are queried; otherwise every meta tag with a "property" attribute is.
 * @param root The element whose descendant meta tags are parsed.
 */
private void parseMetaTags(Element root) {
    NodeList<Element> allMeta = null;
    if (doPrefixFiltering) {
        if (mPrefixes.isEmpty()) {
            // Without any prefix there is no selector to build and nothing that could match.
            return;
        }
        // Attribute selectors with prefix
        // https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
        StringBuilder query = new StringBuilder();
        for (Map.Entry<Prefix, String> entry : mPrefixes.entrySet()) {
            // Separate selectors with a comma, without leaving a trailing one to trim.
            if (query.length() > 0) {
                query.append(',');
            }
            query.append("meta[property^=\"").append(entry.getValue()).append("\"]");
        }

        allMeta = DomUtil.querySelectorAll(root, query.toString());
    } else {
        allMeta = DomUtil.querySelectorAll(root, "meta[property]");
    }

    for (int i = 0; i < allMeta.getLength(); i++) {
        MetaElement meta = MetaElement.as(allMeta.getItem(i));
        String property = meta.getAttribute("property").toLowerCase();

        // Only store properties that we care about for distillation.
        for (int j = 0; j < mProperties.length; j++) {
            String prefixWithColon = mPrefixes.get(mProperties[j].mPrefix) + ":";
            // Note that property.equals() won't work here because |mProperties| uses "image:"
            // (IMAGE_STRUCT_PROP_PFX) for all image structured properties, so as to prevent
            // repetitive property name comparison - here and then again in ImageParser.
            if (!property.startsWith(prefixWithColon + mProperties[j].mName)) {
                continue;
            }
            // NOTE(review): |property| is rewritten in place here, so the remaining iterations
            // of this loop compare against the already-stripped name - confirm this is intended.
            property = property.substring(prefixWithColon.length());

            boolean addProperty = true;
            if (mProperties[j].mParser != null) {
                addProperty = mProperties[j].mParser.parse(property, meta.getContent(), mPropertyTable);
            }
            if (addProperty) {
                mPropertyTable.put(mProperties[j].mName, meta.getContent());
            }
        }
    }
}

From source file:org.chromium.distiller.PageParameterParser.java

License:Open Source License

/**
 * Actually implements PageParameterParser.parse(), see above description for parse().
 * Walks all anchors under |root|, collects pagination candidates into
 * |mAdjacentNumbersGroups|, and hands the groups to PageParameterDetector.detect().
 * @param root The element whose descendant anchors are examined.
 * @param originalUrl The URL of the document; |mDocUrl| is derived from it via |sHrefCleaner|.
 * @return The detected PageParamInfo, or an empty PageParamInfo if the cleaned URL cannot be
 *         parsed.
 */
private PageParamInfo parseDocument(Element root, String originalUrl) {
    double phaseStart = DomUtil.getTime();

    mDocUrl = sHrefCleaner.replace(originalUrl, "");
    mParsedUrl = ParsedUrl.create(mDocUrl);
    if (mParsedUrl == null) {
        // Invalid document URL.
        return new PageParamInfo();
    }

    String baseUrl = PagingLinksFinder.getBaseUrlForRelative(root, originalUrl);
    AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(baseUrl);

    NodeList<Element> anchors = root.getElementsByTagName("A");
    for (int pos = 0; pos < anchors.getLength(); pos++) {
        final AnchorElement anchor = AnchorElement.as(anchors.getItem(pos));
        PageInfoAndText candidate = getPageInfoAndText(anchor, baseAnchor);
        if (candidate == null) {
            continue;
        }

        // This anchor is a good candidate for pagination: close the current group of adjacent
        // numbers, adding a new group if necessary.
        mAdjacentNumbersGroups.addGroup();

        // A text node with numeric text preceding the anchor must be added before the anchor
        // itself is appended to the new group.
        findAndAddClosestValidLeafNodes(anchor, false, true, null);

        // Append the anchor to the current group of adjacent numbers.
        mAdjacentNumbersGroups.addPageInfo(candidate.mPageInfo);

        // Append all following text nodes and links with numeric text.
        mNumForwardLinksProcessed = 0;
        findAndAddClosestValidLeafNodes(anchor, false, false, baseAnchor);

        // Besides the current anchor (skipped by the loop increment), also skip the links
        // already consumed by the forward findAndAddClosestValidLeafNodes() call.
        pos += mNumForwardLinksProcessed;
    }

    mAdjacentNumbersGroups.cleanup();

    LogUtil.addTimingInfo(phaseStart, mTimingInfo, "PageParameterParser");

    phaseStart = DomUtil.getTime();
    PageParamInfo detected = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);
    LogUtil.addTimingInfo(phaseStart, mTimingInfo, "PageParameterDetector");
    return detected;
}

From source file:org.chromium.distiller.PagingLinksFinder.java

License:Open Source License

/**
 * Scores every anchor under |root| as a candidate for the next or previous page link and
 * returns the href of the best candidate.
 * @param root The element whose descendant anchors are examined.
 * @param original_url The URL of the current document; used to derive the folder URL, the
 *        allowed scheme+host prefix and the base for resolving link hrefs.
 * @param pageLink Whether the next (PageLink.NEXT) or previous (PageLink.PREV) page link is
 *        being searched for.
 * @return The href of the highest-scoring candidate with any trailing '/' removed, or null if
 *         no non-banned candidate reaches a score of 50.
 */
private static String findPagingLink(Element root, String original_url, PageLink pageLink) {
    // findPagingLink() is static, so clear mLinkDebugInfo before processing the links.
    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        mLinkDebugInfo.clear();
    }

    String folderUrl = StringUtil.findAndReplace(original_url, "\\/[^/]*$", "");

    // Remove trailing '/' from window location href, because it'll be used to compare with
    // other href's whose trailing '/' are also removed.
    String wndLocationHref = StringUtil.findAndReplace(original_url, "\\/$", "");
    NodeList<Element> allLinks = root.getElementsByTagName("A");
    Set<PagingLinkObj> possiblePages = new HashSet<PagingLinkObj>();
    Set<String> bannedUrls = new HashSet<String>();

    AnchorElement baseAnchor = createAnchorWithBase(getBaseUrlForRelative(root, original_url));

    // The trailing "/" is essential to ensure the whole hostname is matched, and not just the
    // prefix of the hostname. It also maintains the requirement of having a "path" in the URL.
    String allowedPrefix = getScheme(original_url) + "://" + getHostname(original_url) + "/";
    RegExp regPrefixNum = RegExp.compile("^" + StringUtil.regexEscape(allowedPrefix) + ".*\\d", "i");

    // Loop through all links, looking for hints that they may be next- or previous- page links.
    // Things like having "page" in their textContent, className or id, or being a child of a
    // node with a page-y className or id.
    // Also possible: levenshtein distance? longest common subsequence?
    // After we do that, assign each page a score.
    for (int i = 0; i < allLinks.getLength(); i++) {
        AnchorElement link = AnchorElement.as(allLinks.getItem(i));

        // Resolve the link's href with the help of the base anchor.
        String linkHref = resolveLinkHref(link, baseAnchor);

        if (pageLink == PageLink.NEXT) {
            if (!regPrefixNum.test(linkHref)) {
                appendDbgStrForLink(link, "ignored: not prefix + num");
                continue;
            }
        } else if (pageLink == PageLink.PREV) {
            // An href shorter than the prefix cannot start with it; checking the length first
            // also keeps substring() from throwing on such short hrefs.
            if (linkHref.length() < allowedPrefix.length()
                    || !linkHref.substring(0, allowedPrefix.length()).equalsIgnoreCase(allowedPrefix)) {
                appendDbgStrForLink(link, "ignored: prefix");
                continue;
            }
        }

        int width = link.getOffsetWidth();
        int height = link.getOffsetHeight();
        if (width == 0 || height == 0) {
            appendDbgStrForLink(link, "ignored: sz=" + width + "x" + height);
            continue;
        }

        if (!DomUtil.isVisible(link)) {
            appendDbgStrForLink(link, "ignored: invisible");
            continue;
        }

        // Remove url anchor and then trailing '/' from link's href.
        linkHref = REG_HREF_CLEANER.replace(linkHref, "");
        appendDbgStrForLink(link, "-> " + linkHref);

        // Ignore page link that is the same as current window location.
        // If the page link is same as the folder URL:
        // - next page link: ignore it, since we would already have seen it.
        // - previous page link: don't ignore it, since some sites will simply have the same
        //                       folder URL for the first page.
        if (linkHref.equalsIgnoreCase(wndLocationHref)
                || (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(folderUrl))) {
            appendDbgStrForLink(link, "ignored: same as current or folder url " + folderUrl);
            continue;
        }

        // Use javascript innerText (instead of javascript textContent) to only get visible
        // text.
        String linkText = DomUtil.getInnerText(link);

        // If the linkText looks like it's not the next or previous page, skip it.
        if (linkText.length() > 25) {
            appendDbgStrForLink(link, "ignored: link text too long");
            continue;
        }

        // If the linkText contains banned text, skip it, and also ban other anchors with the
        // same link URL.
        if (REG_EXTRANEOUS.test(linkText)) {
            appendDbgStrForLink(link, "ignored: one of extra");
            bannedUrls.add(linkHref);
            continue;
        }

        // For next page link, if the initial part of the URL is identical to the folder URL, but
        // the rest of it doesn't contain any digits, it's certainly not a next page link.
        // However, this doesn't apply to previous page link, because most sites will just have
        // the folder URL for the first page.
        // TODO(kuan): do we need to apply this heuristic to previous page links if current page
        // number is not 2?
        if (pageLink == PageLink.NEXT) {
            String linkHrefRemaining = linkHref;
            if (linkHref.startsWith(folderUrl)) {
                linkHrefRemaining = linkHref.substring(folderUrl.length());
            }
            if (!REG_NUMBER.test(linkHrefRemaining)) {
                appendDbgStrForLink(link, "ignored: no number beyond folder url " + folderUrl);
                continue;
            }
        }

        // The link survived all filters: register it as a candidate with an initial score of 0.
        PagingLinkObj linkObj = new PagingLinkObj(i, 0, linkText, linkHref);
        possiblePages.add(linkObj);

        // If the folder URL isn't part of this URL, penalize this link.  It could still be the
        // link, but the odds are lower.
        // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html.
        if (linkHref.indexOf(folderUrl) != 0) {
            linkObj.mScore -= 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": not part of folder url " + folderUrl);
        }

        // Concatenate the link text with class name and id, and determine the score based on
        // existence of various paging-related words.
        String linkData = linkText + " " + link.getClassName() + " " + link.getId();
        appendDbgStrForLink(link, "txt+class+id=" + linkData);
        if (pageLink == PageLink.NEXT ? REG_NEXT_LINK.test(linkData) : REG_PREV_LINK.test(linkData)) {
            linkObj.mScore += 50;
            // The ternary is parenthesized on its own so that " regex" is appended for both
            // directions, consistent with the other debug strings below.
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has "
                    + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
        }
        if (REG_PAGINATION.test(linkData)) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has pag* word");
        }
        if (REG_FIRST_LAST.test(linkData)) {
            // -65 is enough to negate any bonuses gotten from a > or » in the text.
            // If we already matched on "next", last is probably fine.
            // If we didn't, then it's bad.  Penalize.
            // Same for "prev".
            if ((pageLink == PageLink.NEXT && !REG_NEXT_LINK.test(linkObj.mLinkText))
                    || (pageLink == PageLink.PREV && !REG_PREV_LINK.test(linkObj.mLinkText))) {
                linkObj.mScore -= 65;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has first|last but no "
                        + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
            }
        }
        if (REG_NEGATIVE.test(linkData) || REG_EXTRANEOUS.test(linkData)) {
            linkObj.mScore -= 50;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has neg or extra regex");
        }
        if (pageLink == PageLink.NEXT ? REG_PREV_LINK.test(linkData) : REG_NEXT_LINK.test(linkData)) {
            linkObj.mScore -= 200;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has opp of "
                    + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
        }

        // Check if a parent element contains page or paging or paginate.  Each of the positive
        // and negative adjustments is applied at most once while walking up the ancestors.
        boolean positiveMatch = false, negativeMatch = false;
        Element parent = link.getParentElement();
        while (parent != null && (!positiveMatch || !negativeMatch)) {
            String parentClassAndId = parent.getClassName() + " " + parent.getId();
            if (!positiveMatch && REG_PAGINATION.test(parentClassAndId)) {
                linkObj.mScore += 25;
                positiveMatch = true;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": posParent - " + parentClassAndId);
            }
            // TODO(kuan): to get 1st page for prev page link, this can't be applied; however,
            // the non-application might be the cause of recursive prev page being returned,
            // i.e. for page 1, it may incorrectly return page 3 for prev page link.
            if (!negativeMatch && REG_NEGATIVE.test(parentClassAndId)) {
                // If this is just something like "footer", give it a negative.
                // If it's something like "body-and-footer", leave it be.
                if (!REG_POSITIVE.test(parentClassAndId)) {
                    linkObj.mScore -= 25;
                    negativeMatch = true;
                    appendDbgStrForLink(link, "score=" + linkObj.mScore + ": negParent - " + parentClassAndId);
                }
            }
            parent = parent.getParentElement();
        }

        // If the URL looks like it has paging in it, add to the score.
        // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34.
        if (REG_LINK_PAGINATION.test(linkHref) || REG_PAGINATION.test(linkHref)) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has paging info");
        }

        // If the URL contains negative values, give a slight decrease.
        if (REG_EXTRANEOUS.test(linkHref)) {
            linkObj.mScore -= 15;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has extra regex");
        }

        // If the link text is too long, penalize the link.
        if (linkText.length() > 10) {
            linkObj.mScore -= linkText.length();
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": text too long");
        }

        // If the link text can be parsed as a number, give it a minor bonus, with a slight bias
        // towards lower numbered pages.  This is so that pages that might not have 'next' in
        // their text can still get scored, and sorted properly by score.
        // TODO(kuan): it might be wrong to assume that it knows about other pages in the
        // document and that it starts on the first page.
        int linkTextAsNumber = JavaScript.parseInt(linkText, 10);
        if (linkTextAsNumber > 0) {
            // Punish 1 since we're either already there, or it's probably before what we
            // want anyway.
            if (linkTextAsNumber == 1) {
                linkObj.mScore -= 10;
            } else {
                linkObj.mScore += Math.max(0, 10 - linkTextAsNumber);
            }
            appendDbgStrForLink(link,
                    "score=" + linkObj.mScore + ": linktxt is a num (" + linkTextAsNumber + ")");
        }

        // Reward links whose page difference from the current URL is exactly one step in the
        // requested direction.
        Integer diff = pageDiff(original_url, linkHref, link, allowedPrefix.length());
        if (diff != null) {
            if (((pageLink == PageLink.NEXT) && (diff == 1)) || ((pageLink == PageLink.PREV) && (diff == -1))) {
                linkObj.mScore += 25;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": diff = " + diff);
            }
        }
    } // for all links

    // Loop through all of the possible pages from above and find the top candidate for the next
    // page URL.  Require at least a score of 50, which is a relatively high confidence that
    // this page is the next link.
    PagingLinkObj topPage = null;
    for (PagingLinkObj pageObj : possiblePages) {
        if (bannedUrls.contains(pageObj.mLinkHref)) {
            continue;
        }
        if (pageObj.mScore >= 50 && (topPage == null || topPage.mScore < pageObj.mScore)) {
            topPage = pageObj;
        }
    }

    String pagingHref = null;
    if (topPage != null) {
        pagingHref = StringUtil.findAndReplace(topPage.mLinkHref, "\\/$", "");
        appendDbgStrForLink(allLinks.getItem(topPage.mLinkIndex),
                "found: score=" + topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pagingHref);
    }

    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        logDbgInfoToConsole(pageLink, pagingHref, allLinks);
    }

    return pagingHref;
}

From source file:org.chromium.distiller.PagingLinksFinder.java

License:Open Source License

/**
 * Determines the URL against which relative links under |root| are to be resolved.
 * @param root The element searched for BASE elements.
 * @param original_url The URL of the document.
 * @return |original_url| when |root| contains no BASE element; otherwise the "href" attribute
 *         of the first BASE element, passed through resolveLinkHref() with an anchor created
 *         from |original_url|.
 */
public static String getBaseUrlForRelative(Element root, String original_url) {
    NodeList<Element> baseElements = root.getElementsByTagName("BASE");
    if (baseElements.getLength() != 0) {
        // Note that base.href can also be relative.
        // If multiple <base> elements are specified, only the first href and
        // first target value are used; all others are ignored.
        // Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
        AnchorElement documentAnchor = createAnchorWithBase(original_url);
        String firstBaseHref = BaseElement.as(baseElements.getItem(0)).getAttribute("href");
        return resolveLinkHref(firstBaseHref, documentAnchor);
    }
    return original_url;
}

From source file:org.chromium.distiller.PagingLinksFinder.java

License:Open Source License

/**
 * Logs to the console a summary line (number of links processed and the next or previous page
 * link found), followed by one line per link with its href, its visible text and its
 * concatenated debug string from |mLinkDebugInfo|.
 *
 * Location of logging output is different when running in different modes:
 * - "ant test.dev": test output file.
 * - chrome browser distiller viewer: chrome logfile.
 * TODO(kuan): investigate how to get logging when running "ant test.prod" - currently,
 * nothing appears.  In the meantime, throwing an exception with a log message at suspicious
 * codepoints can produce a call stack and help debugging, albeit tediously.
 *
 * @param pageLink Whether the search was for the next or the previous page link.
 * @param pagingHref The paging link that was found, or null.
 * @param allLinks The links that were processed.
 */
private static void logDbgInfoToConsole(PageLink pageLink, String pagingHref, NodeList<Element> allLinks) {
    String direction = pageLink == PageLink.NEXT ? "next: " : "prev: ";
    String foundHref = pagingHref == null ? "null" : pagingHref;
    LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " + direction + foundHref);

    for (int idx = 0; idx < allLinks.getLength(); idx++) {
        AnchorElement anchor = AnchorElement.as(allLinks.getItem(idx));

        // Javascript innerText (rather than textContent) yields only the visible text.
        String[] words = StringUtil.split(DomUtil.getInnerText(anchor), "\\s+");

        // Re-join the words with single spaces to drop unnecessary white space.
        String compactText = "";
        for (int w = 0; w < words.length; w++) {
            if (w > 0) {
                compactText += " ";
            }
            compactText += words[w];
        }

        String line = idx + ")" + anchor.getHref() + ", txt=[" + compactText + "], dbg=["
                + mLinkDebugInfo.get(anchor) + "]";
        LogUtil.logToConsole(line);
    }
}