Example usage for com.google.gwt.dom.client NodeList getItem

Introduction

This page collects example usages of the getItem method of com.google.gwt.dom.client.NodeList.

Prototype

public T getItem(int index)

Source Link

Usage

From source file:com.dom_distiller.client.IEReadingViewParser.java

License:Open Source License

/**
 * Rebuilds {@code mImages} from every {@code IMG} element found under {@code mRoot}.
 *
 * An image is kept when it has a non-empty caption, or, failing that, when
 * {@code isImageRelevantBySize} accepts it.
 */
private void findImages() {
    mImages = new ArrayList<MarkupParser.Image>();

    final NodeList<Element> imgNodes = mRoot.getElementsByTagName("IMG");
    for (int idx = 0; idx < imgNodes.getLength(); idx++) {
        final ImageElement img = ImageElement.as(imgNodes.getItem(idx));

        // A captioned image is always relevant; the size check is only consulted when
        // there is no caption.
        final String caption = getCaption(img);
        final boolean hasCaption = caption != null && !caption.isEmpty();
        if (!hasCaption && !isImageRelevantBySize(img)) {
            continue;
        }

        // Record the relevant image.
        final MarkupParser.Image entry = new MarkupParser.Image();
        entry.url = img.getSrc();
        entry.caption = caption;
        entry.width = img.getWidth();
        entry.height = img.getHeight();
        mImages.add(entry);
    }
}

From source file:com.dom_distiller.client.IEReadingViewParser.java

License:Open Source License

/**
 * Looks up a caption for |image| from its enclosing {@code <figure>}.
 *
 * @param image the image whose parent element is inspected.
 * @return the inner text of the first {@code FIGCAPTION} (in document order) whose inner
 *         text is non-empty, provided the parent is a {@code FIGURE} holding one or two
 *         {@code FIGCAPTION} elements; an empty string in every other case.
 */
private static String getCaption(ImageElement image) {
    // Only images that are direct children of <figure> can have a caption.
    final Element figure = image.getParentElement();
    if (!figure.hasTagName("FIGURE")) {
        return "";
    }

    final NodeList<Element> figCaptions = figure.getElementsByTagName("FIGCAPTION");
    final int count = figCaptions.getLength();
    if (count <= 0 || count > 2) {
        return "";
    }

    // Use javascript innerText (instead of javascript textContent) to get only visible
    // captions; stop at the first one that yields text.
    String text = "";
    int idx = 0;
    while (idx < count && text.isEmpty()) {
        text = DomUtil.getInnerText(figCaptions.getItem(idx));
        idx++;
    }
    return text;
}

From source file:com.dom_distiller.client.IEReadingViewParser.java

License:Open Source License

/**
 * Checks, ignoring case, whether |text| occurs in the text content of any {@code BODY}
 * element under |root|.
 *
 * @param root element whose {@code BODY} descendants are searched.
 * @param text the text to look for.
 * @return true as soon as one body's text content contains |text|, false otherwise.
 */
private static boolean isTextInBody(Element root, String text) {
    final String needle = text.toLowerCase();
    final NodeList<Element> bodyNodes = root.getElementsByTagName("BODY");
    for (int idx = 0; idx < bodyNodes.getLength(); idx++) {
        // Use javascript textContent (instead of javascript innerText) to include
        // invisible text.
        final String haystack =
                DomUtil.javascriptTextContent(bodyNodes.getItem(idx)).toLowerCase();
        if (haystack.contains(needle)) {
            return true;
        }
    }
    return false;
}

From source file:com.dom_distiller.client.OpenGraphProtocolParser.java

License:Open Source License

/**
 * Discovers the OGP namespace prefixes declared on the document and registers each one via
 * {@code setPrefixForObjectType}, then calls {@code setDefaultPrefixes}.
 *
 * The "prefix" attribute is read from |root| when it is the HTML tag, otherwise from the
 * single HEAD element. Only if neither yields a value are the "xmlns:*" attributes of |root|
 * examined.
 *
 * @param root the element to inspect.
 */
private void findPrefixes(Element root) {
    String prefixAttr = "";

    // First choice: "prefix" attribute on the HTML tag.
    if (root.hasTagName("HTML")) {
        prefixAttr = root.getAttribute("prefix");
    }

    // Second choice: "prefix" attribute on the HEAD tag, if there is exactly one.
    if (prefixAttr.isEmpty()) {
        final NodeList<Element> headNodes = root.getElementsByTagName("HEAD");
        if (headNodes.getLength() == 1) {
            prefixAttr = headNodes.getItem(0).getAttribute("prefix");
        }
    }

    if (prefixAttr.isEmpty()) {
        // Still no "prefix" attribute, see if HTML tag has "xmlns" attributes e.g.:
        // - "xmlns:og="http://ogp.me/ns#"
        // - "xmlns:profile="http://ogp.me/ns/profile#"
        // - "xmlns:article="http://ogp.me/ns/article#".
        final JsArray<Node> attrs = DomUtil.getAttributes(root);
        for (int idx = 0; idx < attrs.length(); idx++) {
            final Node attr = attrs.get(idx);

            // Only attributes whose name starts with "xmlns:" are of interest.
            final Matcher nameMatch =
                    sOgpNsNonPrefixNamePattern.matcher(attr.getNodeName().toLowerCase());
            if (nameMatch.find()) {
                // Extract OGP namespace URI from attribute value, if available.
                final Matcher valueMatch =
                        sOgpNsNonPrefixValuePattern.matcher(attr.getNodeValue());
                if (valueMatch.find()) {
                    setPrefixForObjectType(nameMatch.group(1), valueMatch.group(1));
                }
            }
        }
    } else {
        // The "prefix" value is something like
        // "og: http://ogp.me/ns# profile: http://og.me/ns/profile# article: http://ogp.me/ns/article#",
        // so there could be multiple prefixes to register.
        final Matcher prefixMatch = sOgpNsPrefixPattern.matcher(prefixAttr);
        while (prefixMatch.find()) {
            setPrefixForObjectType(prefixMatch.group(2), prefixMatch.group(4));
        }
    }

    setDefaultPrefixes();
}

From source file:com.dom_distiller.client.OpenGraphProtocolParser.java

License:Open Source License

/**
 * Scans the meta tags under |root| and stores the OGP properties listed in
 * {@code mProperties} into {@code mPropertyTable}.
 *
 * When querySelectorAll is supported, only meta tags with a "property" attribute are
 * fetched (optionally narrowed to the known prefixes when {@code doPrefixFiltering} is set);
 * otherwise every META element is examined.
 *
 * @param root the element whose meta tags are parsed.
 */
private void parseMetaTags(Element root) {
    final NodeList<Element> metaNodes;
    if (!DomUtil.supportQuerySelectorAll(root)) {
        metaNodes = root.getElementsByTagName("META");
    } else if (!doPrefixFiltering) {
        metaNodes = DomUtil.querySelectorAll(root, "meta[property]");
    } else {
        // Attribute selectors with prefix
        // https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
        // TODO(wychen): Test the logic here in Chrome on Android.
        String selector = "";
        for (String prefix : mPrefixes.values()) {
            selector += "meta[property^=\"" + prefix + "\"],";
        }
        // Drop the trailing comma left by the loop above.
        selector = selector.substring(0, selector.length() - 1);
        metaNodes = DomUtil.querySelectorAll(root, selector);
    }

    for (int metaIdx = 0; metaIdx < metaNodes.getLength(); metaIdx++) {
        final MetaElement meta = MetaElement.as(metaNodes.getItem(metaIdx));
        String property = meta.getAttribute("property").toLowerCase();

        // Only store properties that we care about for distillation.
        for (int propIdx = 0; propIdx < mProperties.length; propIdx++) {
            final String prefixWithColon = mPrefixes.get(mProperties[propIdx].mPrefix) + ":";
            // Note that property.equals() won't work here because |mProperties| uses "image:"
            // (IMAGE_STRUCT_PROP_PFX) for all image structured properties, so as to prevent
            // repetitive property name comparison - here and then again in ImageParser.
            if (property.startsWith(prefixWithColon + mProperties[propIdx].mName)) {
                // |property| is stripped in place, so the remaining entries of |mProperties|
                // are compared against the stripped value.
                property = property.substring(prefixWithColon.length());

                // Without a parser the property is always stored; with one, the parser
                // decides.
                final boolean addProperty = mProperties[propIdx].mParser == null
                        || mProperties[propIdx].mParser.parse(
                                property, meta.getContent(), mPropertyTable);
                if (addProperty) {
                    mPropertyTable.put(mProperties[propIdx].mName, meta.getContent());
                }
            }
        }
    }
}

From source file:com.dom_distiller.client.PagingLinksFinder.java

License:Open Source License

/**
 * Scores every anchor under |root| as a candidate next- or previous-page link and returns
 * the best one.
 *
 * Each visible, same-domain http(s) link is given a score from heuristics on its text, class
 * name, id, ancestors' class names/ids and href. Links sharing the same href share one score
 * entry. A candidate must reach a score of at least 50 to be returned.
 *
 * @param root element whose {@code A} descendants are examined.
 * @param original_domain passed to {@code findBaseUrl} and {@code getLocationHost} to derive
 *        the base URL and the host that links must match.
 * @param pageLink whether the next ({@code PageLink.NEXT}) or previous page link is wanted.
 * @return the href of the top-scoring candidate with any trailing '/' removed, or null if no
 *         candidate scores at least 50.
 */
private static String findPagingLink(Element root, String original_domain, PageLink pageLink) {
    // findPagingLink() is static, so clear mLinkDebugInfo before processing the links.
    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        mLinkDebugInfo.clear();
    }

    String baseUrl = findBaseUrl(original_domain);
    // Remove trailing '/' from window location href, because it'll be used to compare with
    // other href's whose trailing '/' are also removed.
    String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHref(), "\\/$", "");
    NodeList<Element> allLinks = root.getElementsByTagName("A");
    Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLinkObj>();

    // Loop through all links, looking for hints that they may be next- or previous- page links.
    // Things like having "page" in their textContent, className or id, or being a child of a
    // node with a page-y className or id.
    // Also possible: levenshtein distance? longest common subsequence?
    // After we do that, assign each page a score.
    for (int i = 0; i < allLinks.getLength(); i++) {
        AnchorElement link = AnchorElement.as(allLinks.getItem(i));

        int width = link.getOffsetWidth();
        int height = link.getOffsetHeight();
        if (width == 0 || height == 0) {
            appendDbgStrForLink(link, "ignored: sz=" + width + "x" + height);
            continue;
        }

        if (!DomUtil.isVisible(link)) {
            appendDbgStrForLink(link, "ignored: invisible");
            continue;
        }

        // Remove url anchor and then trailing '/' from link's href.
        // Note that AnchorElement.getHref() returns the absolute URI, so there's no need to
        // worry about relative links.
        String linkHref = StringUtil.findAndReplace(StringUtil.findAndReplace(link.getHref(), "#.*$", ""),
                "\\/$", "");

        // Ignore page link that is empty, not http/https, or same as current window location.
        // If the page link is same as the base URL:
        // - next page link: ignore it, since we would already have seen it.
        // - previous page link: don't ignore it, since some sites will simply have the same
        //                       base URL for the first page.
        if (linkHref.isEmpty() || !StringUtil.match(linkHref, "^https?://")
                || linkHref.equalsIgnoreCase(wndLocationHref)
                || (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(baseUrl))) {
            appendDbgStrForLink(link, "ignored: empty or same as current or base url" + baseUrl);
            continue;
        }

        // If it's on a different domain, skip it.
        String[] urlSlashes = StringUtil.split(linkHref, "\\/+");
        if (urlSlashes.length < 3 || // Expect at least the protocol, domain, and path.
                !getLocationHost(original_domain).equalsIgnoreCase(urlSlashes[1])) {
            appendDbgStrForLink(link, "ignored: different domain");
            continue;
        }

        // Use javascript innerText (instead of javascript textContent) to only get visible
        // text.
        String linkText = DomUtil.getInnerText(link);

        // If the linkText looks like it's not the next or previous page, skip it.
        if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length() > 25) {
            appendDbgStrForLink(link, "ignored: one of extra");
            continue;
        }

        // For next page link, if the initial part of the URL is identical to the base URL, but
        // the rest of it doesn't contain any digits, it's certainly not a next page link.
        // However, this doesn't apply to previous page link, because most sites will just have
        // the base URL for the first page.
        // TODO(kuan): baseUrl (returned by findBaseUrl()) is NOT the prefix of the current
        // window location, even though it appears to be so the way it's used here.
        // TODO(kuan): do we need to apply this heuristic to previous page links if current page
        // number is not 2?
        if (pageLink == PageLink.NEXT) {
            String linkHrefRemaining = StringUtil.findAndReplace(linkHref, baseUrl, "");
            if (!StringUtil.match(linkHrefRemaining, "\\d")) {
                appendDbgStrForLink(link, "ignored: no number beyond base url " + baseUrl);
                continue;
            }
        }

        PagingLinkObj linkObj = null;
        if (!possiblePages.containsKey(linkHref)) { // Have not encountered this href.
            linkObj = new PagingLinkObj(i, 0, linkText, linkHref);
            possiblePages.put(linkHref, linkObj);
        } else { // Have already encountered this href, append its text to existing entry's.
            linkObj = possiblePages.get(linkHref);
            linkObj.mLinkText += " | " + linkText;
        }

        // If the base URL isn't part of this URL, penalize this link.  It could still be the
        // link, but the odds are lower.
        // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html.
        // TODO(kuan): again, baseUrl (returned by findBaseUrl()) is NOT the prefix of the
        // current window location, even though it appears to be so the way it's used here.
        if (linkHref.indexOf(baseUrl) != 0) {
            linkObj.mScore -= 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": not part of base url " + baseUrl);
        }

        // Concatenate the link text with class name and id, and determine the score based on
        // existence of various paging-related words.
        String linkData = linkText + " " + link.getClassName() + " " + link.getId();
        appendDbgStrForLink(link, "txt+class+id=" + linkData);
        if (StringUtil.match(linkData, pageLink == PageLink.NEXT ? NEXT_LINK_REGEX : PREV_LINK_REGEX)) {
            linkObj.mScore += 50;
            // The conditional is parenthesised on its own so that " regex" is appended for
            // both directions ('+' binds tighter than '?:').
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has "
                    + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
        }
        if (StringUtil.match(linkData, "pag(e|ing|inat)")) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has pag* word");
        }
        if (StringUtil.match(linkData, "(first|last)")) {
            // -65 is enough to negate any bonuses gotten from a > or » in the text.
            // If we already matched on "next", last is probably fine.
            // If we didn't, then it's bad.  Penalize.
            // Same for "prev".
            if ((pageLink == PageLink.NEXT && !StringUtil.match(linkObj.mLinkText, NEXT_LINK_REGEX))
                    || (pageLink == PageLink.PREV && !StringUtil.match(linkObj.mLinkText, PREV_LINK_REGEX))) {
                linkObj.mScore -= 65;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has first|last but no "
                        + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
            }
        }
        if (StringUtil.match(linkData, NEGATIVE_REGEX) || StringUtil.match(linkData, EXTRANEOUS_REGEX)) {
            linkObj.mScore -= 50;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has neg or extra regex");
        }
        if (StringUtil.match(linkData, pageLink == PageLink.NEXT ? PREV_LINK_REGEX : NEXT_LINK_REGEX)) {
            linkObj.mScore -= 200;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has opp of "
                    + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
        }

        // Check if a parent element contains page or paging or paginate.  Walk up until both
        // a positive and a negative ancestor have been found, or the ancestors run out.
        boolean positiveMatch = false, negativeMatch = false;
        Element parent = link.getParentElement();
        while (parent != null && (!positiveMatch || !negativeMatch)) {
            String parentClassAndId = parent.getClassName() + " " + parent.getId();
            if (!positiveMatch && StringUtil.match(parentClassAndId, "pag(e|ing|inat)")) {
                linkObj.mScore += 25;
                positiveMatch = true;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": posParent - " + parentClassAndId);
            }
            // TODO(kuan): to get 1st page for prev page link, this can't be applied; however,
            // the non-application might be the cause of recursive prev page being returned,
            // i.e. for page 1, it may incorrectly return page 3 for prev page link.
            if (!negativeMatch && StringUtil.match(parentClassAndId, NEGATIVE_REGEX)) {
                // If this is just something like "footer", give it a negative.
                // If it's something like "body-and-footer", leave it be.
                if (!StringUtil.match(parentClassAndId, POSITIVE_REGEX)) {
                    linkObj.mScore -= 25;
                    negativeMatch = true;
                    appendDbgStrForLink(link, "score=" + linkObj.mScore + ": negParent - " + parentClassAndId);
                }
            }
            parent = parent.getParentElement();
        }

        // If the URL looks like it has paging in it, add to the score.
        // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34.
        if (StringUtil.match(linkHref, "p(a|g|ag)?(e|ing|ination)?(=|\\/)[0-9]{1,2}")
                || StringUtil.match(linkHref, "(page|paging)")) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has paging info");
        }

        // If the URL contains negative values, give a slight decrease.
        if (StringUtil.match(linkHref, EXTRANEOUS_REGEX)) {
            linkObj.mScore -= 15;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has extra regex");
        }

        // If the link text can be parsed as a number, give it a minor bonus, with a slight bias
        // towards lower numbered pages.  This is so that pages that might not have 'next' in
        // their text can still get scored, and sorted properly by score.
        // TODO(kuan): it might be wrong to assume that it knows about other pages in the
        // document and that it starts on the first page.
        int linkTextAsNumber = 0;
        try {
            linkTextAsNumber = Integer.parseInt(linkText, 10);
        } catch (NumberFormatException ignored) {
            // Link text is not a plain number; linkTextAsNumber stays 0 and no numeric
            // bonus or penalty is applied.
        }
        if (linkTextAsNumber > 0) {
            // Punish 1 since we're either already there, or it's probably before what we
            // want anyway.
            if (linkTextAsNumber == 1) {
                linkObj.mScore -= 10;
            } else {
                linkObj.mScore += Math.max(0, 10 - linkTextAsNumber);
            }
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": linktxt is a num");
        }
    } // for all links

    // Loop through all of the possible pages from above and find the top candidate for the next
    // page URL.  Require at least a score of 50, which is a relatively high confidence that
    // this page is the next link.
    PagingLinkObj topPage = null;
    for (PagingLinkObj pageObj : possiblePages.values()) {
        if (pageObj.mScore >= 50 && (topPage == null || topPage.mScore < pageObj.mScore)) {
            topPage = pageObj;
        }
    }

    String pagingHref = null;
    if (topPage != null) {
        pagingHref = StringUtil.findAndReplace(topPage.mLinkHref, "\\/$", "");
        appendDbgStrForLink(allLinks.getItem(topPage.mLinkIndex),
                "found: score=" + topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pagingHref);
    }

    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        logDbgInfoToConsole(pageLink, pagingHref, allLinks);
    }

    return pagingHref;
}

From source file:com.dom_distiller.client.PagingLinksFinder.java

License:Open Source License

/**
 * Logs paging-link debug information to the console:
 * - number of links processed
 * - the next or previous page link found
 * - for each link: its href, text, concatenated debug string.
 *
 * Location of logging output is different when running in different modes:
 * - "ant test.dev": test output file.
 * - chrome browser distiller viewer: chrome logfile.
 * TODO(kuan): investigate how to get logging when running "ant test.prod" - currently,
 * nothing appears.  In the meantime, throwing an exception with a log message at suspicious
 * codepoints can produce a call stack and help debugging, albeit tediously.
 *
 * @param pageLink whether the next or previous page link was searched for.
 * @param pagingHref the link that was found, or null if none.
 * @param allLinks every link that was processed.
 */
private static void logDbgInfoToConsole(PageLink pageLink, String pagingHref, NodeList<Element> allLinks) {
    final String direction = pageLink == PageLink.NEXT ? "next: " : "prev: ";
    final String foundHref = pagingHref != null ? pagingHref : "null";
    LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " + direction + foundHref);

    for (int idx = 0; idx < allLinks.getLength(); idx++) {
        final AnchorElement anchor = AnchorElement.as(allLinks.getItem(idx));

        // Use javascript innerText (instead of javascript textContent) to get only visible
        // text, then collapse each run of whitespace into a single space.
        final String[] tokens = StringUtil.split(DomUtil.getInnerText(anchor), "\\s+");
        final StringBuilder collapsed = new StringBuilder();
        for (int t = 0; t < tokens.length; t++) {
            if (t > 0) {
                collapsed.append(" ");
            }
            collapsed.append(tokens[t]);
        }

        LogUtil.logToConsole(idx + ")" + anchor.getHref() + ", txt=[" + collapsed.toString()
                + "], dbg=[" + mLinkDebugInfo.get(anchor) + "]");
    }
}

From source file:com.dom_distiller.client.SchemaOrgParser.java

License:Open Source License

/**
 * Recursively collects |e| and all of its descendant elements that carry an "ITEMPROP" or
 * "ITEMSCOPE" attribute, parent before children.
 *
 * @param e the element at which to start.
 * @param allProp list that receives the matching elements.
 */
private void getElementsWithItemAttribute(Element e, List<Element> allProp) {
    final boolean hasItemAttribute = e.hasAttribute("ITEMPROP") || e.hasAttribute("ITEMSCOPE");
    if (hasItemAttribute) {
        allProp.add(e);
    }

    // Descend into element children only; other node types are skipped.
    final NodeList<Node> kids = e.getChildNodes();
    for (int idx = 0; idx < kids.getLength(); idx++) {
        final Node kid = kids.getItem(idx);
        if (kid.getNodeType() == Node.ELEMENT_NODE) {
            getElementsWithItemAttribute(Element.as(kid), allProp);
        }
    }
}

From source file:com.dom_distiller.client.SchemaOrgParser.java

License:Open Source License

/**
 * Recursively collects |e| and all of its descendant elements that are {@code A} or
 * {@code LINK} tags whose "REL" attribute equals {@code AUTHOR_REL} (ignoring case), parent
 * before children.
 *
 * @param e the element at which to start.
 * @param allAuthors list that receives the matching elements.
 */
private void getAuthorElements(Element e, List<Element> allAuthors) {
    final String tag = e.getTagName();
    final boolean isAnchorOrLink = tag.equalsIgnoreCase("A") || tag.equalsIgnoreCase("LINK");
    if (isAnchorOrLink && e.getAttribute("REL").equalsIgnoreCase(AUTHOR_REL)) {
        allAuthors.add(e);
    }

    // Descend into element children only; other node types are skipped.
    final NodeList<Node> kids = e.getChildNodes();
    for (int idx = 0; idx < kids.getLength(); idx++) {
        final Node kid = kids.getItem(idx);
        if (kid.getNodeType() == Node.ELEMENT_NODE) {
            getAuthorElements(Element.as(kid), allAuthors);
        }
    }
}

From source file:com.dom_distiller.client.SchemaOrgParser.java

License:Open Source License

/**
 * Parses the schema.org markup under |root|: every element with an "ITEMPROP" or
 * "ITEMSCOPE" attribute is handed to {@code parseElement}, and {@code mAuthorFromRel} is
 * filled from the first rel="author" {@code A}/{@code LINK} element that yields a value.
 *
 * querySelectorAll is used when supported; otherwise the tree is walked manually.
 *
 * @param root the element at which parsing starts.
 */
private void parse(Element root) {
    if (DomUtil.supportQuerySelectorAll(root)) {
        NodeList<Element> matches = DomUtil.querySelectorAll(root, "[ITEMPROP],[ITEMSCOPE]");

        // Root node (html) is not included in the result of querySelectorAll, so need to
        // handle it explicitly here.
        parseElement(root, null);

        for (int idx = 0; idx < matches.getLength(); idx++) {
            final Element item = matches.getItem(idx);
            parseElement(item, getItemScopeParent(item));
        }

        // As per http://schema.org/author (or http://schema.org/Article and search for "author"
        // property), if <a> or <link> tags specify rel="author", extract it.
        matches = DomUtil.querySelectorAll(root, "A[rel=author],LINK[rel=author]");
        for (int idx = 0; idx < matches.getLength(); idx++) {
            final Element authorElem = matches.getItem(idx);
            // Keep trying until one element produces a non-empty author.
            if (mAuthorFromRel.isEmpty()) {
                mAuthorFromRel = getAuthorFromRelAttribute(authorElem);
            }
        }
    } else {
        // Since there's no way to construct NodeList, we cannot share the same code for
        // both paths. However, the logic here is simple enough so that we can consider the
        // path using querySelectorAll is somehow tested, except for the query string of
        // querySelectorAll.
        final List<Element> itemElems = new ArrayList<Element>();
        getElementsWithItemAttribute(root, itemElems);
        for (Element item : itemElems) {
            parseElement(item, getItemScopeParent(item));
        }

        final List<Element> authorElems = new ArrayList<Element>();
        getAuthorElements(root, authorElems);
        for (Element authorElem : authorElems) {
            // Keep trying until one element produces a non-empty author.
            if (mAuthorFromRel.isEmpty()) {
                mAuthorFromRel = getAuthorFromRelAttribute(authorElem);
            }
        }
    }
}