Example usage for com.google.gwt.dom.client NodeList getLength

List of usage examples for com.google.gwt.dom.client NodeList getLength

Introduction

On this page you can find example usages of com.google.gwt.dom.client NodeList getLength.

Prototype

public int getLength() 

Source Link

Usage

From source file:com.dom_distiller.client.IEReadingViewParser.java

License:Open Source License

/**
 * Walks every element under {@code mRoot} and stores in {@code mPublisher} the first
 * non-empty value of a "publisher" attribute or, failing that on the same element, a
 * "source_organization" attribute.  {@code mPublisher} is reset to "" before the scan.
 */
private void findPublisher() {
    mPublisher = "";

    // Any html tag may carry the attribute, so look at all of them.
    NodeList<Element> candidates = mRoot.getElementsByTagName("*");
    int index = 0;
    while (index < candidates.getLength() && mPublisher.isEmpty()) {
        Element current = candidates.getItem(index);
        String value = current.getAttribute("publisher");
        if (value.isEmpty()) {
            // Fall back to the alternative attribute on the same element.
            value = current.getAttribute("source_organization");
        }
        mPublisher = value;
        index++;
    }
}

From source file:com.dom_distiller.client.IEReadingViewParser.java

License:Open Source License

/**
 * Rebuilds {@code mImages} from the IMG elements under {@code mRoot}.  An image is kept
 * when it has a non-empty caption, or otherwise when {@code isImageRelevantBySize()}
 * accepts it.
 */
private void findImages() {
    mImages = new ArrayList<MarkupParser.Image>();

    NodeList<Element> imgNodes = mRoot.getElementsByTagName("IMG");
    for (int idx = 0; idx < imgNodes.getLength(); idx++) {
        ImageElement img = ImageElement.as(imgNodes.getItem(idx));

        // A captioned image is always relevant; the size check only runs for
        // images without a caption.
        String caption = getCaption(img);
        boolean captioned = caption != null && !caption.isEmpty();
        if (!captioned && !isImageRelevantBySize(img)) {
            continue;
        }

        // Record the relevant image.
        MarkupParser.Image entry = new MarkupParser.Image();
        entry.url = img.getSrc();
        entry.caption = caption;
        entry.width = img.getWidth();
        entry.height = img.getHeight();
        mImages.add(entry);
    }
}

From source file:com.dom_distiller.client.IEReadingViewParser.java

License:Open Source License

/**
 * Returns the caption text for {@code image}, or "" when there is none.
 *
 * A caption is only looked up when the image's parent is a FIGURE element containing one
 * or two FIGCAPTION elements; the first of those with non-empty inner text wins.
 *
 * @param image the image whose caption is wanted
 * @return the caption text, or "" if no usable caption was found
 */
private static String getCaption(ImageElement image) {
    Element figure = image.getParentElement();
    if (!figure.hasTagName("FIGURE")) {
        return "";
    }

    NodeList<Element> figCaptions = figure.getElementsByTagName("FIGCAPTION");
    int count = figCaptions.getLength();
    if (count <= 0 || count > 2) {
        return "";
    }

    // javascript innerText (rather than textContent) yields only the visible text.
    String text = "";
    int idx = 0;
    while (idx < count && text.isEmpty()) {
        text = DomUtil.getInnerText(figCaptions.getItem(idx));
        idx++;
    }
    return text;
}

From source file:com.dom_distiller.client.IEReadingViewParser.java

License:Open Source License

/**
 * Reports whether {@code text} occurs, ignoring case via {@code toLowerCase()}, in the
 * text content of any BODY element under {@code root}.
 *
 * @param root element whose BODY descendants are searched
 * @param text the text to look for
 * @return true if some BODY element's text content contains {@code text}
 */
private static boolean isTextInBody(Element root, String text) {
    final String needle = text.toLowerCase();
    final NodeList<Element> bodyElems = root.getElementsByTagName("BODY");

    boolean found = false;
    int idx = 0;
    while (!found && idx < bodyElems.getLength()) {
        // javascript textContent (rather than innerText) also covers invisible text.
        String haystack = DomUtil.javascriptTextContent(bodyElems.getItem(idx)).toLowerCase();
        found = haystack.contains(needle);
        idx++;
    }
    return found;
}

From source file:com.dom_distiller.client.OpenGraphProtocolParser.java

License:Open Source License

/**
 * Discovers the OGP namespace prefixes declared on the page and registers each one via
 * {@code setPrefixForObjectType()}, then calls {@code setDefaultPrefixes()}.
 *
 * Lookup order: the "prefix" attribute of {@code root} when it is the HTML tag, then the
 * "prefix" attribute of a sole HEAD element, and finally any "xmlns:" style attributes on
 * {@code root}.
 *
 * @param root the element to inspect
 */
private void findPrefixes(Element root) {
    // First choice: "prefix" attribute on the HTML tag.
    String prefixAttr = root.hasTagName("HTML") ? root.getAttribute("prefix") : "";

    // Second choice: "prefix" attribute on the HEAD tag, if there is exactly one.
    if (prefixAttr.isEmpty()) {
        NodeList<Element> headElems = root.getElementsByTagName("HEAD");
        if (headElems.getLength() == 1) {
            prefixAttr = headElems.getItem(0).getAttribute("prefix");
        }
    }

    if (prefixAttr.isEmpty()) {
        // No "prefix" attribute anywhere; fall back to "xmlns" attributes on the HTML
        // tag, e.g.:
        // - "xmlns:og="http://ogp.me/ns#"
        // - "xmlns:profile="http://ogp.me/ns/profile#"
        // - "xmlns:article="http://ogp.me/ns/article#".
        final JsArray<Node> attrs = DomUtil.getAttributes(root);
        for (int idx = 0; idx < attrs.length(); idx++) {
            final Node attr = attrs.get(idx);
            // Only attribute names that start with "xmlns:" are of interest.
            Matcher nameMatch = sOgpNsNonPrefixNamePattern.matcher(attr.getNodeName().toLowerCase());
            if (nameMatch.find()) {
                // Pull the OGP namespace URI out of the attribute value, if present.
                Matcher valueMatch = sOgpNsNonPrefixValuePattern.matcher(attr.getNodeValue());
                if (valueMatch.find()) {
                    setPrefixForObjectType(nameMatch.group(1), valueMatch.group(1));
                }
            }
        }
    } else {
        // The attribute value looks something like
        // "og: http://ogp.me/ns# profile: http://og.me/ns/profile# article: http://ogp.me/ns/article#",
        // so several prefixes may be declared at once.
        Matcher prefixMatch = sOgpNsPrefixPattern.matcher(prefixAttr);
        while (prefixMatch.find()) {
            setPrefixForObjectType(prefixMatch.group(2), prefixMatch.group(4));
        }
    }

    setDefaultPrefixes();
}

From source file:com.dom_distiller.client.OpenGraphProtocolParser.java

License:Open Source License

/**
 * Collects the META elements under {@code root} and stores in {@code mPropertyTable} the
 * content of those whose "property" attribute matches one of {@code mProperties}.
 *
 * When querySelectorAll is supported the candidates are narrowed with an attribute
 * selector (per known prefix if {@code doPrefixFiltering} is set); otherwise every META
 * element is examined.
 *
 * @param root the element whose META descendants are parsed
 */
private void parseMetaTags(Element root) {
    NodeList<Element> metaElems;
    if (!DomUtil.supportQuerySelectorAll(root)) {
        metaElems = root.getElementsByTagName("META");
    } else if (!doPrefixFiltering) {
        metaElems = DomUtil.querySelectorAll(root, "meta[property]");
    } else {
        // Attribute selectors with prefix
        // https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
        // TODO(wychen): Test the logic here in Chrome on Android.
        StringBuilder selector = new StringBuilder();
        for (Map.Entry<Prefix, String> entry : mPrefixes.entrySet()) {
            selector.append("meta[property^=\"").append(entry.getValue()).append("\"],");
        }
        // Drop the trailing comma left by the last selector.
        String query = selector.substring(0, selector.length() - 1);
        metaElems = DomUtil.querySelectorAll(root, query);
    }

    for (int i = 0; i < metaElems.getLength(); i++) {
        MetaElement meta = MetaElement.as(metaElems.getItem(i));
        String property = meta.getAttribute("property").toLowerCase();

        // Keep only the properties that matter for distillation.
        for (int j = 0; j < mProperties.length; j++) {
            String prefixWithColon = mPrefixes.get(mProperties[j].mPrefix) + ":";
            // startsWith() rather than equals(): |mProperties| uses "image:"
            // (IMAGE_STRUCT_PROP_PFX) for all image structured properties, which avoids
            // comparing property names both here and again in ImageParser.
            if (property.startsWith(prefixWithColon + mProperties[j].mName)) {
                property = property.substring(prefixWithColon.length());

                // A property without a parser is always stored; otherwise the parser
                // decides.
                boolean keep = mProperties[j].mParser == null
                        || mProperties[j].mParser.parse(property, meta.getContent(), mPropertyTable);
                if (keep) {
                    mPropertyTable.put(mProperties[j].mName, meta.getContent());
                }
            }
        }
    }
}

From source file:com.dom_distiller.client.PagingLinksFinder.java

License:Open Source License

/**
 * Scores every anchor under {@code root} as a candidate next- or previous-page link and
 * returns the href of the best one.
 *
 * Each anchor is first filtered (zero size, invisible, empty or non-http(s) href, same as
 * the current location, different host, extraneous or overly long text), then scored from
 * its text, class name, id, ancestors' class names and ids, and the shape of its href.
 * Candidates sharing an href share one score entry.  A candidate needs a score of at
 * least 50 to be returned.
 *
 * When paging debug logging is enabled, {@code mLinkDebugInfo} is cleared on entry and
 * the collected per-link debug strings are logged before returning.
 *
 * @param root element whose anchor descendants are examined
 * @param original_domain passed to {@code findBaseUrl()} and {@code getLocationHost()}
 * @param pageLink whether the next or the previous page link is wanted
 * @return the winning href with any trailing '/' removed, or null if no candidate
 *         scored at least 50
 */
private static String findPagingLink(Element root, String original_domain, PageLink pageLink) {
    // findPagingLink() is static, so clear mLinkDebugInfo before processing the links.
    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        mLinkDebugInfo.clear();
    }

    String baseUrl = findBaseUrl(original_domain);
    // Remove trailing '/' from window location href, because it'll be used to compare with
    // other href's whose trailing '/' are also removed.
    String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHref(), "\\/$", "");
    NodeList<Element> allLinks = root.getElementsByTagName("A");
    Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLinkObj>();

    // Loop through all links, looking for hints that they may be next- or previous- page links.
    // Things like having "page" in their textContent, className or id, or being a child of a
    // node with a page-y className or id.
    // Also possible: levenshtein distance? longest common subsequence?
    // After we do that, assign each page a score.
    for (int i = 0; i < allLinks.getLength(); i++) {
        AnchorElement link = AnchorElement.as(allLinks.getItem(i));

        int width = link.getOffsetWidth();
        int height = link.getOffsetHeight();
        if (width == 0 || height == 0) {
            appendDbgStrForLink(link, "ignored: sz=" + width + "x" + height);
            continue;
        }

        if (!DomUtil.isVisible(link)) {
            appendDbgStrForLink(link, "ignored: invisible");
            continue;
        }

        // Remove url anchor and then trailing '/' from link's href.
        // Note that AnchorElement.getHref() returns the absolute URI, so there's no need to
        // worry about relative links.
        String linkHref = StringUtil.findAndReplace(StringUtil.findAndReplace(link.getHref(), "#.*$", ""),
                "\\/$", "");

        // Ignore page link that is empty, not http/https, or same as current window location.
        // If the page link is same as the base URL:
        // - next page link: ignore it, since we would already have seen it.
        // - previous page link: don't ignore it, since some sites will simply have the same
        //                       base URL for the first page.
        if (linkHref.isEmpty() || !StringUtil.match(linkHref, "^https?://")
                || linkHref.equalsIgnoreCase(wndLocationHref)
                || (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(baseUrl))) {
            appendDbgStrForLink(link, "ignored: empty or same as current or base url " + baseUrl);
            continue;
        }

        // If it's on a different domain, skip it.
        String[] urlSlashes = StringUtil.split(linkHref, "\\/+");
        if (urlSlashes.length < 3 || // Expect at least the protocol, domain, and path.
                !getLocationHost(original_domain).equalsIgnoreCase(urlSlashes[1])) {
            appendDbgStrForLink(link, "ignored: different domain");
            continue;
        }

        // Use javascript innerText (instead of javascript textContent) to only get visible
        // text.
        String linkText = DomUtil.getInnerText(link);

        // If the linkText looks like it's not the next or previous page, skip it.
        if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length() > 25) {
            appendDbgStrForLink(link, "ignored: one of extra");
            continue;
        }

        // For next page link, if the initial part of the URL is identical to the base URL, but
        // the rest of it doesn't contain any digits, it's certainly not a next page link.
        // However, this doesn't apply to previous page link, because most sites will just have
        // the base URL for the first page.
        // TODO(kuan): baseUrl (returned by findBaseUrl()) is NOT the prefix of the current
        // window location, even though it appears to be so the way it's used here.
        // TODO(kuan): do we need to apply this heuristic to previous page links if current page
        // number is not 2?
        // NOTE(review): baseUrl is handed to findAndReplace() in the same position where
        // other calls pass a regex; if it is treated as a pattern, metacharacters in the URL
        // are not escaped - confirm.
        if (pageLink == PageLink.NEXT) {
            String linkHrefRemaining = StringUtil.findAndReplace(linkHref, baseUrl, "");
            if (!StringUtil.match(linkHrefRemaining, "\\d")) {
                appendDbgStrForLink(link, "ignored: no number beyond base url " + baseUrl);
                continue;
            }
        }

        PagingLinkObj linkObj = null;
        if (!possiblePages.containsKey(linkHref)) { // Have not encountered this href.
            linkObj = new PagingLinkObj(i, 0, linkText, linkHref);
            possiblePages.put(linkHref, linkObj);
        } else { // Have already encountered this href, append its text to existing entry's.
            linkObj = possiblePages.get(linkHref);
            linkObj.mLinkText += " | " + linkText;
        }

        // If the base URL isn't part of this URL, penalize this link.  It could still be the
        // link, but the odds are lower.
        // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html.
        // TODO(kuan): again, baseUrl (returned by findBaseUrl()) is NOT the prefix of the
        // current window location, even though it appears to be so the way it's used here.
        if (linkHref.indexOf(baseUrl) != 0) {
            linkObj.mScore -= 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": not part of base url " + baseUrl);
        }

        // Concatenate the link text with class name and id, and determine the score based on
        // existence of various paging-related words.
        String linkData = linkText + " " + link.getClassName() + " " + link.getId();
        appendDbgStrForLink(link, "txt+class+id=" + linkData);
        if (StringUtil.match(linkData, pageLink == PageLink.NEXT ? NEXT_LINK_REGEX : PREV_LINK_REGEX)) {
            linkObj.mScore += 50;
            // The conditional is parenthesized on its own so that " regex" is appended for
            // both directions, matching the other debug strings below.
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has "
                    + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
        }
        if (StringUtil.match(linkData, "pag(e|ing|inat)")) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has pag* word");
        }
        if (StringUtil.match(linkData, "(first|last)")) {
            // -65 is enough to negate any bonuses gotten from a '>' or similar arrow
            // character in the text.
            // If we already matched on "next", last is probably fine.
            // If we didn't, then it's bad.  Penalize.
            // Same for "prev".
            if ((pageLink == PageLink.NEXT && !StringUtil.match(linkObj.mLinkText, NEXT_LINK_REGEX))
                    || (pageLink == PageLink.PREV && !StringUtil.match(linkObj.mLinkText, PREV_LINK_REGEX))) {
                linkObj.mScore -= 65;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has first|last but no "
                        + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
            }
        }
        if (StringUtil.match(linkData, NEGATIVE_REGEX) || StringUtil.match(linkData, EXTRANEOUS_REGEX)) {
            linkObj.mScore -= 50;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has neg or extra regex");
        }
        if (StringUtil.match(linkData, pageLink == PageLink.NEXT ? PREV_LINK_REGEX : NEXT_LINK_REGEX)) {
            linkObj.mScore -= 200;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has opp of "
                    + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex");
        }

        // Check if a parent element contains page or paging or paginate.  Each of the
        // positive and negative adjustments is applied at most once per link; the walk up
        // the ancestors stops once both have been applied.
        boolean positiveMatch = false, negativeMatch = false;
        Element parent = link.getParentElement();
        while (parent != null && (!positiveMatch || !negativeMatch)) {
            String parentClassAndId = parent.getClassName() + " " + parent.getId();
            if (!positiveMatch && StringUtil.match(parentClassAndId, "pag(e|ing|inat)")) {
                linkObj.mScore += 25;
                positiveMatch = true;
                appendDbgStrForLink(link, "score=" + linkObj.mScore + ": posParent - " + parentClassAndId);
            }
            // TODO(kuan): to get 1st page for prev page link, this can't be applied; however,
            // the non-application might be the cause of recursive prev page being returned,
            // i.e. for page 1, it may incorrectly return page 3 for prev page link.
            if (!negativeMatch && StringUtil.match(parentClassAndId, NEGATIVE_REGEX)) {
                // If this is just something like "footer", give it a negative.
                // If it's something like "body-and-footer", leave it be.
                if (!StringUtil.match(parentClassAndId, POSITIVE_REGEX)) {
                    linkObj.mScore -= 25;
                    negativeMatch = true;
                    appendDbgStrForLink(link, "score=" + linkObj.mScore + ": negParent - " + parentClassAndId);
                }
            }
            parent = parent.getParentElement();
        }

        // If the URL looks like it has paging in it, add to the score.
        // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34.
        if (StringUtil.match(linkHref, "p(a|g|ag)?(e|ing|ination)?(=|\\/)[0-9]{1,2}")
                || StringUtil.match(linkHref, "(page|paging)")) {
            linkObj.mScore += 25;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has paging info");
        }

        // If the URL contains negative values, give a slight decrease.
        if (StringUtil.match(linkHref, EXTRANEOUS_REGEX)) {
            linkObj.mScore -= 15;
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has extra regex");
        }

        // If the link text can be parsed as a number, give it a minor bonus, with a slight bias
        // towards lower numbered pages.  This is so that pages that might not have 'next' in
        // their text can still get scored, and sorted properly by score.
        // TODO(kuan): it might be wrong to assume that it knows about other pages in the
        // document and that it starts on the first page.
        int linkTextAsNumber = 0;
        try {
            linkTextAsNumber = Integer.parseInt(linkText, 10);
        } catch (NumberFormatException ignored) {
            // Non-numeric link text is the common case; it simply gets no numeric bonus.
        }
        if (linkTextAsNumber > 0) {
            // Punish 1 since we're either already there, or it's probably before what we
            // want anyway.
            if (linkTextAsNumber == 1) {
                linkObj.mScore -= 10;
            } else {
                linkObj.mScore += Math.max(0, 10 - linkTextAsNumber);
            }
            appendDbgStrForLink(link, "score=" + linkObj.mScore + ": linktxt is a num");
        }
    } // for all links

    // Loop through all of the possible pages from above and find the top candidate for the next
    // page URL.  Require at least a score of 50, which is a relatively high confidence that
    // this page is the next link.
    PagingLinkObj topPage = null;
    for (PagingLinkObj pageObj : possiblePages.values()) {
        if (pageObj.mScore >= 50 && (topPage == null || topPage.mScore < pageObj.mScore)) {
            topPage = pageObj;
        }
    }

    String pagingHref = null;
    if (topPage != null) {
        pagingHref = StringUtil.findAndReplace(topPage.mLinkHref, "\\/$", "");
        appendDbgStrForLink(allLinks.getItem(topPage.mLinkIndex),
                "found: score=" + topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pagingHref);
    }

    if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
        logDbgInfoToConsole(pageLink, pagingHref, allLinks);
    }

    return pagingHref;
}

From source file:com.dom_distiller.client.PagingLinksFinder.java

License:Open Source License

/**
 * Logs the paging debug information to the console: first a summary line with the number
 * of links and the next or previous page link found, then one line per link with its
 * href, whitespace-normalized visible text and accumulated debug string.
 *
 * Where the output ends up depends on how the code is run:
 * - "ant test.dev": test output file.
 * - chrome browser distiller viewer: chrome logfile.
 * TODO(kuan): investigate how to get logging when running "ant test.prod" - currently,
 * nothing appears.  In the meantime, throwing an exception with a log message at suspicious
 * codepoints can produce a call stack and help debugging, albeit tediously.
 *
 * @param pageLink whether the search was for the next or the previous page link
 * @param pagingHref the link that was found, or null
 * @param allLinks the links that were processed
 */
private static void logDbgInfoToConsole(PageLink pageLink, String pagingHref, NodeList<Element> allLinks) {
    String direction = pageLink == PageLink.NEXT ? "next: " : "prev: ";
    String foundHref = pagingHref != null ? pagingHref : "null";
    LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " + direction + foundHref);

    for (int idx = 0; idx < allLinks.getLength(); idx++) {
        AnchorElement anchor = AnchorElement.as(allLinks.getItem(idx));

        // javascript innerText (rather than textContent) yields only the visible text.
        String[] tokens = StringUtil.split(DomUtil.getInnerText(anchor), "\\s+");

        // Re-join the tokens with single spaces to squeeze out redundant whitespace.
        StringBuilder normalized = new StringBuilder();
        for (int t = 0; t < tokens.length; t++) {
            if (t > 0) {
                normalized.append(" ");
            }
            normalized.append(tokens[t]);
        }

        LogUtil.logToConsole(idx + ")" + anchor.getHref() + ", txt=[" + normalized.toString() + "], dbg=["
                + mLinkDebugInfo.get(anchor) + "]");
    }
}

From source file:com.dom_distiller.client.SchemaOrgParser.java

License:Open Source License

/**
 * Appends to {@code allProp} the element {@code e} and each of its element descendants
 * that has an "ITEMPROP" or "ITEMSCOPE" attribute.  An element is added before its
 * descendants are visited.
 *
 * @param e the element at which the recursive walk starts
 * @param allProp output list receiving the matching elements
 */
private void getElementsWithItemAttribute(Element e, List<Element> allProp) {
    boolean hasItemAttribute = e.hasAttribute("ITEMPROP") || e.hasAttribute("ITEMSCOPE");
    if (hasItemAttribute) {
        allProp.add(e);
    }

    NodeList<Node> kids = e.getChildNodes();
    for (int idx = 0; idx < kids.getLength(); idx++) {
        Node kid = kids.getItem(idx);
        // Only element nodes can carry attributes or have children worth visiting.
        if (kid.getNodeType() == Node.ELEMENT_NODE) {
            getElementsWithItemAttribute(Element.as(kid), allProp);
        }
    }
}

From source file:com.dom_distiller.client.SchemaOrgParser.java

License:Open Source License

/**
 * Appends to {@code allAuthors} the element {@code e} and each of its element descendants
 * that is an A or LINK tag whose "REL" attribute equals {@code AUTHOR_REL}, ignoring case.
 * An element is added before its descendants are visited.
 *
 * @param e the element at which the recursive walk starts
 * @param allAuthors output list receiving the matching elements
 */
private void getAuthorElements(Element e, List<Element> allAuthors) {
    String tag = e.getTagName();
    boolean isAnchorOrLink = tag.equalsIgnoreCase("A") || tag.equalsIgnoreCase("LINK");
    if (isAnchorOrLink && e.getAttribute("REL").equalsIgnoreCase(AUTHOR_REL)) {
        allAuthors.add(e);
    }

    NodeList<Node> kids = e.getChildNodes();
    for (int idx = 0; idx < kids.getLength(); idx++) {
        Node kid = kids.getItem(idx);
        // Skip text, comment and other non-element nodes.
        if (kid.getNodeType() == Node.ELEMENT_NODE) {
            getAuthorElements(Element.as(kid), allAuthors);
        }
    }
}