List of usage examples for com.google.gwt.dom.client NodeList getItem
public T getItem(int index)
From source file:org.chromium.distiller.OpenGraphProtocolParser.java
License:Open Source License
private void parseMetaTags(Element root) { NodeList<Element> allMeta = null; if (doPrefixFiltering) { // Attribute selectors with prefix // https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors String query = ""; for (Map.Entry<Prefix, String> entry : mPrefixes.entrySet()) { query += "meta[property^=\"" + entry.getValue() + "\"],"; }/*from w ww .j av a 2s .c o m*/ query = query.substring(0, query.length() - 1); allMeta = DomUtil.querySelectorAll(root, query); } else { allMeta = DomUtil.querySelectorAll(root, "meta[property]"); } for (int i = 0; i < allMeta.getLength(); i++) { MetaElement meta = MetaElement.as(allMeta.getItem(i)); String property = meta.getAttribute("property").toLowerCase(); // Only store properties that we care about for distillation. for (int j = 0; j < mProperties.length; j++) { String prefixWithColon = mPrefixes.get(mProperties[j].mPrefix) + ":"; // Note that property.equals() won't work here because |mProperties| uses "image:" // (IMAGE_STRUCT_PROP_PFX) for all image structured properties, so as to prevent // repetitive property name comparison - here and then again in ImageParser. if (!property.startsWith(prefixWithColon + mProperties[j].mName)) continue; property = property.substring(prefixWithColon.length()); boolean addProperty = true; if (mProperties[j].mParser != null) { addProperty = mProperties[j].mParser.parse(property, meta.getContent(), mPropertyTable); } if (addProperty) mPropertyTable.put(mProperties[j].mName, meta.getContent()); } } }
From source file:org.chromium.distiller.PageParameterParser.java
License:Open Source License
/** * Actually implements PageParameterParser.parse(), see above description for parse(). *//* w w w . j av a 2 s . c om*/ private PageParamInfo parseDocument(Element root, String originalUrl) { double startTime = DomUtil.getTime(); mDocUrl = sHrefCleaner.replace(originalUrl, ""); mParsedUrl = ParsedUrl.create(mDocUrl); if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. AnchorElement baseAnchor = PagingLinksFinder .createAnchorWithBase(PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); NodeList<Element> allLinks = root.getElementsByTagName("A"); int idx = 0; while (idx < allLinks.getLength()) { final AnchorElement link = AnchorElement.as(allLinks.getItem(idx)); PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); if (pageInfoAndText == null) { idx++; continue; } // This link is a good candidate for pagination. // Close current group of adjacent numbers, add a new group if necessary. mAdjacentNumbersGroups.addGroup(); // Before we append the link to the new group of adjacent numbers, check if it's // preceded by a text node with numeric text; if so, add it before the link. findAndAddClosestValidLeafNodes(link, false, true, null); // Add the link to the current group of adjacent numbers. mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); // Add all following text nodes and links with numeric text. mNumForwardLinksProcessed = 0; findAndAddClosestValidLeafNodes(link, false, false, baseAnchor); // Skip the current link and links already processed in the forward // findandAddClosestValidLeafNodes(). idx += 1 + mNumForwardLinksProcessed; } // while there're links. mAdjacentNumbersGroups.cleanup(); LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); startTime = DomUtil.getTime(); PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl); LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); return info; }
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
private static String findPagingLink(Element root, String original_url, PageLink pageLink) { // findPagingLink() is static, so clear mLinkDebugInfo before processing the links. if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { mLinkDebugInfo.clear();/*from w w w. j a v a 2 s . co m*/ } String folderUrl = StringUtil.findAndReplace(original_url, "\\/[^/]*$", ""); // Remove trailing '/' from window location href, because it'll be used to compare with // other href's whose trailing '/' are also removed. String wndLocationHref = StringUtil.findAndReplace(original_url, "\\/$", ""); NodeList<Element> allLinks = root.getElementsByTagName("A"); Set<PagingLinkObj> possiblePages = new HashSet<PagingLinkObj>(); Set<String> bannedUrls = new HashSet<String>(); AnchorElement baseAnchor = createAnchorWithBase(getBaseUrlForRelative(root, original_url)); // The trailing "/" is essential to ensure the whole hostname is matched, and not just the // prefix of the hostname. It also maintains the requirement of having a "path" in the URL. String allowedPrefix = getScheme(original_url) + "://" + getHostname(original_url) + "/"; RegExp regPrefixNum = RegExp.compile("^" + StringUtil.regexEscape(allowedPrefix) + ".*\\d", "i"); // Loop through all links, looking for hints that they may be next- or previous- page links. // Things like having "page" in their textContent, className or id, or being a child of a // node with a page-y className or id. // Also possible: levenshtein distance? longest common subsequence? // After we do that, assign each page a score. for (int i = 0; i < allLinks.getLength(); i++) { AnchorElement link = AnchorElement.as(allLinks.getItem(i)); // Note that AnchorElement.getHref() returns the absolute URI, so there's no need to // worry about relative links. String linkHref = resolveLinkHref(link, baseAnchor); if (pageLink == PageLink.NEXT) { if (!regPrefixNum.test(linkHref)) { appendDbgStrForLink(link, "ignored: not prefix + num"); continue; } } else if (pageLink == PageLink.PREV) { if (!linkHref.substring(0, allowedPrefix.length()).equalsIgnoreCase(allowedPrefix)) { appendDbgStrForLink(link, "ignored: prefix"); continue; } } int width = link.getOffsetWidth(); int height = link.getOffsetHeight(); if (width == 0 || height == 0) { appendDbgStrForLink(link, "ignored: sz=" + width + "x" + height); continue; } if (!DomUtil.isVisible(link)) { appendDbgStrForLink(link, "ignored: invisible"); continue; } // Remove url anchor and then trailing '/' from link's href. linkHref = REG_HREF_CLEANER.replace(linkHref, ""); appendDbgStrForLink(link, "-> " + linkHref); // Ignore page link that is the same as current window location. // If the page link is same as the folder URL: // - next page link: ignore it, since we would already have seen it. // - previous page link: don't ignore it, since some sites will simply have the same // folder URL for the first page. if (linkHref.equalsIgnoreCase(wndLocationHref) || (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(folderUrl))) { appendDbgStrForLink(link, "ignored: same as current or folder url " + folderUrl); continue; } // Use javascript innerText (instead of javascript textContent) to only get visible // text. String linkText = DomUtil.getInnerText(link); // If the linkText looks like it's not the next or previous page, skip it. if (linkText.length() > 25) { appendDbgStrForLink(link, "ignored: link text too long"); continue; } // If the linkText contains banned text, skip it, and also ban other anchors with the // same link URL. if (REG_EXTRANEOUS.test(linkText)) { appendDbgStrForLink(link, "ignored: one of extra"); bannedUrls.add(linkHref); continue; } // For next page link, if the initial part of the URL is identical to the folder URL, but // the rest of it doesn't contain any digits, it's certainly not a next page link. // However, this doesn't apply to previous page link, because most sites will just have // the folder URL for the first page. // TODO(kuan): do we need to apply this heuristic to previous page links if current page // number is not 2? if (pageLink == PageLink.NEXT) { String linkHrefRemaining = linkHref; if (linkHref.startsWith(folderUrl)) { linkHrefRemaining = linkHref.substring(folderUrl.length()); } if (!REG_NUMBER.test(linkHrefRemaining)) { appendDbgStrForLink(link, "ignored: no number beyond folder url " + folderUrl); continue; } } PagingLinkObj linkObj = null; linkObj = new PagingLinkObj(i, 0, linkText, linkHref); possiblePages.add(linkObj); // If the folder URL isn't part of this URL, penalize this link. It could still be the // link, but the odds are lower. // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html. if (linkHref.indexOf(folderUrl) != 0) { linkObj.mScore -= 25; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": not part of folder url " + folderUrl); } // Concatenate the link text with class name and id, and determine the score based on // existence of various paging-related words. String linkData = linkText + " " + link.getClassName() + " " + link.getId(); appendDbgStrForLink(link, "txt+class+id=" + linkData); if (pageLink == PageLink.NEXT ? REG_NEXT_LINK.test(linkData) : REG_PREV_LINK.test(linkData)) { linkObj.mScore += 50; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has " + (pageLink == PageLink.NEXT ? "next" : "prev" + " regex")); } if (REG_PAGINATION.test(linkData)) { linkObj.mScore += 25; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has pag* word"); } if (REG_FIRST_LAST.test(linkData)) { // -65 is enough to negate any bonuses gotten from a > or in the text. // If we already matched on "next", last is probably fine. // If we didn't, then it's bad. Penalize. // Same for "prev". if ((pageLink == PageLink.NEXT && !REG_NEXT_LINK.test(linkObj.mLinkText)) || (pageLink == PageLink.PREV && !REG_PREV_LINK.test(linkObj.mLinkText))) { linkObj.mScore -= 65; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has first|last but no " + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex"); } } if (REG_NEGATIVE.test(linkData) || REG_EXTRANEOUS.test(linkData)) { linkObj.mScore -= 50; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has neg or extra regex"); } if (pageLink == PageLink.NEXT ? REG_PREV_LINK.test(linkData) : REG_NEXT_LINK.test(linkData)) { linkObj.mScore -= 200; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has opp of " + (pageLink == PageLink.NEXT ? "next" : "prev") + " regex"); } // Check if a parent element contains page or paging or paginate. boolean positiveMatch = false, negativeMatch = false; Element parent = link.getParentElement(); while (parent != null && (positiveMatch == false || negativeMatch == false)) { String parentClassAndId = parent.getClassName() + " " + parent.getId(); if (!positiveMatch && REG_PAGINATION.test(parentClassAndId)) { linkObj.mScore += 25; positiveMatch = true; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": posParent - " + parentClassAndId); } // TODO(kuan): to get 1st page for prev page link, this can't be applied; however, // the non-application might be the cause of recursive prev page being returned, // i.e. for page 1, it may incorrectly return page 3 for prev page link. if (!negativeMatch && REG_NEGATIVE.test(parentClassAndId)) { // If this is just something like "footer", give it a negative. // If it's something like "body-and-footer", leave it be. if (!REG_POSITIVE.test(parentClassAndId)) { linkObj.mScore -= 25; negativeMatch = true; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": negParent - " + parentClassAndId); } } parent = parent.getParentElement(); } // If the URL looks like it has paging in it, add to the score. // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34. if (REG_LINK_PAGINATION.test(linkHref) || REG_PAGINATION.test(linkHref)) { linkObj.mScore += 25; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has paging info"); } // If the URL contains negative values, give a slight decrease. if (REG_EXTRANEOUS.test(linkHref)) { linkObj.mScore -= 15; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": has extra regex"); } // If the link text is too long, penalize the link. if (linkText.length() > 10) { linkObj.mScore -= linkText.length(); appendDbgStrForLink(link, "score=" + linkObj.mScore + ": text too long"); } // If the link text can be parsed as a number, give it a minor bonus, with a slight bias // towards lower numbered pages. This is so that pages that might not have 'next' in // their text can still get scored, and sorted properly by score. // TODO(kuan): it might be wrong to assume that it knows about other pages in the // document and that it starts on the first page. int linkTextAsNumber = JavaScript.parseInt(linkText, 10); if (linkTextAsNumber > 0) { // Punish 1 since we're either already there, or it's probably before what we // want anyway. if (linkTextAsNumber == 1) { linkObj.mScore -= 10; } else { linkObj.mScore += Math.max(0, 10 - linkTextAsNumber); } appendDbgStrForLink(link, "score=" + linkObj.mScore + ": linktxt is a num (" + linkTextAsNumber + ")"); } Integer diff = pageDiff(original_url, linkHref, link, allowedPrefix.length()); if (diff != null) { if (((pageLink == PageLink.NEXT) && (diff == 1)) || ((pageLink == PageLink.PREV) && (diff == -1))) { linkObj.mScore += 25; appendDbgStrForLink(link, "score=" + linkObj.mScore + ": diff = " + diff); } } } // for all links // Loop through all of the possible pages from above and find the top candidate for the next // page URL. Require at least a score of 50, which is a relatively high confidence that // this page is the next link. PagingLinkObj topPage = null; if (!possiblePages.isEmpty()) { for (PagingLinkObj pageObj : possiblePages) { if (bannedUrls.contains(pageObj.mLinkHref)) { continue; } if (pageObj.mScore >= 50 && (topPage == null || topPage.mScore < pageObj.mScore)) { topPage = pageObj; } } } String pagingHref = null; if (topPage != null) { pagingHref = StringUtil.findAndReplace(topPage.mLinkHref, "\\/$", ""); appendDbgStrForLink(allLinks.getItem(topPage.mLinkIndex), "found: score=" + topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pagingHref); } if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { logDbgInfoToConsole(pageLink, pagingHref, allLinks); } return pagingHref; }
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
public static String getBaseUrlForRelative(Element root, String original_url) { NodeList<Element> bases = root.getElementsByTagName("BASE"); if (bases.getLength() == 0) { return original_url; }//from w ww . java2 s .c o m // Note that base.href can also be relative. // If multiple <base> elements are specified, only the first href and // first target value are used; all others are ignored. // Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base AnchorElement baseAnchor = createAnchorWithBase(original_url); return resolveLinkHref(BaseElement.as(bases.getItem(0)).getAttribute("href"), baseAnchor); }
From source file:org.chromium.distiller.PagingLinksFinder.java
License:Open Source License
private static void logDbgInfoToConsole(PageLink pageLink, String pagingHref, NodeList<Element> allLinks) { // This logs the following to the console: // - number of links processed // - the next or previous page link found // - for each link: its href, text, concatenated debug string. // Location of logging output is different when running in different modes: // - "ant test.dev": test output file. // - chrome browser distiller viewer: chrome logfile. // (TODO)kuan): investigate how to get logging when running "ant test.prod" - currently, // nothing appears. In the meantime, throwing an exception with a log message at suspicious // codepoints can produce a call stack and help debugging, albeit tediously. LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " + (pageLink == PageLink.NEXT ? "next: " : "prev: ") + (pagingHref != null ? pagingHref : "null")); for (int i = 0; i < allLinks.getLength(); i++) { AnchorElement link = AnchorElement.as(allLinks.getItem(i)); // Use javascript innerText (instead of javascript textContent) to get only visible // text.//from w ww . j av a 2s . c om String text = DomUtil.getInnerText(link); // Trim unnecessary white spaces from text. String[] words = StringUtil.split(text, "\\s+"); text = ""; for (int w = 0; w < words.length; w++) { text += words[w]; if (w < words.length - 1) text += " "; } LogUtil.logToConsole( i + ")" + link.getHref() + ", txt=[" + text + "], dbg=[" + mLinkDebugInfo.get(link) + "]"); } }
From source file:org.chromium.distiller.SchemaOrgParser.java
License:Open Source License
private void parse(Element root) { NodeList<Element> allProp = DomUtil.querySelectorAll(root, "[ITEMPROP],[ITEMSCOPE]"); // Root node (html) is not included in the result of querySelectorAll, so need to // handle it explicitly here. parseElement(root, null);// w ww. j a v a 2 s.co m for (int i = 0; i < allProp.getLength(); i++) { Element e = allProp.getItem(i); parseElement(e, getItemScopeParent(e)); } // As per http://schema.org/author (or http://schema.org/Article and search for "author" // property), if <a> or <link> tags specify rel="author", extract it. allProp = DomUtil.querySelectorAll(root, "A[rel=author],LINK[rel=author]"); for (int i = 0; i < allProp.getLength(); i++) { Element e = allProp.getItem(i); if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute(e); } }
From source file:org.chromium.distiller.TableClassifier.java
License:Open Source License
private static List<Element> getDirectDescendants(Element t) { List<Element> directDescendants = new ArrayList<Element>(); NodeList<Element> allDescendants = t.getElementsByTagName("*"); if (!hasNestedTables(t)) { for (int i = 0; i < allDescendants.getLength(); i++) { directDescendants.add(allDescendants.getItem(i)); }/*from ww w. j a v a2 s. c o m*/ } else { for (int i = 0; i < allDescendants.getLength(); i++) { // Check if the current element is a direct descendant of the |t| table element in // question, as opposed to being a descendant of a nested table in |t|. Element e = allDescendants.getItem(i); Element parent = e.getParentElement(); while (parent != null) { if (parent.hasTagName("TABLE")) { if (parent == t) directDescendants.add(e); break; } parent = parent.getParentElement(); } } } return directDescendants; }
From source file:org.cruxframework.crux.core.client.utils.ScriptTagHandler.java
License:Apache License
/** * Evaluates any script inserted on the given element using element.innerHTML. * @param element/*from www . j av a 2 s . c o m*/ */ public static void evaluateScripts(Element element, ScriptLoadCallback callback) { if (scripts == null) { scripts = new ArrayList<Element>(); } NodeList<Element> scriptElements = element.getElementsByTagName("script"); if (scriptElements != null) { for (int i = 0; i < scriptElements.getLength(); i++) { scripts.add(scriptElements.getItem(i)); } } processNextScript(callback); }
From source file:org.cruxframework.crux.widgets.client.grid.GridFlexTable.java
License:Apache License
public void joinCells(int row) { TableRowElement tr = this.getRowElement(row).cast(); NodeList<TableCellElement> cells = tr.getCells(); int numTds = cells.getLength(); if (numTds > 1) { for (int i = 1; i < numTds; i++) { // We always remove the second cell. // This is because we want to keep the first one // and the cell indexes are in movement due // to the removing process. tr.removeChild(cells.getItem(1)); }/*from w ww. j a v a2s . com*/ TableCellElement td = this.getCellElement(row, 0).cast(); td.setAttribute("colSpan", "" + numTds); td.setInnerHTML("."); } }
From source file:org.dashbuilder.renderer.c3.client.charts.area.C3AreaChartView.java
License:Apache License
@Override public void fixAreaOpacity() { // This is a workaround for: https://github.com/c3js/c3/issues/2551 if (chart != null) { NodeList<Element> paths = chart.getElement().getElementsByTagName("path"); int n = paths.getLength(); for (int i = 0; i < n; i++) { Element child = paths.getItem(i); String className = child.getAttribute("class"); if (className != null && className.contains("c3-area-")) { child.getStyle().setOpacity(0.2); }/*from w w w.j a v a2 s . c o m*/ } } }