List of usage examples for org.jsoup.nodes.Element#tagName()
public String tagName()
From source file:org.aliuge.crawler.jobconf.ExtractConfig.java
/** * ????/*from w w w . jav a2 s. com*/ * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) { Elements extractElement = doc.select("extract"); super.setJobName(doc.select("job").attr("name")); super.setIndexName(doc.select("job").attr("indexName")); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } super.setExtractConfig(this); this.templates.add(extractTemplate); } //super.setExtractConfig(this); return this; }
From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java
/**
 * Returns the plain-text fragment contributed by a single HTML node.
 *
 * <p>Text nodes yield their whole text. Elements yield a separator that
 * depends on the tag: a line break for {@code BR_TAG}, the result of
 * {@code convertListElement} for list containers, an indented {@code "- "}
 * bullet for {@code LI_TAG}, a blank line for {@code P_TAG}, and the
 * alternative text for {@code IMG_TAG}. Anything else yields an empty string.
 *
 * @param htmlNode wrapper giving the underlying node and its list nesting level
 * @return the text fragment for this node, never {@code null} for the
 *         branches that return literals
 */
private String convertNodeToText(HTMLNode htmlNode) {
    Node node = htmlNode.underlyingNode;
    if (node instanceof TextNode) {
        return ((TextNode) node).getWholeText();
    }
    if (!(node instanceof Element)) {
        return "";
    }

    Element element = (Element) node;
    String tagName = element.tagName();
    if (tagName.equals(BR_TAG)) {
        return "\n";
    }
    // isList() already matches both UL_TAG and OL_TAG, so the former separate
    // OL_TAG branch that followed this check could never run and was removed.
    if (isList(element)) {
        return convertListElement(htmlNode.listNestedLevel);
    }
    if (tagName.equals(LI_TAG)) {
        return "\n" + StringUtils.repeat(" ", htmlNode.listNestedLevel) + "- ";
    }
    if (tagName.equals(P_TAG)) {
        return "\n\n";
    }
    if (tagName.equals(IMG_TAG)) {
        return generateImageAlternativeText(element);
    }
    return "";
}
From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java
/**
 * Tells whether the element is a list container.
 *
 * @param element the element to inspect
 * @return {@code true} when the tag name equals {@code UL_TAG} or {@code OL_TAG}
 */
private boolean isList(Element element) {
    String tagName = element.tagName();
    if (tagName.equals(UL_TAG)) {
        return true;
    }
    return tagName.equals(OL_TAG);
}
From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java
/**
 * Chooses the position associated with a node.
 *
 * @param node the node to inspect
 * @return {@code Position.PREFIX} when the node is an element whose tag name
 *         equals {@code LI_TAG}, otherwise {@code Position.SUFFIX}
 */
private Position getPosition(Node node) {
    boolean isListItem = node instanceof Element
            && ((Element) node).tagName().equals(LI_TAG);
    return isListItem ? Position.PREFIX : Position.SUFFIX;
}
From source file:org.asqatasun.processing.ProcessRemarkServiceImpl.java
/**
 * Finds the position of an element among all elements of
 * {@code jsoupDocument} that share its tag name.
 *
 * @param element the element to locate
 * @return the zero-based index of the first element equal to {@code element}
 *         in the same-tag list, or {@code -1} when none is equal
 */
private int getElementIndex(Element element) {
    int index = 0;
    for (Element candidate : jsoupDocument.getElementsByTag(element.tagName())) {
        if (candidate.equals(element)) {
            return index;
        }
        index++;
    }
    return -1;
}
From source file:org.asqatasun.processing.ProcessRemarkServiceImpl.java
/**
 * Produces a properly closed version of a truncated element snippet.
 *
 * <p>An auto-closing original is returned untouched. When exactly one
 * element is still open in the truncated snippet, only the inner element is
 * closed. When more than one is open, the inner element is closed first and
 * then the element itself; otherwise only the element itself is closed.
 *
 * @param element the element the snippet was taken from
 * @param originalElementHtml the complete HTML of the element
 * @param truncatedElementHtml the truncated HTML to repair
 * @return the repaired snippet
 */
private String properlyCloseSnippet(Element element, String originalElementHtml,
        String truncatedElementHtml) {
    if (isElementAutoClose(originalElementHtml)) {
        return originalElementHtml;
    }
    if (getElementCurrentlyOpenCount(truncatedElementHtml) == 1) {
        return closeInnerElement(originalElementHtml, truncatedElementHtml);
    }

    String snippet = truncatedElementHtml;
    if (getElementCurrentlyOpenCount(snippet) > 1) {
        snippet = closeInnerElement(originalElementHtml, snippet);
    }
    return closeElement(snippet, element.tagName());
}
From source file:org.asqatasun.rules.accessiweb22.Aw22Rule10071.java
/**
 * Records one focusable element and classifies it.
 *
 * <p>A {@code null} element is ignored. Otherwise the focusable-element
 * counter is incremented; if the tag name is in the excluded list the
 * {@code focusableElementExcluded} flag is raised, else the element is added
 * to the handler when its outline is not visible and its tag is not
 * {@code BODY_ELEMENT} (compared ignoring case).
 *
 * @param element the jsoup element being examined, may be {@code null}
 * @param domElement the DOM counterpart passed to the outline visibility check
 * @param elementHandler receives elements whose outline is not visible
 */
private void treatFocusableElement(Element element, DomElement domElement,
        ElementHandler<Element> elementHandler) {
    if (element == null) {
        return;
    }
    nbOfFocusableElements++;

    String tagName = element.tagName();
    if (getFocusableElementExcludedList().contains(tagName)) {
        focusableElementExcluded = true;
        return;
    }
    if (!isOutlineVisible(domElement)
            && !StringUtils.equalsIgnoreCase(tagName, BODY_ELEMENT)) {
        elementHandler.add(element);
    }
}
From source file:org.asqatasun.rules.elementchecker.lang.LangChecker.java
/**
 * Builds the testable text of an element, optionally including its children.
 *
 * <p>The text of the element itself comes from
 * {@code testableTextElementBuilder}, which defaults to a
 * {@code CompleteTextElementBuilder} when none has been set. With
 * {@code extractRecursively}, the text of every child that has no language
 * defined and is not excluded is appended after a
 * {@code TextElementBuilder.SPACER}. Runs of spaces in the result are
 * collapsed to a single space.
 *
 * @param element the element to extract text from
 * @param extractRecursively whether child elements are also visited
 * @return the extracted text, or {@code null} when the element's tag name is
 *         in {@code EXCLUDED_ELEMENTS_LIST}
 */
protected String extractTextFromElement(Element element, boolean extractRecursively) {
    if (EXCLUDED_ELEMENTS_LIST.contains(element.tagName())) {
        return null;
    }
    if (testableTextElementBuilder == null) {
        testableTextElementBuilder = new CompleteTextElementBuilder();
    }

    StringBuilder text = new StringBuilder();
    text.append(testableTextElementBuilder.buildTextFromElement(element));

    if (extractRecursively) {
        for (Element child : element.children()) {
            if (isLangDefinedForElement(child)
                    || EXCLUDED_ELEMENTS_LIST.contains(child.tagName())) {
                continue;
            }
            text.append(TextElementBuilder.SPACER);
            text.append(extractTextFromElement(child, true));
        }
    }
    return text.toString().replaceAll(" +", " ");
}
From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java
/** * /*ww w.j a v a 2 s . c om*/ * @param linkElement * @param linkText * @return whether the current link have a context */ protected boolean doesLinkHaveContext(Element linkElement, String linkText) { // does the current link have a title attribute? if (considerTitleAsContext && linkElement.hasAttr(TITLE_ATTR) && !StringUtils.equalsIgnoreCase(linkElement.attr(TITLE_ATTR), linkText)) { return true; } if (linkElement.hasAttr(ARIA_LABEL_ATTR) && StringUtils.isNotBlank(linkElement.attr(ARIA_LABEL_ATTR))) { return true; } if (linkElement.hasAttr(ARIA_LABELLEDBY_ATTR) && StringUtils.isNotBlank(linkElement.attr(ARIA_LABELLEDBY_ATTR))) { return true; } // does the parent of the current link have some text? if (StringUtils.isNotBlank(linkElement.parent().ownText())) { return true; } // does the current element have a previous sibling of heading type? if (isOneOfPrecedingSiblingofHeadingType(linkElement)) { return true; } // does one of the parent of the current element have a previous sibling // of heading type or is found in the PARENT_CONTEXT_ELEMENTS list? for (Element parent : linkElement.parents()) { if (PARENT_CONTEXT_ELEMENTS.contains(parent.tagName()) || isOneOfPrecedingSiblingofHeadingType(parent)) { return true; } } return false; }
From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java
/**
 * Walks backwards through the preceding element siblings looking for a heading.
 *
 * <p>A sibling qualifies when its tag name is in
 * {@code PREV_SIBLING_CONTEXT_ELEMENTS} or when selecting
 * {@code CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY} on it yields a match.
 *
 * @param element the element whose preceding siblings are inspected
 * @return whether one of the preceding siblings is of heading type
 */
private boolean isOneOfPrecedingSiblingofHeadingType(Element element) {
    for (Element sibling = element.previousElementSibling();
            sibling != null;
            sibling = sibling.previousElementSibling()) {
        if (PREV_SIBLING_CONTEXT_ELEMENTS.contains(sibling.tagName())) {
            return true;
        }
        if (!sibling.select(CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY).isEmpty()) {
            return true;
        }
    }
    return false;
}