Example usage for org.jsoup.nodes Element tagName

List of usage examples for org.jsoup.nodes Element tagName

Introduction

On this page you can find usage examples for the tagName() method of org.jsoup.nodes.Element.

Prototype

public String tagName() 

Source Link

Document

Get the name of the tag for this element.

Usage

From source file:org.aliuge.crawler.jobconf.ExtractConfig.java

/**
 * ????/*from w  w  w .  jav  a2  s. com*/
 * @param doc
 * @return
 * @throws ConfigurationException
 */
public ExtractConfig loadConfig(Document doc) {
    Elements extractElement = doc.select("extract");
    super.setJobName(doc.select("job").attr("name"));
    super.setIndexName(doc.select("job").attr("indexName"));
    String temp = extractElement.select("threadNum").text();
    if (StringUtils.isNotBlank(temp)) {
        this.threadNum = Integer.parseInt(temp);
    }

    Elements templateElement = extractElement.select("extract").select("template");
    Iterator<Element> it = templateElement.iterator();

    while (it.hasNext()) {
        Element template = it.next();
        ExtractTemplate extractTemplate = new ExtractTemplate();
        // ?Url????
        Elements urlPatternElement = template.select("url");
        List<Pattern> patterns = Lists.newArrayList();
        for (Element urlElement : urlPatternElement) {
            patterns.add(Pattern.compile(urlElement.text()));
        }
        extractTemplate.setUrlPattern(patterns);
        extractTemplate.setName(template.attr("name"));
        // ???
        Elements selectElement = template.select("elements").first().children();
        for (Element element : selectElement) {
            if ("element".equals(element.tagName())) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(element.tagName())) {
                IFConditions ifConditions = IFConditions.create(element);
                extractTemplate.addConditions(ifConditions);
            }
        }
        super.setExtractConfig(this);
        this.templates.add(extractTemplate);
    }
    //super.setExtractConfig(this);
    return this;
}

From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java

/**
 * Returns the plain-text fragment contributed by a single HTML node.
 *
 * <p>Text nodes contribute their whole text. Elements contribute a fragment
 * chosen by tag name: a line break for {@code BR_TAG}, the result of
 * {@code convertListElement} for list containers, an indented {@code "- "}
 * bullet for {@code LI_TAG}, a blank line for {@code P_TAG}, and the image
 * alternative text for {@code IMG_TAG}. Anything else contributes nothing.
 *
 * @param htmlNode wrapper holding the underlying node and its list nesting level
 * @return the text fragment for the node, or an empty string when it has none
 */
private String convertNodeToText(HTMLNode htmlNode) {
    Node node = htmlNode.underlyingNode;
    if (node instanceof TextNode) {
        return ((TextNode) node).getWholeText();
    }
    if (!(node instanceof Element)) {
        return "";
    }

    Element element = (Element) node;
    String tagName = element.tagName();
    if (tagName.equals(BR_TAG)) {
        return "\n";
    }
    // isList matches both UL_TAG and OL_TAG, so no separate OL_TAG case is
    // needed after this check (the former one could never be reached).
    if (isList(element)) {
        return convertListElement(htmlNode.listNestedLevel);
    }
    if (tagName.equals(LI_TAG)) {
        return "\n" + StringUtils.repeat(" ", htmlNode.listNestedLevel) + "- ";
    }
    if (tagName.equals(P_TAG)) {
        return "\n\n";
    }
    if (tagName.equals(IMG_TAG)) {
        return generateImageAlternativeText(element);
    }
    return "";
}

From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java

/**
 * Tells whether the element is a list container, that is, whether its tag
 * name equals {@code UL_TAG} or {@code OL_TAG}.
 *
 * @param element the element to inspect
 * @return {@code true} when the element's tag name is one of the two list tags
 */
private boolean isList(Element element) {
    String tagName = element.tagName();
    if (tagName.equals(UL_TAG)) {
        return true;
    }
    return tagName.equals(OL_TAG);
}

From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java

/**
 * Chooses the position associated with a node: {@code Position.PREFIX} for an
 * element whose tag name equals {@code LI_TAG}, {@code Position.SUFFIX} for
 * every other node.
 *
 * @param node the node to classify
 * @return the position for the node
 */
private Position getPosition(Node node) {
    boolean isListItem = node instanceof Element
            && ((Element) node).tagName().equals(LI_TAG);
    return isListItem ? Position.PREFIX : Position.SUFFIX;
}

From source file:org.asqatasun.processing.ProcessRemarkServiceImpl.java

/**
 * Finds the position of the given element among all elements of
 * {@code jsoupDocument} that share its tag name.
 *
 * <p>NOTE(review): the previous comment described this as locating the source
 * line of the node; the code itself only computes an index, so callers
 * presumably map that index to a line elsewhere. Confirm against the caller.
 *
 * @param element the element to look up
 * @return the zero-based index of the first equal element among the elements
 *         with the same tag name, or {@code -1} when none is equal
 */
private int getElementIndex(Element element) {
    int index = 0;
    for (Element candidate : jsoupDocument.getElementsByTag(element.tagName())) {
        if (candidate.equals(element)) {
            return index;
        }
        index++;
    }
    return -1;
}

From source file:org.asqatasun.processing.ProcessRemarkServiceImpl.java

/**
 * Produces a closed version of a truncated HTML snippet of an element.
 *
 * <p>When {@code isElementAutoClose} accepts the original HTML, it is returned
 * unchanged. Otherwise the outcome depends on
 * {@code getElementCurrentlyOpenCount} for the truncated HTML: exactly one
 * returns the result of {@code closeInnerElement}; more than one applies
 * {@code closeInnerElement} and then {@code closeElement} with the element's
 * tag name; any other count applies {@code closeElement} directly.
 *
 * @param element the element the snippet was taken from
 * @param originalElementHtml the full HTML of the element
 * @param truncatedElementHtml the truncated HTML of the element
 * @return the snippet after the closing steps described above
 */
private String properlyCloseSnippet(Element element, String originalElementHtml, String truncatedElementHtml) {
    if (isElementAutoClose(originalElementHtml)) {
        return originalElementHtml;
    }
    if (getElementCurrentlyOpenCount(truncatedElementHtml) == 1) {
        return closeInnerElement(originalElementHtml, truncatedElementHtml);
    }

    String snippet = truncatedElementHtml;
    if (getElementCurrentlyOpenCount(truncatedElementHtml) > 1) {
        snippet = closeInnerElement(originalElementHtml, truncatedElementHtml);
    }
    return closeElement(snippet, element.tagName());
}

From source file:org.asqatasun.rules.accessiweb22.Aw22Rule10071.java

/**
 * Records one focusable element and decides whether it has to be reported.
 *
 * <p>A {@code null} element is ignored. Otherwise the focusable-element
 * counter is incremented; if the tag name is in the excluded list the
 * {@code focusableElementExcluded} flag is raised, else the element is added
 * to the handler when its outline is not visible and it is not the
 * {@code BODY_ELEMENT} (compared ignoring case).
 *
 * @param element the jsoup element, may be {@code null}
 * @param domElement the DOM element passed to {@code isOutlineVisible}
 * @param elementHandler the handler collecting the reported elements
 */
private void treatFocusableElement(Element element, DomElement domElement,
        ElementHandler<Element> elementHandler) {
    if (element == null) {
        return;
    }
    nbOfFocusableElements++;

    if (getFocusableElementExcludedList().contains(element.tagName())) {
        focusableElementExcluded = true;
        return;
    }
    if (isOutlineVisible(domElement)) {
        return;
    }
    if (StringUtils.equalsIgnoreCase(element.tagName(), BODY_ELEMENT)) {
        return;
    }
    elementHandler.add(element);
}

From source file:org.asqatasun.rules.elementchecker.lang.LangChecker.java

/**
 * Builds the text of an element, optionally followed by the text of its
 * child elements.
 *
 * <p>The element's own text comes from {@code testableTextElementBuilder},
 * which is created as a {@code CompleteTextElementBuilder} on first use when
 * unset. With {@code extractRecursively}, each child that has no language
 * defined (per {@code isLangDefinedForElement}) and is not in
 * {@code EXCLUDED_ELEMENTS_LIST} is appended recursively, preceded by
 * {@code TextElementBuilder.SPACER}. Runs of spaces in the result are
 * collapsed to a single space.
 *
 * @param element the element to extract the text from
 * @param extractRecursively whether the child elements are also extracted
 * @return the extracted text, or {@code null} when the element's tag name is
 *         in {@code EXCLUDED_ELEMENTS_LIST}
 */
protected String extractTextFromElement(Element element, boolean extractRecursively) {
    if (EXCLUDED_ELEMENTS_LIST.contains(element.tagName())) {
        return null;
    }

    StringBuilder text = new StringBuilder();
    if (testableTextElementBuilder == null) {
        testableTextElementBuilder = new CompleteTextElementBuilder();
    }
    text.append(testableTextElementBuilder.buildTextFromElement(element));

    if (extractRecursively) {
        for (Element child : element.children()) {
            if (isLangDefinedForElement(child) || EXCLUDED_ELEMENTS_LIST.contains(child.tagName())) {
                continue;
            }
            text.append(TextElementBuilder.SPACER);
            text.append(extractTextFromElement(child, true));
        }
    }

    String collapsed = text.toString().replaceAll(" +", " ");
    return collapsed;
}

From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java

/**
 * /*ww  w.j  a v  a 2  s .  c  om*/
 * @param linkElement
 * @param linkText
 * @return whether the current link have a context
 */
protected boolean doesLinkHaveContext(Element linkElement, String linkText) {
    // does the current link have a title attribute? 
    if (considerTitleAsContext && linkElement.hasAttr(TITLE_ATTR)
            && !StringUtils.equalsIgnoreCase(linkElement.attr(TITLE_ATTR), linkText)) {
        return true;
    }
    if (linkElement.hasAttr(ARIA_LABEL_ATTR) && StringUtils.isNotBlank(linkElement.attr(ARIA_LABEL_ATTR))) {
        return true;
    }
    if (linkElement.hasAttr(ARIA_LABELLEDBY_ATTR)
            && StringUtils.isNotBlank(linkElement.attr(ARIA_LABELLEDBY_ATTR))) {
        return true;
    }
    // does the parent of the current link have some text?
    if (StringUtils.isNotBlank(linkElement.parent().ownText())) {
        return true;
    }
    // does the current element have a previous sibling of heading type?
    if (isOneOfPrecedingSiblingofHeadingType(linkElement)) {
        return true;
    }
    // does one of the parent of the current element have a previous sibling 
    // of heading type or is found in the PARENT_CONTEXT_ELEMENTS list?
    for (Element parent : linkElement.parents()) {
        if (PARENT_CONTEXT_ELEMENTS.contains(parent.tagName())
                || isOneOfPrecedingSiblingofHeadingType(parent)) {
            return true;
        }
    }
    return false;
}

From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java

/**
 * Walks backwards over the preceding element siblings of an element, looking
 * for one that provides a heading context.
 *
 * <p>A sibling matches when its tag name is in
 * {@code PREV_SIBLING_CONTEXT_ELEMENTS} or when selecting
 * {@code CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY} on it yields at least one
 * element.
 *
 * @param element the element whose preceding siblings are inspected
 * @return whether one of the preceding siblings is of heading type
 */
private boolean isOneOfPrecedingSiblingofHeadingType(Element element) {
    for (Element sibling = element.previousElementSibling();
            sibling != null;
            sibling = sibling.previousElementSibling()) {
        if (PREV_SIBLING_CONTEXT_ELEMENTS.contains(sibling.tagName())) {
            return true;
        }
        if (!sibling.select(CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY).isEmpty()) {
            return true;
        }
    }
    return false;
}