Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes Element attr

Introduction

This page collects usage examples for org.jsoup.nodes.Element.attr.

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:org.apdplat.extractor.html.HtmlExtractor.java

/**
 * ????/*  w ww  .ja  v  a  2s.com*/
 * @param url html?
 * @param htmlTemplate html??
 * @param doc jsoup
 * @return ?
 */
private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document doc) {
    //???
    ExtractResult extractResult = new ExtractResult();
    extractResult.setUrl(url);
    extractResult.setTableName(htmlTemplate.getTableName());
    List<CssPath> cssPaths = htmlTemplate.getCssPaths();
    //??CSS???????
    //??CSS???
    for (CssPath cssPath : cssPaths) {
        // ??CSS PATH
        Elements elements = doc.select(cssPath.getCssPath());
        // CSS??
        for (Element element : elements) {
            String text = null;
            if (StringUtils.isBlank(cssPath.getAttr())) {
                //???
                text = element.text();
            } else {
                //???
                text = element.attr(cssPath.getAttr());
            }
            if (StringUtils.isNotBlank(text)) {
                // ????
                if (cssPath.hasExtractFunction()) {
                    //CSS???
                    for (ExtractFunction pf : cssPath.getExtractFunctions()) {
                        text = ExtractFunctionExecutor.execute(text, doc, cssPath, pf.getExtractExpression());
                        if (text != null) {
                            ExtractResultItem extractResultItem = new ExtractResultItem();
                            extractResultItem.setField(pf.getFieldName());
                            extractResultItem.setValue(text);
                            extractResult.addExtractResultItem(extractResultItem);
                        } else {
                            ExtractFailLog extractFailLog = new ExtractFailLog();
                            extractFailLog.setUrl(url);
                            extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
                            extractFailLog.setTemplateName(htmlTemplate.getTemplateName());
                            extractFailLog.setCssPath(cssPath.getCssPath());
                            extractFailLog.setExtractExpression(pf.getExtractExpression());
                            extractFailLog.setTableName(htmlTemplate.getTableName());
                            extractFailLog.setFieldName(pf.getFieldName());
                            extractFailLog.setFieldDescription(pf.getFieldDescription());
                            extractResult.addExtractFailLog(extractFailLog);
                            //??????
                            //?
                            //???
                            return extractResult;
                        }
                    }
                } else {
                    //CSS?
                    ExtractResultItem extractResultItem = new ExtractResultItem();
                    extractResultItem.setField(cssPath.getFieldName());
                    extractResultItem.setValue(text);
                    extractResult.addExtractResultItem(extractResultItem);
                }
            } else {
                //??????
                ExtractFailLog extractFailLog = new ExtractFailLog();
                extractFailLog.setUrl(url);
                extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
                extractFailLog.setTemplateName(htmlTemplate.getTemplateName());
                extractFailLog.setCssPath(cssPath.getCssPath());
                extractFailLog.setExtractExpression("");
                extractFailLog.setTableName(htmlTemplate.getTableName());
                extractFailLog.setFieldName(cssPath.getFieldName());
                extractFailLog.setFieldDescription(cssPath.getFieldDescription());
                extractResult.addExtractFailLog(extractFailLog);
                //??????
                //?
                //???
                return extractResult;
            }
        }
    }
    return extractResult;
}

From source file:org.apdplat.extractor.html.impl.DefaultHtmlExtractor.java

/**
 * ??/*from   w  ww . j a v  a 2  s  .  co  m*/
 * @param url URL
 * @param html HTML
 * @return ?
 */
@Override
public List<ExtractResult> extract(String url, String html) {
    List<ExtractResult> extractResults = new ArrayList<>();
    //?URL???
    List<HtmlTemplate> htmlTemplates = extractRegular.getHtmlTemplate(url);
    if (htmlTemplates.isEmpty()) {
        LOGGER.debug("URL?" + url);
        return extractResults;
    }
    try {
        Document doc = Jsoup.parse(html);
        Elements metas = doc.select("meta");
        String keywords = "";
        String description = "";
        for (Element meta : metas) {
            String name = meta.attr("name");
            if ("keywords".equals(name)) {
                keywords = meta.attr("content");
            }
            if ("description".equals(name)) {
                description = meta.attr("content");
            }
        }
        Set<String> tableNames = new HashSet<>();
        for (HtmlTemplate htmlTemplate : htmlTemplates) {
            if (tableNames.contains(htmlTemplate.getTableName())) {
                LOGGER.debug(
                        "?tableName????UrlPattern?"
                                + htmlTemplate.getUrlPattern().getUrlPattern());
                LOGGER.debug(htmlTemplates.toString());
            }
            tableNames.add(htmlTemplate.getTableName());
            try {
                //???
                ExtractResult extractResult = extractHtmlTemplate(url, htmlTemplate, doc);
                //?URL???????
                if (!extractResult.getExtractFailLogs().isEmpty()
                        || !extractResult.getExtractResultItems().isEmpty()) {
                    extractResult.setContent(html.getBytes("utf-8"));
                    extractResult.setEncoding("utf-8");
                    extractResult.setKeywords(keywords);
                    extractResult.setDescription(description);
                    extractResults.add(extractResult);
                } else {
                    LOGGER.debug(url + " ? " + htmlTemplate.getTemplateName() + " ?");
                }
            } catch (Exception e) {
                LOGGER.error("???" + htmlTemplate.getTemplateName(), e);
            }
        }
    } catch (Exception e) {
        LOGGER.error("?: " + url, e);
    }
    return extractResults;
}

From source file:org.arb.extractor.DomTreeWalker.java

/**
 * Recursively records the {@code id} and {@code arb:id} attribute values of an element and
 * of all its descendant elements.
 *
 * @param element root of the subtree to scan
 * @param codeUnit passed through unchanged to the recursive calls; not otherwise used here
 */
private void collectIdsOnElement(Element element, AbstractCodeUnit codeUnit) {
    if (element.hasAttr("id")) {
        elementIdSet.add(element.attr("id"));
    }
    if (element.hasAttr("arb:id")) {
        arbIdSet.add(element.attr("arb:id"));
    }
    // Fetch the child list once instead of on every loop test and every child(i) lookup.
    for (Element child : element.children()) {
        collectIdsOnElement(child, codeUnit);
    }
}

From source file:org.arb.extractor.DomTreeWalker.java

/**
 * Looks up the resource id declared on an element.
 *
 * <p>The {@code arb:id} attribute is consulted first; {@code id} is used only when
 * {@code arb:id} is absent.
 *
 * @param elem the element to inspect
 * @return the value of the first of these attributes that is present, or {@code null} when
 *         neither is present
 */
private String getElementResourceId(Element elem) {
    // Ordered by priority: the first attribute found wins.
    for (String attributeKey : new String[] {"arb:id", "id"}) {
        if (elem.hasAttr(attributeKey)) {
            return elem.attr(attributeKey);
        }
    }
    return null;
}

From source file:org.asqatasun.contentadapter.css.CSSJsoupPhlocContentAdapterImpl.java

/**
 * Turns the {@code style} attribute of each inline-CSS element into a stylesheet, adapts and
 * saves it, then registers the saved ids against the current SSP.
 *
 * <p>Elements whose {@code style} attribute is blank are skipped.
 */
private void adaptInlineCSS() {
    Set<Long> savedCssIds = new HashSet<>();

    for (Element styledElement : inlineCssElements) {
        String inlineStyle = styledElement.attr("style");
        if (StringUtils.isBlank(inlineStyle)) {
            continue;
        }
        // Wrap the declarations in a rule whose selector is the element's node name.
        String cssRule = styledElement.nodeName() + "{" + inlineStyle + "}";
        Resource inlineResource = new CSSResourceImpl(cssRule, 0, new InlineRsrc());
        StylesheetContent stylesheet = getStylesheetFromInlineResource(inlineResource.getResource());
        adaptContent(stylesheet, inlineResource, getCurrentResourcePath(styledElement.baseUri()), null);
        savedCssIds.add(getContentDataService().saveOrUpdate(stylesheet).getId());
    }
    getContentDataService().saveContentRelationShip(getSSP(), savedCssIds);
}

From source file:org.asqatasun.contentadapter.css.CSSJsoupPhlocContentAdapterImpl.java

/**
 * Adapt the external css. /*from  www  .j  a v  a 2 s  .c o  m*/
 */
private void adaptExternalCss() {
    for (Element el : externalCssElements) {
        List<CSSMediaQuery> mediaList = getListOfMediaFromAttributeValue(el);
        String resourcePath = el.attr("abs:href");
        getExternalResourceAndAdapt(resourcePath, mediaList);
    }
    Set<Long> relatedCssIdSet = new HashSet<>();
    // At the end of the document we link each external css that are
    // already fetched and that have been encountered in the SSP to the SSP.
    LOGGER.debug("Found " + relatedExternalCssSet.size() + " external css in " + getSSP().getURI());
    for (StylesheetContent cssContent : relatedExternalCssSet) {
        if (cssContent.getAdaptedContent() == null) {
            cssContent.setAdaptedContent(CSS_ON_ERROR);
        }
        LOGGER.debug("Create relation between " + getSSP().getURI() + " and " + cssContent.getURI());
        // to avoid fatal error when persist weird sourceCode
        try {
            // the content is saved only when the id is null which means 
            // that the content hasn't been persisted yet. Otherwise, the
            // save is uneeded and the id is used to create the relation 
            // with the current SSP
            if (cssContent.getId() == null) {
                cssContent = (StylesheetContent) getContentDataService().saveOrUpdate(cssContent);
            }
            relatedCssIdSet.add(cssContent.getId());
        } catch (PersistenceException | DataException pe) {
            adaptedContentOnError(cssContent, relatedCssIdSet);
        }
    }
    getContentDataService().saveContentRelationShip(getSSP(), relatedCssIdSet);
}

From source file:org.asqatasun.contentadapter.css.CSSJsoupPhlocContentAdapterImpl.java

/**
 * Get the list of media from the media attribute content
 * @param mediaAttribute/*from  w w  w .  jav a 2  s. co  m*/
 * @return
 */
private List<CSSMediaQuery> getListOfMediaFromAttributeValue(Element element) {
    String mediaAttribute = element.attr("media");
    List<CSSMediaQuery> mediaTypeList = new ArrayList<>();
    if (mediaAttribute == null || StringUtils.isBlank(mediaAttribute)) {
        return mediaTypeList;
    } else {
        mediaTypeList.addAll(MediaQueryTools.parseToMediaQuery(mediaAttribute, CCharset.CHARSET_UTF_8_OBJ,
                ECSSVersion.CSS30));
    }
    return mediaTypeList;
}

From source file:org.asqatasun.processing.ProcessRemarkServiceImpl.java

/**
 * Builds a source-code remark for an element and attaches its snippet plus one evidence
 * element per entry of {@code evidenceElementList}.
 *
 * @param processResult test solution handed to the remark data service
 * @param element element the remark is about
 * @param messageCode message code handed to the remark data service
 * @return the populated remark
 */
@Override
public SourceCodeRemark createSourceCodeRemark(TestSolution processResult, Element element,
        String messageCode) {
    SourceCodeRemark sourceCodeRemark = processRemarkDataService.getSourceCodeRemark(
            element.nodeName(), processResult, messageCode, searchElementLineNumber(element));
    sourceCodeRemark.setSnippet(getSnippetFromElement(element));

    for (String evidenceName : evidenceElementList) {
        // "text" (in any case) selects the element's text; any other entry is an attribute name.
        String evidenceValue = StringUtils.equalsIgnoreCase(evidenceName, "text")
                ? element.text()
                : element.attr(evidenceName);
        sourceCodeRemark.addElement(getEvidenceElement(evidenceName, evidenceValue));
    }
    return sourceCodeRemark;
}

From source file:org.asqatasun.ruleimplementation.AbstractMarkerPageRuleImplementation.java

/**
 * Sorts the elements of the global selection by marker.
 *
 * <p>For each element the value of the "id" attribute, the "class" names and the value of
 * the "role" attribute are extracted. If one of them belongs to the marker list, the element
 * is moved from the global selection to the marker selection. Otherwise, if one of them
 * belongs to the inverse marker list, the element is only removed from the global selection.
 *
 * <p>Nothing is done when both marker lists are empty or the global selection is empty.
 */
private void sortMarkerElements() {
    if ((CollectionUtils.isEmpty(markerList) && CollectionUtils.isEmpty(inverseMarkerList))
            || selectionWithoutMarkerHandler.isEmpty()) {
        return;
    }
    Iterator<Element> iter = selectionWithoutMarkerHandler.get().iterator();
    while (iter.hasNext()) {
        Element el = iter.next();
        String id = el.id();
        Collection<String> classNames = el.classNames();
        String role = el.attr(ROLE_ATTR);
        // Only elements carrying an "id", a "class" or a "role" can match a marker.
        if (StringUtils.isNotBlank(id) || CollectionUtils.isNotEmpty(classNames)
                || StringUtils.isNotBlank(role)) {
            if (checkAttributeBelongsToMarkerList(id, classNames, role, markerList)) {
                selectionWithMarkerHandler.add(el);
                iter.remove();
            } else if (checkAttributeBelongsToMarkerList(id, classNames, role, inverseMarkerList)) {
                // "else if" matters: Iterator.remove() may be called only once per next(),
                // so an element matching both lists must not be removed a second time.
                iter.remove();
            }
        }
    }
}

From source file:org.asqatasun.rules.accessiweb22.Aw22Rule06031.java

/**
 * /* w w  w. j  a v  a2s .  c om*/
 * @param sspHandler
 * @param el
 * @param linkText
 * @return 
 */
private TestSolution testTitleAttributeLink(SSPHandler sspHandler, Element el, String linkText) {
    // if the current has no title or has an empty title or has a title 
    // content identical to the link text, returns not applicable.
    if (!el.hasAttr(TITLE_ATTR)) {
        return TestSolution.NOT_APPLICABLE;
    }
    String attrValue = el.attr(TITLE_ATTR);
    if (StringUtils.isBlank(attrValue)) {
        return TestSolution.NOT_APPLICABLE;
    }
    if (StringUtils.equalsIgnoreCase(attrValue, linkText)) {
        return TestSolution.NOT_APPLICABLE;
    }
    ElementHandler<Element> elHandler = new ElementHandlerImpl(el);
    TestSolutionHandler tsHandler = new TestSolutionHandlerImpl();
    titlePertinenceElementChecker.check(sspHandler, elHandler, tsHandler);
    return tsHandler.getTestSolution();
}