Example usage for org.jsoup.nodes Element attributes

List of usage examples for org.jsoup.nodes Element attributes

Introduction

On this page you can find example usages of org.jsoup.nodes Element attributes.

Prototype

Attributes attributes

To view the source code for org.jsoup.nodes Element attributes, click the Source Link.

Usage

From source file:com.kingfong.webcrawler.util.DOMContentUtils.java

/**
 * Collects link targets found in the supplied HTML into {@code outlinks}.
 *
 * <p>
 * Every element of the parsed document whose lower-cased tag name has an
 * entry in {@code linkParams} is inspected, and the value of the attribute
 * named by that entry's {@code attrName} (compared case-insensitively) is
 * taken as the link target. Targets starting with "/" are made absolute
 * against {@code url}; any other target is kept as written. A target is
 * added to {@code outlinks} only when {@code URLFilter.filt} returns true
 * for it.
 *
 * @param html     the markup to parse
 * @param url      the URL used to make "/"-prefixed targets absolute
 * @param outlinks the set that receives the accepted targets
 */
public void getOutlinks(String html, URL url, HashSet<String> outlinks) {

    Document document = Jsoup.parse(html);
    for (Element currentNode : document.getAllElements()) {
        LinkParams params = linkParams.get(currentNode.tagName().toLowerCase());
        if (params == null) {
            // not a link-bearing tag
            continue;
        }

        // If several attributes match (differing only in case), the last one wins.
        String target = null;
        for (Attribute attr : currentNode.attributes()) {
            if (params.attrName.equalsIgnoreCase(attr.getKey())) {
                target = attr.getValue();
            }
        }

        if (StringUtils.startsWith(target, "//")) {
            // Protocol-relative reference: it already names its own host,
            // so only the scheme of the base URL is borrowed.
            target = url.getProtocol() + ":" + target;
        } else if (StringUtils.startsWith(target, "/")) {
            // Host-relative reference: keep an explicit port of the base URL,
            // otherwise links on e.g. http://host:8080 would point at port 80.
            String hostAndPort = url.getHost();
            if (url.getPort() != -1) {
                hostAndPort = hostAndPort + ":" + url.getPort();
            }
            target = url.getProtocol() + "://" + hostAndPort + target;
        }

        if (target != null && URLFilter.filt(target)) {
            outlinks.add(target);
        }
    }
}

From source file:com.iorga.iraj.servlet.AgglomeratorServlet.java

/**
 * Walks the web-application resources below {@code scriptSrc} and, for each
 * resource path ending with {@code pathSuffix}, inserts after
 * {@code agglomerateElement} a new tag with the same tag name.
 *
 * <p>
 * The new tag copies every attribute of {@code agglomerateElement} except
 * {@code ATTRIBUTE_NAME}, {@code urlAttribute} and
 * {@code URL_ATTRIBUTE_ATTRIBUTE_NAME}, and gets {@code urlAttribute} set to
 * the resource path with {@code pathPrefix} removed from its start. Paths
 * ending with "/" are treated as directories and searched recursively. In
 * development mode the real directory behind {@code scriptSrc} is registered
 * with {@code watchService} for create and delete events.
 *
 * @param config             servlet configuration giving access to the servlet context
 * @param agglomerateElement element whose tag and attributes are replicated
 * @param scriptSrc          resource path to search
 * @param pathPrefix         prefix stripped from each resource path to build the URL
 * @param pathSuffix         suffix a resource path must have to be appended
 * @param urlAttribute       name of the attribute that receives the URL
 * @param lastModified       greatest last-modified time known so far
 * @return the greatest of {@code lastModified} and the last-modified times
 *         of all appended resources
 */
private long searchAndAppendAfter(final ServletConfig config, final Element agglomerateElement,
        final String scriptSrc, final String pathPrefix, final String pathSuffix, final String urlAttribute,
        long lastModified) throws MalformedURLException, IOException, URISyntaxException {
    if (mode == Mode.DEVELOPMENT) {
        // add a watch for that directory; getRealPath returns null when the
        // container cannot map the path to the file system, in which case
        // there is nothing to watch
        final String realPath = config.getServletContext().getRealPath(scriptSrc);
        if (realPath != null) {
            Paths.get(realPath).register(watchService, StandardWatchEventKinds.ENTRY_CREATE,
                    StandardWatchEventKinds.ENTRY_DELETE);
        }
    }
    final Set<String> childrenPaths = config.getServletContext().getResourcePaths(scriptSrc);
    if (childrenPaths == null) {
        // getResourcePaths returns null when no resource starts with scriptSrc
        return lastModified;
    }
    for (final String path : childrenPaths) {
        if (path.endsWith(pathSuffix)) {
            // add that JS
            final StringBuilder targetScript = new StringBuilder("<");
            targetScript.append(agglomerateElement.tagName());
            // copy all the origin attributes
            for (final Attribute attribute : agglomerateElement.attributes()) {
                final String key = attribute.getKey();
                if (!ATTRIBUTE_NAME.equalsIgnoreCase(key) && !urlAttribute.equalsIgnoreCase(key)
                        && !URL_ATTRIBUTE_ATTRIBUTE_NAME.equalsIgnoreCase(key)) {
                    targetScript.append(" ").append(attribute.html());
                }
            }
            // specify the src path
            final String childUrl = StringUtils.removeStart(path, pathPrefix);
            targetScript.append(" ").append(new Attribute(urlAttribute, childUrl).html()).append(" />");
            agglomerateElement.after(targetScript.toString());
            lastModified = Math.max(
                    config.getServletContext().getResource(childUrl).openConnection().getLastModified(),
                    lastModified);
        } else if (path.endsWith("/")) {
            // it's a directory, recurse search & append
            lastModified = Math.max(searchAndAppendAfter(config, agglomerateElement, path, pathPrefix,
                    pathSuffix, urlAttribute, lastModified), lastModified);
        }
    }
    return lastModified;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * pulls out videos we like/* www  .j  a  v a  2s  .c om*/
 *
 * @return
 */
private ArrayList<Element> extractVideos(Element node) {
    ArrayList<Element> candidates = new ArrayList<Element>();
    ArrayList<Element> goodMovies = new ArrayList<Element>();
    try {

        Elements embeds = node.parent().getElementsByTag("embed");
        for (Element el : embeds) {
            candidates.add(el);
        }
        Elements objects = node.parent().getElementsByTag("object");
        for (Element el : objects) {
            candidates.add(el);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("extractVideos: Starting to extract videos. Found: " + candidates.size());
        }

        for (Element el : candidates) {

            Attributes attrs = el.attributes();

            for (Attribute a : attrs) {
                try {
                    if (logger.isDebugEnabled()) {
                        logger.debug(a.getKey() + " : " + a.getValue());
                    }
                    if ((a.getValue().contains("youtube") || a.getValue().contains("vimeo"))
                            && a.getKey().equals("src")) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Found video... setting");
                            logger.debug("This page has a video!: " + a.getValue());
                        }
                        goodMovies.add(el);

                    }
                } catch (Exception e) {
                    logger.error(e.toString());
                    e.printStackTrace();
                }
            }

        }
    } catch (NullPointerException e) {
        logger.error(e.toString(), e);
    } catch (Exception e) {
        logger.error(e.toString(), e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("extractVideos:  done looking videos");
    }
    return goodMovies;
}

From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java

/**
 * Builds a new element with the same tag name and base URI as
 * {@code sourceEl} and adds each of the source element's attributes to it.
 * No child nodes are copied.
 *
 * @param sourceEl the element to take tag, base URI and attributes from
 * @return the newly created element
 */
private static Element createSafeElement(Element sourceEl) {
    final org.jsoup.nodes.Attributes copiedAttrs = new org.jsoup.nodes.Attributes();
    final Tag tag = Tag.valueOf(sourceEl.tagName());
    final Element copy = new Element(tag, sourceEl.baseUri(), copiedAttrs);

    // The new element holds a reference to copiedAttrs, so filling it
    // afterwards is visible through the element.
    for (Attribute attribute : sourceEl.attributes()) {
        copiedAttrs.put(attribute);
    }

    return copy;
}

From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java

/**
 * Returns the value of the element's {@code ALT_TAG} attribute, passed
 * through {@code StringUtils.normalizeSpace} and wrapped in square
 * brackets, or the empty string when that value is null or empty after
 * normalization.
 */
private String generateImageAlternativeText(Element element) {
    final String rawAlt = element.attributes().get(ALT_TAG);
    if (rawAlt == null) {
        return "";
    }
    final String alt = StringUtils.normalizeSpace(rawAlt);
    if (alt == null || alt.isEmpty()) {
        return "";
    }
    return "[" + alt + "]";
}

From source file:org.asqatasun.rules.elementselector.CaptchaElementSelector.java

/**
 *
 * @param element/*www.ja v  a2s .c om*/
 * @return wheter either one attribute of the current element, either its
 * text, either one attribute of one of its parent or the text of one of
 * its parents contains the "captcha" keyword
 */
private boolean parseAttributeToExtractCaptcha(Element element) {
    if (element.nodeName().equalsIgnoreCase(HTML_ELEMENT)
            || element.nodeName().equalsIgnoreCase(BODY_ELEMENT)) {
        return false;
    }
    if (StringUtils.containsIgnoreCase(element.ownText(), CAPTCHA_KEY)) {
        return true;
    } else {
        for (Attribute attr : element.attributes()) {
            if (StringUtils.containsIgnoreCase(attr.getValue(), CAPTCHA_KEY)) {
                return true;
            }
        }
    }
    return false;
}

From source file:org.dswarm.xmlenhancer.XMLEnhancer.java

/**
 * Enhances every node of the given list. For an element, its attributes
 * are enhanced and then its child nodes are processed recursively; a text
 * node is handed to {@code enhanceTextNode}. Other node types are left
 * untouched.
 *
 * @param nodes the nodes to process
 */
private static void enhanceNodes(final List<Node> nodes) {

    for (final Node node : nodes) {

        if (node instanceof Element) {

            final Element current = (Element) node;

            enhanceAttributes(current.attributes());

            // descend into the subtree of this element
            enhanceNodes(current.childNodes());

        } else if (node instanceof TextNode) {

            enhanceTextNode(node);
        }
    }
}

From source file:org.norvelle.addressdiscoverer.parse.unstructured.ForwardsFlattenedDocumentIterator.java

/**
 * Renders the element as text: one {@code key: value} line per attribute,
 * followed by the element's own text.
 *
 * @param currElement the element to render
 * @return the attribute lines followed by the own text
 */
private String extractText(Element currElement) {
    final StringBuilder text = new StringBuilder();
    for (Attribute attribute : currElement.attributes().asList()) {
        text.append(attribute.getKey());
        text.append(": ");
        text.append(attribute.getValue());
        text.append("\n");
    }
    return text.append(currElement.ownText()).toString();
}

From source file:org.opens.tanaguru.rules.elementchecker.helper.RuleCheckHelper.java

/**
 * Parses all the elements retrieved from the scope, extracts the ones
 * where the occurrence "captcha" is found among their own attribute values
 * or among the attribute values of one of their parents, and removes these
 * elements from the initial set of elements.
 *
 * <p>
 * Each extracted element appears exactly once in the result, however many
 * of its attributes or parents match.
 *
 * @param elements the elements to filter; matching elements are removed
 *                 from this collection
 * @return the elements identified as captcha-related
 */
public static Elements extractCaptchaElements(Elements elements) {
    Elements captchaElements = new Elements();
    for (Element el : elements) {
        boolean captcha = hasCaptchaKeywordAttribute(el);
        if (!captcha) {
            for (Element pel : el.parents()) {
                if (hasCaptchaKeywordAttribute(pel)) {
                    captcha = true;
                    // one matching parent is enough; looking further would
                    // only add the same element again
                    break;
                }
            }
        }
        if (captcha) {
            captchaElements.add(el);
        }
    }
    elements.removeAll(captchaElements);
    return captchaElements;
}

/** Tells whether one attribute value of the element contains the captcha keyword, ignoring case. */
private static boolean hasCaptchaKeywordAttribute(Element element) {
    for (Attribute attr : element.attributes()) {
        if (StringUtils.containsIgnoreCase(attr.getValue(), CAPTCHA_KEYWORD)) {
            return true;
        }
    }
    return false;
}

From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java

/**
 * /*from  w  w  w .j a  v a 2 s .  com*/
 * @param element
 * 
 * @return
 */
protected boolean isFunctionOutputCell(Element element) {
    boolean functionOutputCell = false;

    for (Attribute attribute : element.attributes()) {
        if (attribute.getKey().equalsIgnoreCase(DATA_GROUP_OUTPUT_ATTRIBUTE)) {
            functionOutputCell = true;
            break;
        }
    }

    return functionOutputCell;
}