Usage examples for org.jsoup.nodes.Element#attributes()
Attributes attributes
To view the source code for org.jsoup.nodes.Element#attributes(), click the Source Link.
From source file:com.kingfong.webcrawler.util.DOMContentUtils.java
/** * This method finds all anchors below the supplied DOM * <code>node</code>, and creates appropriate {@link Outlink} * records for each (relative to the supplied <code>base</code> * URL), and adds them to the <code>outlinks</code> {@link * ArrayList}.//from w w w.j a va2s.c o m * * <p> * * Links without inner structure (tags, text, etc) are discarded, as * are links which contain only single nested links and empty text * nodes (this is a common DOM-fixup artifact, at least with * nekohtml). */ public void getOutlinks(String html, URL url, HashSet<String> outlinks) { Document document = Jsoup.parse(html); Elements elements = document.getAllElements(); for (Element currentNode : elements) { String nodeName = currentNode.tagName(); // short nodeType = currentNode.; Elements children = currentNode.children(); nodeName = nodeName.toLowerCase(); LinkParams params = linkParams.get(nodeName); if (params != null) { // if (!shouldThrowAwayLink(currentNode, children, childLen, // params)) { // StringBuilder linkText = new StringBuilder(); // getText(linkText, currentNode, true); Attributes attrs = currentNode.attributes(); String target = null; boolean noFollow = false; boolean post = false; Iterator<Attribute> iterator = attrs.iterator(); while (iterator.hasNext()) { Attribute attr = iterator.next(); String attrName = attr.getKey(); if (params.attrName.equalsIgnoreCase(attrName)) { target = attr.getValue(); } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getValue())) { noFollow = true; } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getValue())) { post = true; } } if (StringUtils.startsWith(target, "/")) { target = url.getProtocol() + "://" + url.getHost() + target; } if (target != null && URLFilter.filt(target)) { outlinks.add(target); } // } // this should not have any children, skip them if (params.childLen == 0) continue; } } }
From source file:com.iorga.iraj.servlet.AgglomeratorServlet.java
private long searchAndAppendAfter(final ServletConfig config, final Element agglomerateElement, final String scriptSrc, final String pathPrefix, final String pathSuffix, final String urlAttribute, long lastModified) throws MalformedURLException, IOException, URISyntaxException { if (mode == Mode.DEVELOPMENT) { // add a watch for that directory final Path path = Paths.get(config.getServletContext().getRealPath(scriptSrc)); path.register(watchService, StandardWatchEventKinds.ENTRY_CREATE, StandardWatchEventKinds.ENTRY_DELETE); }//from w w w . j a v a 2 s .c om final Set<String> childrenPaths = config.getServletContext().getResourcePaths(scriptSrc); for (final String path : childrenPaths) { if (path.endsWith(pathSuffix)) { // add that JS final StringBuilder targetScript = new StringBuilder("<"); targetScript.append(agglomerateElement.tagName()); // copy all the origin attributes for (final Attribute attribute : agglomerateElement.attributes()) { final String key = attribute.getKey(); if (!ATTRIBUTE_NAME.equalsIgnoreCase(key) && !urlAttribute.equalsIgnoreCase(key) && !URL_ATTRIBUTE_ATTRIBUTE_NAME.equalsIgnoreCase(key)) { targetScript.append(" ").append(attribute.html()); } } // specify the src path final String childUrl = StringUtils.removeStart(path, pathPrefix); targetScript.append(" ").append(new Attribute(urlAttribute, childUrl).html()).append(" />"); agglomerateElement.after(targetScript.toString()); lastModified = Math.max( config.getServletContext().getResource(childUrl).openConnection().getLastModified(), lastModified); } else if (path.endsWith("/")) { // it's a directory, recurse search & append lastModified = Math.max(searchAndAppendAfter(config, agglomerateElement, path, pathPrefix, pathSuffix, urlAttribute, lastModified), lastModified); } } return lastModified; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * pulls out videos we like/* www .j a v a 2s .c om*/ * * @return */ private ArrayList<Element> extractVideos(Element node) { ArrayList<Element> candidates = new ArrayList<Element>(); ArrayList<Element> goodMovies = new ArrayList<Element>(); try { Elements embeds = node.parent().getElementsByTag("embed"); for (Element el : embeds) { candidates.add(el); } Elements objects = node.parent().getElementsByTag("object"); for (Element el : objects) { candidates.add(el); } if (logger.isDebugEnabled()) { logger.debug("extractVideos: Starting to extract videos. Found: " + candidates.size()); } for (Element el : candidates) { Attributes attrs = el.attributes(); for (Attribute a : attrs) { try { if (logger.isDebugEnabled()) { logger.debug(a.getKey() + " : " + a.getValue()); } if ((a.getValue().contains("youtube") || a.getValue().contains("vimeo")) && a.getKey().equals("src")) { if (logger.isDebugEnabled()) { logger.debug("Found video... setting"); logger.debug("This page has a video!: " + a.getValue()); } goodMovies.add(el); } } catch (Exception e) { logger.error(e.toString()); e.printStackTrace(); } } } } catch (NullPointerException e) { logger.error(e.toString(), e); } catch (Exception e) { logger.error(e.toString(), e); } if (logger.isDebugEnabled()) { logger.debug("extractVideos: done looking videos"); } return goodMovies; }
From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java
private static Element createSafeElement(Element sourceEl) { String sourceTag = sourceEl.tagName(); org.jsoup.nodes.Attributes destAttrs = new org.jsoup.nodes.Attributes(); Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); org.jsoup.nodes.Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { destAttrs.put(sourceAttr);// w ww .j a v a 2s . com } return dest; }
From source file:org.apache.james.jmap.utils.JsoupHtmlTextExtractor.java
/**
 * Produces the textual replacement for an image: the whitespace-normalised value of its
 * {@code ALT_TAG} attribute wrapped in square brackets, or an empty string when that value
 * is absent or empty after normalisation.
 *
 * @param element the image element
 * @return {@code "[alt text]"} or {@code ""}
 */
private String generateImageAlternativeText(Element element) {
    String alternative = element.attributes().get(ALT_TAG);
    if (alternative == null) {
        return "";
    }
    String normalized = StringUtils.normalizeSpace(alternative);
    if (normalized == null || normalized.isEmpty()) {
        return "";
    }
    return "[" + normalized + "]";
}
From source file:org.asqatasun.rules.elementselector.CaptchaElementSelector.java
/** * * @param element/*www.ja v a2s .c om*/ * @return wheter either one attribute of the current element, either its * text, either one attribute of one of its parent or the text of one of * its parents contains the "captcha" keyword */ private boolean parseAttributeToExtractCaptcha(Element element) { if (element.nodeName().equalsIgnoreCase(HTML_ELEMENT) || element.nodeName().equalsIgnoreCase(BODY_ELEMENT)) { return false; } if (StringUtils.containsIgnoreCase(element.ownText(), CAPTCHA_KEY)) { return true; } else { for (Attribute attr : element.attributes()) { if (StringUtils.containsIgnoreCase(attr.getValue(), CAPTCHA_KEY)) { return true; } } } return false; }
From source file:org.dswarm.xmlenhancer.XMLEnhancer.java
/**
 * Enhances every node of the given list, descending depth-first into elements.
 *
 * <p>For an {@link Element}, its attributes are passed to {@code enhanceAttributes} and its
 * child nodes are processed recursively. A {@link TextNode} is passed to
 * {@code enhanceTextNode}. Any other node type is left alone.
 *
 * @param nodes the nodes to process
 */
private static void enhanceNodes(final List<Node> nodes) {
    nodes.forEach(current -> {
        if (current instanceof Element) {
            final Element asElement = (Element) current;
            enhanceAttributes(asElement.attributes());
            enhanceNodes(asElement.childNodes());
        } else if (current instanceof TextNode) {
            enhanceTextNode(current);
        }
    });
}
From source file:org.norvelle.addressdiscoverer.parse.unstructured.ForwardsFlattenedDocumentIterator.java
/**
 * Renders an element as text: one {@code "key: value"} line per attribute, followed by the
 * element's own text.
 *
 * @param currElement the element to render
 * @return the attribute lines followed by the own text
 */
private String extractText(Element currElement) {
    StringBuilder text = new StringBuilder();
    for (Attribute attribute : currElement.attributes().asList()) {
        text.append(attribute.getKey());
        text.append(": ");
        text.append(attribute.getValue());
        text.append("\n");
    }
    return text.append(currElement.ownText()).toString();
}
From source file:org.opens.tanaguru.rules.elementchecker.helper.RuleCheckHelper.java
/**
 * Extracts the captcha-related elements from the given collection.
 *
 * <p>An element is captcha-related when one of its own attribute values, or an attribute
 * value of one of its parents, contains {@code CAPTCHA_KEYWORD} (ignoring case). Such
 * elements are removed from {@code elements} and returned; each occurrence in the input is
 * returned exactly once, however many of its attributes or parents match.
 *
 * @param elements the elements to scan; captcha-related elements are removed from it
 * @return the captcha-related elements, in encounter order
 */
public static Elements extractCaptchaElements(Elements elements) {
    Elements captchaElements = new Elements();
    for (Element el : elements) {
        if (hasCaptchaAttributeValue(el) || hasParentWithCaptchaAttributeValue(el)) {
            captchaElements.add(el);
        }
    }
    elements.removeAll(captchaElements);
    return captchaElements;
}

/** Tells whether one attribute value of the element contains the captcha keyword. */
private static boolean hasCaptchaAttributeValue(Element element) {
    for (Attribute attr : element.attributes()) {
        if (StringUtils.containsIgnoreCase(attr.getValue(), CAPTCHA_KEYWORD)) {
            return true;
        }
    }
    return false;
}

/** Tells whether one parent of the element has an attribute value containing the captcha keyword. */
private static boolean hasParentWithCaptchaAttributeValue(Element element) {
    for (Element parent : element.parents()) {
        if (hasCaptchaAttributeValue(parent)) {
            return true;
        }
    }
    return false;
}
From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java
/** * /*from w w w .j a v a 2 s . com*/ * @param element * * @return */ protected boolean isFunctionOutputCell(Element element) { boolean functionOutputCell = false; for (Attribute attribute : element.attributes()) { if (attribute.getKey().equalsIgnoreCase(DATA_GROUP_OUTPUT_ATTRIBUTE)) { functionOutputCell = true; break; } } return functionOutputCell; }