List of usage examples for the org.jsoup.nodes.Element method attr
public String attr(String attributeKey)
From source file:com.astamuse.asta4d.render.RenderUtil.java
private final static void apply(Element target, List<Renderer> rendererList, RenderAction renderAction, int startIndex, int count) { // The renderer list have to be applied recursively because the // transformer will always return a new Element clone. if (startIndex >= count) { return;/*www .j a v a2s .c om*/ } final Renderer currentRenderer = rendererList.get(startIndex); RendererType rendererType = currentRenderer.getRendererType(); switch (rendererType) { case GO_THROUGH: apply(target, rendererList, renderAction, startIndex + 1, count); return; /* case DEBUG: currentRenderer.getTransformerList().get(0).invoke(target); apply(target, rendererList, renderAction, startIndex + 1, count); return; */ case RENDER_ACTION: ((RenderActionRenderer) currentRenderer).getStyle().apply(renderAction); apply(target, rendererList, renderAction, startIndex + 1, count); return; default: // do nothing break; } String selector = currentRenderer.getSelector(); List<Transformer<?>> transformerList = currentRenderer.getTransformerList(); List<Element> elemList; if (PSEUDO_ROOT_SELECTOR.equals(selector)) { elemList = new LinkedList<Element>(); elemList.add(target); } else { elemList = new ArrayList<>(target.select(selector)); } if (elemList.isEmpty()) { if (rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER) { elemList.add(target); transformerList.clear(); transformerList.add( new RendererTransformer(((ElementNotFoundHandler) currentRenderer).alternativeRenderer())); } else if (renderAction.isOutputMissingSelectorWarning()) { String creationInfo = currentRenderer.getCreationSiteInfo(); if (creationInfo == null) { creationInfo = ""; } else { creationInfo = " at [ " + creationInfo + " ]"; } logger.warn( "There is no element found for selector [{}]{}, if it is deserved, try Renderer#disableMissingSelectorWarning() " + "to disable this message and Renderer#enableMissingSelectorWarning could enable this warning again in " + "your renderer chain", selector, creationInfo); apply(target, 
rendererList, renderAction, startIndex + 1, count); return; } } else { if (rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER) { apply(target, rendererList, renderAction, startIndex + 1, count); return; } } Element delayedElement = null; Element resultNode; // TODO we suppose that the element is listed as the order from parent // to children, so we reverse it. Perhaps we need a real order process // to ensure the wanted order. Collections.reverse(elemList); boolean renderForRoot; for (Element elem : elemList) { renderForRoot = PSEUDO_ROOT_SELECTOR.equals(selector) || rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER; if (!renderForRoot) { // faked group node will be not applied by renderers(only when the current selector is not the pseudo :root) if (elem.tagName().equals(ExtNodeConstants.GROUP_NODE_TAG) && ExtNodeConstants.GROUP_NODE_ATTR_TYPE_FAKE .equals(elem.attr(ExtNodeConstants.GROUP_NODE_ATTR_TYPE))) { continue; } } if (elem == target) { delayedElement = elem; continue; } for (Transformer<?> transformer : transformerList) { resultNode = transformer.invoke(elem); elem.before(resultNode); } // for transformer elem.remove(); } // for element // if the root element is one of the process targets, we can not apply // the left renderers to original element because it will be replaced by // a new element even it is not necessary (that is how Transformer // works). if (delayedElement == null) { apply(target, rendererList, renderAction, startIndex + 1, count); } else { if (rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER && delayedElement instanceof Document) { delayedElement = delayedElement.child(0); } for (Transformer<?> transformer : transformerList) { resultNode = transformer.invoke(delayedElement); delayedElement.before(resultNode); apply(resultNode, rendererList, renderAction, startIndex + 1, count); } // for transformer delayedElement.remove(); } }
From source file:jobhunter.infoempleo.Client.java
/**
 * Reads the {@code title} attribute of the element whose id is
 * {@code ctl00_CPH_Body_Logo_Empresa}.
 *
 * @param doc the document to search
 * @return the attribute value, or an empty string when no element has that id
 */
private String getCompany(final Document doc) {
    final Element logo = doc.getElementById("ctl00_CPH_Body_Logo_Empresa");
    if (logo == null) {
        return "";
    }
    return logo.attr("title");
}
From source file:com.astamuse.asta4d.web.form.field.SimpleFormFieldValueRenderer.java
/**
 * Builds a renderer that appends {@code display:none} to the {@code style} attribute of the
 * elements selected by {@code targetSelector}.
 *
 * <p>The missing-selector warning is disabled before the setter is added and enabled again
 * afterwards.
 *
 * @param targetSelector selector of the elements to hide
 * @return the renderer chain
 */
protected Renderer hideTarget(final String targetSelector) {
    final Renderer quietRenderer = Renderer.create().disableMissingSelectorWarning();
    final ElementSetter hider = new ElementSetter() {
        @Override
        public void set(Element elem) {
            final String current = elem.attr("style");
            final String trimmed = (current == null) ? null : current.trim();

            final String hidden;
            if (StringUtils.isEmpty(trimmed)) {
                hidden = "display:none";
            } else {
                // only insert a separator when the existing declarations do not end with one
                final String separator = trimmed.endsWith(";") ? "" : ";";
                hidden = trimmed + separator + "display:none";
            }
            elem.attr("style", hidden);
        }
    };
    return quietRenderer.add(targetSelector, hider).enableMissingSelectorWarning();
}
From source file:ch.admin.hermes.etl.load.HermesOnlineCrawler.java
/** * Liefert die URL's zu den Vorlagen /*from w w w .j a v a 2 s . c o m*/ * @param scenario Szenario * @return * @throws Exception Allgemeiner I/O Fehler */ public String[] getTemplatesURL(String scenario) throws Exception { ArrayList<String> s = new ArrayList<String>(); HttpGet get = new HttpGet(url + scenario_prefix + scenario + templates); HttpResponse response = httpClient.execute(get); HttpEntity entity = response.getEntity(); String pageHTML = EntityUtils.toString(entity); EntityUtils.consume(entity); Document document = Jsoup.parse(pageHTML); Elements elements = document.getElementsByAttribute("href"); for (Element e : elements) { String attr = e.attr("href"); if (attr.endsWith(".docx") || attr.endsWith(".xlsx") || attr.endsWith(".pptx")) s.add(url + scenario_prefix + scenario + templates + attr); } return (s.toArray(new String[s.size()])); }
From source file:com.crosstreelabs.cognitio.gumshoe.format.HtmlFormatHandler.java
@Override public void processLinks(final Visit visit) { try {/*w w w.ja v a 2 s . c o m*/ String charset = StringUtils.defaultIfBlank(visit.contentCharset, "UTF-8"); Document doc = Jsoup.parse(visit.contentStream, charset, visit.result.location); Elements anchors = doc.getElementsByTag("a"); for (Element e : anchors) { String url = stripURLFragmentIdentifier(e.attr("abs:href")); String uri = stripURLFragmentIdentifier(e.attr("href").toLowerCase()); if (uri.isEmpty() || url.isEmpty() || uri.contains("javascript:") || uri.contains("mailto:") || uri.contains("@")) { continue; } visit.discoveredLinks.add(URL.parse(url).toString()); // TODO Need to add the link text as the title } visit.contentStream.reset(); } catch (GalimatiasParseException | IOException ex) { throw new RuntimeException(ex); } }
From source file:ch.admin.hermes.etl.load.HermesOnlineCrawler.java
/** * Liefert alle Szenarion URL's /*www.ja v a2 s . co m*/ * @return * @throws Exception Allgemeiner I/O Fehler */ public String[] getScenarios() throws Exception { ArrayList<String> s = new ArrayList<String>(); HttpGet get = new HttpGet(url + scenarios); try { HttpResponse response = httpClient.execute(get); HttpEntity entity = response.getEntity(); String pageHTML = EntityUtils.toString(entity); EntityUtils.consume(entity); Document document = Jsoup.parse(pageHTML); Elements elements = document.getElementsByAttribute("href"); for (Element e : elements) { if (e.attr("href").startsWith("/szenarien")) { String attr = e.attr("href").substring(scenario_prefix.length()); attr = attr.substring(0, attr.lastIndexOf('/')); s.add(attr); } } } catch (Exception e) { JOptionPane.showMessageDialog(null, "Keine Online Verbindung mglich. Bitte Szenario manuell downloaden, entpacken und bei XMl Model eintragen.", "Keine Verbindung zu http://www.hermes.admin.ch", JOptionPane.WARNING_MESSAGE); } return (s.toArray(new String[s.size()])); }
From source file:org.javiermoreno.torrentscratcher.Runner.java
/**
 * Downloads one listing page and collects the {@code href} of every {@code a.nombre} link on it.
 *
 * @param page page number substituted for the {@code {page}} placeholder of the listing URL
 * @return the {@code href} values in document order
 * @throws IOException if the page cannot be fetched
 */
public List<String> getRecordsUrl(int page) throws IOException {
    final String listingTemplate = "http://www.elitetorrent.net/categoria/13/peliculas-hdrip/modo:listado/orden:valoracion/pag:{page}";
    final String listingUrl = listingTemplate.replace("{page}", String.valueOf(page));

    final Elements nameLinks = Jsoup.connect(listingUrl).get().select("a.nombre");

    final List<String> hrefs = new ArrayList<>();
    for (int i = 0; i < nameLinks.size(); i++) {
        hrefs.add(nameLinks.get(i).attr("href"));
    }
    return hrefs;
}
From source file:com.webcrawler.MailCrawlerService.java
/**
 * Gets the absolute mail urls.
 *
 * <p>Each link element's absolute {@code href} is fetched, the anchors of the fetched page are
 * filtered with the predicate built from {@code searchToken}, and the absolute {@code href} of
 * every anchor that passes is returned.
 *
 * @param linkElements the link elements
 * @param searchToken the search token
 * @return the absolute mail urls
 * @throws IOException Signals that an I/O exception has occurred.
 */
private List<String> getAbsoluteMailUrls(Elements linkElements, String searchToken) throws IOException {
    final List<Element> acceptedAnchors = new ArrayList<Element>();
    for (Element link : linkElements) {
        final String pageUrl = link.attr("abs:href");
        final Elements pageAnchors = getLinkElements(Jsoup.connect(pageUrl).get(), "a");
        CollectionUtils.select(pageAnchors, getLinkFilterPredicate(getRegexMailUrlPattern(searchToken)),
                acceptedAnchors);
    }

    final List<String> mailUrls = new ArrayList<String>();
    for (int i = 0; i < acceptedAnchors.size(); i++) {
        mailUrls.add(acceptedAnchors.get(i).attr("abs:href"));
    }

    if (log.isDebugEnabled()) {
        log.debug("Absolute URL List: " + mailUrls.toString());
    }
    return mailUrls;
}
From source file:com.abixen.platform.core.service.impl.LayoutServiceImpl.java
/**
 * Converts an HTML layout into its JSON description.
 *
 * <p>Every element with class {@code row} becomes one row entry; within a row, every element
 * with class {@code column} becomes one column entry whose value is the element's
 * {@code class} attribute with everything up to and including the first space removed.
 *
 * @param htmlString the layout markup
 * @return a JSON object of the form {@code {"rows":[...]}}
 */
@Override
public String htmlLayoutToJson(String htmlString) {
    log.debug("htmlLayoutToJson() - htmlString: " + htmlString);

    Document doc = Jsoup.parse(htmlString);
    Elements htmlRows = doc.getElementsByClass("row");
    List<LayoutRowUtil> rowUtilList = new ArrayList<>();

    for (Element row : htmlRows) {
        // Query the row element directly. Serializing it and re-parsing it as a standalone
        // document is redundant and can drop context-dependent tags (e.g. table rows/cells).
        Elements htmlColumns = row.getElementsByClass("column");
        List<LayoutColumnUtil> columnUtilList = new ArrayList<>();

        for (Element column : htmlColumns) {
            String styleClass = column.attr("class");
            // indexOf returns -1 when there is no space, which keeps the whole value
            columnUtilList.add(new LayoutColumnUtil(styleClass.substring(styleClass.indexOf(" ") + 1)));
        }
        rowUtilList.add(new LayoutRowUtil(columnUtilList));
    }

    return "{\"rows\":" + new Gson().toJson(rowUtilList) + "}";
}
From source file:com.webcrawler.MailCrawlerService.java
/** * Gets the link filter predicate./*from w w w .j a v a2s . co m*/ * * @param shouldVisitPattern the should visit pattern * @return the link filter predicate */ private Predicate getLinkFilterPredicate(final String shouldVisitPattern) { return new Predicate() { public boolean evaluate(Object arg0) { Pattern pattern = Pattern.compile(shouldVisitPattern); Element linkElement = (Element) arg0; String absoluteUrl = linkElement.attr("abs:href"); Matcher matcher = pattern.matcher(absoluteUrl); if (matcher.find()) { if (MailCrawlerService.log.isDebugEnabled()) { MailCrawlerService.log.debug("Should be visited: " + absoluteUrl); } return true; } if (MailCrawlerService.log.isDebugEnabled()) { MailCrawlerService.log.debug("Should not be visited: " + absoluteUrl); } return false; } }; }