List of usage examples for org.jsoup.nodes Element select
public Elements select(String cssQuery)
From source file:net.trustie.model.SFProject_Model.java
private void extractPageBluesteelUser(Document doc) { // if the project type is // bluesteel/* ww w . j av a2 s. c o m*/ // name Elements nameElements = doc.select("div#project-header section#project-title h1[itemprop=name]"); if (nameElements.size() > 0) { name = nameElements.get(0).text(); } // maintainers Elements maintainersElements = doc.select("div#project-header section#project-title p#maintainers a"); maintainers = maintainersElements.text(); // stars Elements starElements = doc.select( "article#project section#main-content section#call-to-action section#counts-sharing section.project-info section.content a[title=Browse reviews]"); if (starElements.size() > 0) { String strStar = starElements.get(0).text(); strStar = strStar.replaceAll("[^\\d\\.]", ""); stars = Float.parseFloat(strStar); } // downloadCount Elements downloadElements = doc.select( "article#project section#main-content section#call-to-action section#counts-sharing section#download-stats section.content a[title=Downloads This Week]"); if (downloadElements.size() > 0) { String strDownloadCount = downloadElements.get(0).text(); strDownloadCount = strDownloadCount.replaceAll("[^\\d]", ""); downloadCount = strDownloadCount; } // last update Elements lastUpdateElements = doc.select( "article#project section#main-content section#call-to-action section#counts-sharing section#last-updated section.content time.dateUpdated"); if (lastUpdateElements.size() > 0) { lastUpdate = lastUpdateElements.get(0).attr("datetime"); } // platform Elements platformElements = doc.select( "article#project section#main-content section#call-to-action section#download_button section.project-info"); if (platformElements.size() > 0) { platform = platformElements.text(); } // desc Elements descElements = doc.select("article#project section#main-content section#project-description p"); desc = descElements.text(); // categories Elements categoriesElements = doc.select( "article#project section#main-content 
section#project-categories-and-license div.project-container section:has(header:contains(Categories)) a"); categories = categoriesElements.text(); // license Elements licenseElements = doc.select( "article#project section#main-content section#project-categories-and-license div.project-container section:has(section.project-info header:contains(License)"); if (licenseElements.size() > 0) { license = licenseElements.get(0).select("section.project-info section.content").text(); } // license = licenseElements.html(); // feature Elements featureElements = doc.select( "article#project section#main-content section#project-features div[class=content editable]"); feature = featureElements.text(); // language+intended audience+user interface+programming // language+registered time Elements addtionalElements = doc.select( "article#project section#main-content section#project-additional-trove div.project-container section.project-info"); // System.out.println(addtionalElements.html()); for (int i = 0; i < addtionalElements.size(); i++) { Element element = addtionalElements.get(i); // System.out.println(element.html()); // System.out.println("*************************************"); Elements tags = element.select("header"); if (tags.size() > 0) { String tag = tags.text(); if (tag.equals("Languages")) { language = element.select("section.content").text(); } else if (tag.equals("Intended Audience")) { intendedAudience = element.select("section.content").text(); } else if (tag.equals("User Interface")) { userInterface = element.select("section.content").text(); } else if (tag.equals("Programming Language")) { programmingLanguage = element.select("section.content").text(); } else if (tag.equals("Registered")) { registeredTime = element.select("section.content").text(); } else { } } } }
From source file:nl.detoren.ijsco.io.OSBOLoader.java
private Spelers load(Document doc) { Spelers spelers = new Spelers(); int knsbnummer = 0; int knsbrating = 0; int osborating = 0; String vereniging = ""; int geboortejaar = 0; String categorie = ""; String naam = ""; Element table = doc.select("table").first(); Elements rows = table.select("tr"); for (Element row : rows) { Elements cells = row.select("td"); if (cells.size() > 7) { try { naam = cells.get(1).text(); } catch (Exception e) { naam = null;//from w w w .jav a 2 s .c o m System.out.println(e); } try { knsbnummer = Integer.parseInt(cells.get(8).text()); } catch (Exception e) { knsbnummer = 0; System.out.println(e); } try { osborating = Integer.parseInt(cells.get(3).text()); } catch (Exception e) { osborating = -1; System.out.println(e); } try { knsbrating = Integer.parseInt(cells.get(4).text()); } catch (Exception e) { knsbrating = -1; System.out.println(e); } try { vereniging = cells.get(2).text(); } catch (Exception e) { vereniging = ""; System.out.println(e); } try { geboortejaar = Integer.parseInt(cells.get(6).text()); } catch (Exception e) { geboortejaar = -1; System.out.println(e); } try { categorie = cells.get(7).text(); } catch (Exception e) { categorie = "-"; System.out.println(e); } Speler s = new Speler(knsbnummer, naam, vereniging, geboortejaar, categorie, osborating, knsbrating); spelers.add(s); } } return spelers; }
From source file:org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java
/** * ????/*from ww w. j a v a 2 s. c om*/ * @param elements * @return */ protected String getExtractText(Elements elements) { if (elements.size() == 0) return null; String temp = ""; if (attr.equalsIgnoreCase("tostring")) { return temp = elements.toString(); } else { if (index == -1 && StringUtils.isNotBlank(this.regex)) { for (Element e : elements) { Element element = e; if (element.select(this.regex).size() > 0) { return temp = e.text(); } } return temp; } else { if (index > -1 && index < elements.size()) { return elements.get(index).text(); } } return elements.first().text(); } /*if(attr.equals("tostring")){ if(index==0 || index>elements.size()) temp = elements.first().toString(); else temp = elements.get(index).toString(); }else{ if(index==0 || index>elements.size()) temp = elements.first().text(); else temp = elements.get(index).text(); } if(null!=pattern){ Matcher m = pattern.matcher(temp); if(m.find()){ temp = m.group(1); } }*/ //return temp; }
From source file:org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java
/** * ??????//from w w w. ja v a 2 s . co m * @param elements * @param attr * @return */ protected String getExtractAttr(Elements elements, String attr) { String temp = ""; if (attr.equalsIgnoreCase("tostring")) { return temp = elements.attr(attr).toString(); } else { if (index == -1 && StringUtils.isNotBlank(this.regex)) { for (Element e : elements) { Element element = e; if (element.select(this.regex).size() > 0) { return temp = e.attr(attr); } } return temp; } else { if (index > -1 && index < elements.size()) { return elements.get(index).attr(attr); } } return elements.first().attr(attr); } /*if(null!=pattern){ Matcher m = pattern.matcher(temp); if(m.find()){ temp = m.group(1); } }*/ //return temp; }
From source file:org.aliuge.crawler.jobconf.ExtractConfig.java
/** * ????// w ww . ja v a 2s. co m * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) { Elements extractElement = doc.select("extract"); super.setJobName(doc.select("job").attr("name")); super.setIndexName(doc.select("job").attr("indexName")); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } super.setExtractConfig(this); this.templates.add(extractTemplate); } //super.setExtractConfig(this); return this; }
From source file:org.apache.archiva.web.docs.RestDocsServlet.java
@Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { logger.debug("docs request to path: {}", req.getPathInfo()); String path = StringUtils.removeStart(req.getPathInfo(), "/"); InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(path); if (StringUtils.endsWith(path, ".xsd")) { StringEscapeUtils.escapeXml(resp.getWriter(), IOUtils.toString(is)); //IOUtils.copy( is, resp.getOutputStream() ); return;//from w w w . j ava2 s .com } String startPath = StringUtils.substringBefore(path, "/"); // replace all links !! Document document = Jsoup.parse(is, "UTF-8", ""); Element body = document.body().child(0); Elements links = body.select("a[href]"); for (Element link : links) { link.attr("href", "#" + startPath + "/" + link.attr("href")); } Elements datalinks = body.select("[data-href]"); for (Element link : datalinks) { link.attr("data-href", "#" + startPath + "/" + link.attr("data-href")); } Elements codes = body.select("code"); for (Element code : codes) { code.attr("class", code.attr("class") + " nice-code"); } //default generated enunciate use h1/h2/h3 which is quite big so transform to h3/h4/h5 Elements headers = body.select("h1"); for (Element header : headers) { header.tagName("h3"); } headers = body.select("h2"); for (Element header : headers) { header.tagName("h4"); } headers = body.select("h3"); for (Element header : headers) { header.tagName("h5"); } Document res = new Document(""); res.appendChild(body.select("div[id=main]").first()); Elements scripts = body.select("script"); for (Element script : scripts) { res.appendChild(script); } resp.getOutputStream().write(res.outerHtml().getBytes()); }
From source file:org.apdplat.superword.extract.SentenceExtractor.java
public static Map<String, String> parse(String html) { Map<String, String> sentences = new HashMap<>(); try {/*from w ww . j a v a 2 s.c o m*/ for (Element element : Jsoup.parse(html).select(SENTENCE_CSS_PATH)) { String en = null; String cn = null; Elements elements = element.select(EN_CSS_PATH); if (elements.size() == 1) { en = elements.get(0).text().trim(); LOGGER.info("???:" + en); if (en.split("\\s+").length < 2) { LOGGER.debug("???"); continue; } } elements = element.select(CN_CSS_PATH); if (elements.size() == 1) { cn = elements.get(0).text().trim(); LOGGER.info("???:" + cn); } if (StringUtils.isNotBlank(en) && StringUtils.isNotBlank(cn)) { sentences.put(en, cn); //? TextAnalyzer.seg(en).forEach(w -> { Word word = new Word(w, ""); WORD_FREQUENCE.putIfAbsent(word, new AtomicInteger()); WORD_FREQUENCE.get(word).incrementAndGet(); }); } } } catch (Exception e) { LOGGER.error("???", e); } return sentences; }
From source file:org.apdplat.superword.extract.SynonymAntonymExtractor.java
/** * ????/*w w w. j ava2s .c o m*/ * @param html * @return */ public static SynonymAntonym parseSynonymAntonym(String html, String word) { SynonymAntonym synonymAntonym = new SynonymAntonym(); synonymAntonym.setWord(new Word(word, "")); try { for (Element element : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)) { String type = element.select(TYPE).text().trim(); LOGGER.debug("type:" + type); Elements elements = element.select(WORDS); for (Element ele : elements) { String w = ele.text().trim(); LOGGER.debug("word:" + w); if (StringUtils.isNotBlank(w)) { switch (type) { case "??": synonymAntonym.addSynonym(new Word(w, "")); break; case "???": synonymAntonym.addAntonym(new Word(w, "")); break; default: LOGGER.error("???????" + type); } } else { LOGGER.error("??????" + word); } } } LOGGER.info("??????" + synonymAntonym); } catch (Exception e) { LOGGER.error("??????", e); } return synonymAntonym; }
From source file:org.apdplat.superword.tools.Definition.java
public static List<String> parseDefinitionForWebster(String html, String cssPath) { List<String> list = new ArrayList<>(); try {//from w ww. j a va 2s. co m for (Element element : Jsoup.parse(html) .select("div.tense-box.quick-def-box.simple-def-box.card-box.def-text div.inner-box-wrapper")) { StringBuilder definition = new StringBuilder(); String partOfSpeech = element.select("div.word-attributes span.main-attr em").text().trim(); for (Element defElement : element.select( "div.definition-block.def-text ul.definition-list.no-count li p.definition-inner-item span")) { String def = defElement.text().trim(); if (def.length() < 3) { continue; } if (Character.isAlphabetic(def.charAt(0))) { def = ": " + def; } else { int index = 0; while (!Character.isAlphabetic(def.charAt(++index))) { // } def = ": " + def.substring(index); } definition.append(partOfSpeech).append(" ").append(def); list.add(definition.toString()); definition.setLength(0); } } } catch (Exception e) { LOGGER.error("?", e); } return list; }
From source file:org.apdplat.superword.tools.Definition.java
/**
 * Extracts the definitions from an Oxford dictionary page.
 *
 * <p>Each result has the form {@code "<part of speech> <sequence> <definition>"}, with
 * a single trailing colon removed from the definition text. Any exception is logged
 * and the definitions collected so far are returned.
 *
 * @param html    page source
 * @param cssPath not used by this method
 * @return the definitions found, possibly empty
 */
public static List<String> parseDefinitionForOxford(String html, String cssPath) {
    List<String> definitions = new ArrayList<>();
    try {
        for (Element senseGroup : Jsoup.parse(html).select("section.se1.senseGroup")) {
            String partOfSpeech = senseGroup.select("span.partOfSpeech").text().trim();

            for (Element sense : senseGroup.select("div.senseInnerWrapper")) {
                String seq = sense.select("span.iteration").text().trim();
                String def = sense.select("span.definition").text().trim();
                if (def.endsWith(":")) {
                    def = def.substring(0, def.length() - 1);
                }
                definitions.add(partOfSpeech + " " + seq + " " + def);
            }
        }
    } catch (Exception e) {
        LOGGER.error("?", e);
    }
    return definitions;
}