List of usage examples for org.jsoup.nodes.Element#select
public Elements select(String cssQuery)
From source file:org.apdplat.superword.tools.SentenceExtractor.java
public static Map<String, String> parse(String html) { Map<String, String> sentences = new HashMap<>(); try {/* ww w. j a v a 2s .c o m*/ for (Element element : Jsoup.parse(html).select(SENTENCE_CSS_PATH)) { String en = null; String cn = null; Elements elements = element.select(EN_CSS_PATH); if (elements.size() == 1) { en = elements.get(0).text().trim(); LOGGER.debug("???:" + en); if (en.split("\\s+").length < 2) { LOGGER.debug("???"); continue; } } elements = element.select(CN_CSS_PATH); if (elements.size() == 1) { cn = elements.get(0).text().trim(); LOGGER.debug("???:" + cn); } if (StringUtils.isNotBlank(en) && StringUtils.isNotBlank(cn)) { sentences.put(en, cn); //? TextAnalyzer.seg(en).forEach(w -> { Word word = new Word(w, ""); WORD_FREQUENCE.putIfAbsent(word, new AtomicInteger()); WORD_FREQUENCE.get(word).incrementAndGet(); }); } } } catch (Exception e) { LOGGER.error("???", e); } return sentences; }
From source file:org.asqatasun.rules.elementselector.AreaElementSelector.java
@Override public void selectElements(SSPHandler sspHandler, ElementHandler<Element> elementHandler) { super.selectElements(sspHandler, elementHandler); if (elementHandler.isEmpty()) { return;//from ww w .ja v a2s. co m } Elements mapElementsAssociatedWithImg = new Elements(); // for each map element, search the associated image and store the // element in a new collection for (Element map : elementHandler.get()) { if (isMapAssociatedWithImage(sspHandler, map)) { mapElementsAssociatedWithImg.add(map); } } elementHandler.clean(); // for all well-formed maps, keep all the area children and return them for (Element map : mapElementsAssociatedWithImg) { elementHandler.addAll(map.select(areaSelectionKey)); } }
From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java
/** * /*from ww w . j av a2 s . co m*/ * @param element * @return whether one of the preceding sibling is of heading type */ private boolean isOneOfPrecedingSiblingofHeadingType(Element element) { Element prevElementSibling = element.previousElementSibling(); while (prevElementSibling != null) { if (PREV_SIBLING_CONTEXT_ELEMENTS.contains(prevElementSibling.tagName()) || !prevElementSibling.select(CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY).isEmpty()) { return true; } prevElementSibling = prevElementSibling.previousElementSibling(); } return false; }
From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule010204.java
/**
 * Sorts each svg element into the handlers matching the patterns detected on it.
 *
 * <p>An element may be added to several handlers. It is added to
 * {@code wellFormedSvgElements} only when none of the four patterns was detected.
 *
 * @param svgElements                     the elements to classify
 * @param svgElementsWithoutRoleImage     receives elements whose {@code ROLE_ATTR} attribute
 *                                        is not {@code "img"} (case-insensitive)
 * @param ariaAttrOnSvgOrChild            receives elements that carry one of the
 *                                        {@code ARIA_LABEL_ATTR}, {@code ARIA_LABELLEDBY_ATTR}
 *                                        or {@code ARIA_DESCRIBEDBY_ATTR} attributes, or for
 *                                        which the combined aria query finds a match
 * @param svgElementsWithDescOrTitleChild receives elements for which the combined
 *                                        {@code NOT_EMPTY_ARIA_TITLE_CSS_LIKE_QUERY} /
 *                                        {@code NOT_EMPTY_ARIA_DESC_CSS_LIKE_QUERY} query
 *                                        finds a match
 * @param titleAttrOnSvgOrChild           receives elements that carry the
 *                                        {@code TITLE_ELEMENT} attribute or for which
 *                                        {@code [title]} finds a match
 * @param wellFormedSvgElements           receives elements with no detected pattern; may be
 *                                        {@code null}, in which case they are not collected
 */
private void extractMalformedPatternDetectedElements(ElementHandler<Element> svgElements,
        ElementHandler<Element> svgElementsWithoutRoleImage,
        ElementHandler<Element> ariaAttrOnSvgOrChild,
        ElementHandler<Element> svgElementsWithDescOrTitleChild,
        ElementHandler<Element> titleAttrOnSvgOrChild,
        ElementHandler<Element> wellFormedSvgElements) {
    final String ariaQuery = ARIA_DESCRIBEDBY_CSS_LIKE_QUERY + "," + ARIA_LABEL_CSS_LIKE_QUERY + ","
            + ARIA_LABELLEDBY_CSS_LIKE_QUERY;
    final String descOrTitleQuery = NOT_EMPTY_ARIA_TITLE_CSS_LIKE_QUERY + ","
            + NOT_EMPTY_ARIA_DESC_CSS_LIKE_QUERY;

    for (Element svg : svgElements.get()) {
        final boolean lacksImgRole = !StringUtils.equalsIgnoreCase(svg.attr(ROLE_ATTR), "img");
        final boolean hasAria = svg.hasAttr(ARIA_LABEL_ATTR)
                || svg.hasAttr(ARIA_LABELLEDBY_ATTR)
                || svg.hasAttr(ARIA_DESCRIBEDBY_ATTR)
                || !svg.select(ariaQuery).isEmpty();
        final boolean hasDescOrTitle = !svg.select(descOrTitleQuery).isEmpty();
        final boolean hasTitleAttr = svg.hasAttr(TITLE_ELEMENT) || !svg.select("[title]").isEmpty();

        if (lacksImgRole) {
            svgElementsWithoutRoleImage.add(svg);
        }
        if (hasAria) {
            ariaAttrOnSvgOrChild.add(svg);
        }
        if (hasDescOrTitle) {
            svgElementsWithDescOrTitleChild.add(svg);
        }
        if (hasTitleAttr) {
            titleAttrOnSvgOrChild.add(svg);
        }

        final boolean anyPattern = lacksImgRole || hasAria || hasDescOrTitle || hasTitleAttr;
        if (wellFormedSvgElements != null && !anyPattern) {
            wellFormedSvgElements.add(svg);
        }
    }
}
From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule010306.java
/**
 * Tells whether the {@code DESC_ELEMENT} query finds anything on the given element.
 *
 * @param element the element on which the query is run
 * @return {@code true} when {@code element.select(DESC_ELEMENT)} yields at least one match,
 *         {@code false} otherwise
 */
private boolean isChildDescExists(Element element) {
    return !element.select(DESC_ELEMENT).isEmpty();
}
From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule010306.java
/**
 * Returns the text of the first match of the {@code DESC_ELEMENT} query on the given element.
 *
 * <p>NOTE(review): the first match is dereferenced without a check, so this appears to throw
 * a {@code NullPointerException} when the query matches nothing; presumably callers check
 * {@code isChildDescExists} first. Confirm.
 *
 * @param element the element on which the query is run
 * @return the textual content of the first matched element
 */
private String getDescText(Element element) {
    Element firstDesc = element.select(DESC_ELEMENT).first();
    return firstDesc.text();
}
From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule050801.java
/** * /*from www. j a va 2 s . c om*/ * @param sspHandler * @param elementHandler * @param elementHandlerWithoutDataTableMarkup */ private void extractTableWithDataTableMarkup(ElementHandler<Element> elementHandler, ElementHandler<Element> elementHandlerWithoutDataTableMarkup) { Elements elementsWithMarkup = new Elements(); for (Element el : elementHandler.get()) { if (el.select(DATA_TABLE_MARKUP_CSS_LIKE_QUERY).size() > 0) { elementsWithMarkup.add(el); } else if (elementHandlerWithoutDataTableMarkup != null) { elementHandlerWithoutDataTableMarkup.add(el); } } elementHandler.clean().addAll(elementsWithMarkup); }
From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule110102.java
/** * This method linked each label which have an input child on a page to its * form in a map./* ww w .java2 s . co m*/ */ private void putLabelElementHandlerIntoTheMap() { for (Element el : labelElementHandler.get()) { Element tmpElement = el.parent(); while (tmpElement != null && StringUtils.isNotBlank(tmpElement.tagName())) { if (tmpElement.tagName().equals(FORM_ELEMENT)) { if (labelFormMap.containsKey(tmpElement)) { Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY); if (!els.isEmpty()) { labelFormMap.get(tmpElement).add(el); } } else { Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY); if (!els.isEmpty()) { ElementHandler<Element> labelElement = new ElementHandlerImpl(); labelElement.add(el); labelFormMap.put(tmpElement, labelElement); } } break; } tmpElement = tmpElement.parent(); } } }
From source file:org.b3log.wordman.word.Main.java
/** * ?.//from ww w .j a v a 2s . c o m * * @param args ? * @throws java.lang.Exception */ public static void main(final String[] args) throws Exception { final Clazz clazz = new Clazz(); clazz.setId(CLASS_ID); clazz.setName(CLASS_NAME); final List<Word> classWords = new ArrayList<Word>(); clazz.setWords(classWords); for (int clazzNum = 1; clazzNum <= CLASS_NUM; clazzNum++) { final Connection.Response response = Jsoup .connect("http://word.iciba.com/?action=words&class=" + clazz.getId() + "&course=" + clazzNum) .userAgent("Mozilla").timeout(TIMEOUT).execute(); final Document document = response.parse(); int classWordCnt = 0; for (int i = 1; i <= PAGE; i++) { final Elements wordList = document.select("ul#word_list_" + i); final Elements wordLi = wordList.select("li"); for (final Element li : wordLi) { final Word word = new Word(); word.setId(UUID.randomUUID().toString().replaceAll("-", "")); final Element w = li.select("div.word_main_list_w").get(0); String spell = w.select("span").get(0).attr("title"); // ?? spell = spell.replace("*", "").replaceAll("\\(.*\\)", "").replace("\\", ""); spell = spell.trim(); word.setWord(spell); if (!checkWord(spell)) { // throw new IllegalStateException(" [" + spell + ']'); } final Element y = li.select("div.word_main_list_y").get(0); word.setPhon(y.select("strong").get(0).text()); word.setPron(y.select("a").get(0).id()); final Element s = li.select("div.word_main_list_s").get(0); word.setPara(s.select("span").get(0).text()); // ??? word.setBuild(""); word.setExample(""); // System.out.println(word.toString()); classWords.add(word); classWordCnt++; } } System.out.println("? [" + clazzNum + "] ??? 
[" + classWordCnt + "]"); } final StringBuilder sqlBuilder = new StringBuilder(); final List<String> sqls = clazz.toSQLs(); for (final String sql : sqls) { System.out.println(sql); sqlBuilder.append(sql).append(IOUtils.LINE_SEPARATOR); } final OutputStream outputStream = new FileOutputStream(new File("C:\\" + CLASS_NAME + ".sql")); IOUtils.write(sqlBuilder.toString(), outputStream, "UTF-8"); IOUtils.closeQuietly(outputStream); }
From source file:org.bigmouth.tfc.v1.PageImpl.java
@Override public List<Item> getItems() { if (null == this.asynSearchDoc) { throw new IllegalStateException("Please do init."); }//from ww w . j a v a 2 s .c om List<Item> result = Lists.newArrayList(); Elements itemLines = asynSearchDoc.select("div.J_TItems div"); for (Element line : itemLines) { Elements items = line.select(".item"); for (Element item : items) { Elements data = item.select(".J_TGoldData"); if (CollectionUtils.isNotEmpty(data)) { // ? Item o = new Item(); Elements a = item.select(".detail a"); String name = a.text(); if (LOGGER.isInfoEnabled()) { LOGGER.info("Captured item: {}", name); } o.setName(name); Elements priceElements = item.select(".detail .attribute .cprice-area .c-price"); if (CollectionUtils.isNotEmpty(priceElements)) { Element price = priceElements.get(0); o.setPrice(new BigDecimal(price.text())); } Element datainf = data.get(0); String href = Constants.PROTOCOL_PREFIX + datainf.attr("href"); o.setUrl(href); DetailPage dp = new DetailPage(href); o.parseAttributes(dp.getAttributes()); result.add(o); } else { // ?? } } } return result; }