Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes.Element.select

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:org.apdplat.superword.tools.SentenceExtractor.java

/**
 * Extracts sentence pairs from an HTML document.
 *
 * <p>Each element matched by {@code SENTENCE_CSS_PATH} contributes at most one entry:
 * the key is the trimmed text of its single {@code EN_CSS_PATH} match and the value is
 * the trimmed text of its single {@code CN_CSS_PATH} match. A candidate is skipped when
 * the key text has fewer than two whitespace-separated tokens, or when either text is
 * blank or was not matched exactly once. For every stored pair, each token returned by
 * {@code TextAnalyzer.seg} for the key text has its counter in {@code WORD_FREQUENCE}
 * incremented.
 *
 * <p>Any exception is logged and ends the extraction; the entries collected up to that
 * point are still returned.
 *
 * @param html the HTML markup to parse
 * @return the extracted pairs, possibly empty, never {@code null}
 */
public static Map<String, String> parse(String html) {
    Map<String, String> sentences = new HashMap<>();
    try {
        for (Element element : Jsoup.parse(html).select(SENTENCE_CSS_PATH)) {
            String en = null;
            String cn = null;
            Elements elements = element.select(EN_CSS_PATH);
            if (elements.size() == 1) {
                en = elements.get(0).text().trim();
                LOGGER.debug("???:" + en);
                // A single token is not treated as a sentence.
                if (en.split("\\s+").length < 2) {
                    LOGGER.debug("???");
                    continue;
                }
            }
            elements = element.select(CN_CSS_PATH);
            if (elements.size() == 1) {
                cn = elements.get(0).text().trim();
                LOGGER.debug("???:" + cn);
            }
            if (StringUtils.isNotBlank(en) && StringUtils.isNotBlank(cn)) {
                sentences.put(en, cn);
                // Count every token of the stored sentence. computeIfAbsent only creates
                // a counter for a word that has none yet and needs a single map lookup.
                TextAnalyzer.seg(en).forEach(w -> {
                    Word word = new Word(w, "");
                    WORD_FREQUENCE.computeIfAbsent(word, key -> new AtomicInteger()).incrementAndGet();
                });
            }
        }
    } catch (Exception e) {
        LOGGER.error("???", e);
    }
    return sentences;
}

From source file:org.asqatasun.rules.elementselector.AreaElementSelector.java

/**
 * Runs the parent selection, then replaces the selected map elements with the
 * elements that {@code areaSelectionKey} selects from those maps for which
 * {@code isMapAssociatedWithImage} returns true.
 *
 * @param sspHandler the handler passed to the parent selection and to the image check
 * @param elementHandler the handler whose content is replaced; left untouched when the
 *        parent selection leaves it empty
 */
@Override
public void selectElements(SSPHandler sspHandler, ElementHandler<Element> elementHandler) {
    super.selectElements(sspHandler, elementHandler);
    if (!elementHandler.isEmpty()) {
        // First pass: remember the maps that are tied to an image. They are copied
        // into a separate collection because the handler is emptied right after.
        Elements imageBoundMaps = new Elements();
        for (Element candidate : elementHandler.get()) {
            if (!isMapAssociatedWithImage(sspHandler, candidate)) {
                continue;
            }
            imageBoundMaps.add(candidate);
        }

        elementHandler.clean();

        // Second pass: refill the handler with what areaSelectionKey selects from each kept map.
        for (Element imageBoundMap : imageBoundMaps) {
            elementHandler.addAll(imageBoundMap.select(areaSelectionKey));
        }
    }
}

From source file:org.asqatasun.rules.elementselector.LinkElementSelector.java

/**
 * /*from ww  w .  j  av a2  s  . co  m*/
 * @param element
 * @return whether one of the preceding sibling is of heading type
 */
private boolean isOneOfPrecedingSiblingofHeadingType(Element element) {
    Element prevElementSibling = element.previousElementSibling();
    while (prevElementSibling != null) {
        if (PREV_SIBLING_CONTEXT_ELEMENTS.contains(prevElementSibling.tagName())
                || !prevElementSibling.select(CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY).isEmpty()) {
            return true;
        }
        prevElementSibling = prevElementSibling.previousElementSibling();
    }
    return false;
}

From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule010204.java

/**
 * Dispatches each element of {@code svgElements} to the handlers matching the
 * patterns detected on it. An element may end up in several handlers.
 *
 * @param svgElements the elements to inspect
 * @param svgElementsWithoutRoleImage receives elements whose {@code ROLE_ATTR}
 *        attribute is not "img" (case-insensitive)
 * @param ariaAttrOnSvgOrChild receives elements that carry one of the aria
 *        label / labelledby / describedby attributes, or for which the combined
 *        aria selector yields a match
 * @param svgElementsWithDescOrTitleChild receives elements for which the combined
 *        non-empty title / desc selector yields a match
 * @param titleAttrOnSvgOrChild receives elements that have the {@code TITLE_ELEMENT}
 *        attribute, or for which {@code [title]} yields a match
 * @param wellFormedSvgElements receives elements on which none of the above was
 *        detected; may be {@code null}, in which case those elements are not collected
 */
private void extractMalformedPatternDetectedElements(ElementHandler<Element> svgElements,
        ElementHandler<Element> svgElementsWithoutRoleImage, ElementHandler<Element> ariaAttrOnSvgOrChild,
        ElementHandler<Element> svgElementsWithDescOrTitleChild, ElementHandler<Element> titleAttrOnSvgOrChild,
        ElementHandler<Element> wellFormedSvgElements) {
    // The two combined selectors do not depend on the element, so build them once.
    final String ariaQuery = ARIA_DESCRIBEDBY_CSS_LIKE_QUERY
            + "," + ARIA_LABEL_CSS_LIKE_QUERY + "," + ARIA_LABELLEDBY_CSS_LIKE_QUERY;
    final String descOrTitleQuery = NOT_EMPTY_ARIA_TITLE_CSS_LIKE_QUERY + "," + NOT_EMPTY_ARIA_DESC_CSS_LIKE_QUERY;

    for (Element svg : svgElements.get()) {
        final boolean lacksImgRole = !StringUtils.equalsIgnoreCase(svg.attr(ROLE_ATTR), "img");
        if (lacksImgRole) {
            svgElementsWithoutRoleImage.add(svg);
        }

        final boolean hasAria = svg.hasAttr(ARIA_LABEL_ATTR)
                || svg.hasAttr(ARIA_LABELLEDBY_ATTR)
                || svg.hasAttr(ARIA_DESCRIBEDBY_ATTR)
                || !svg.select(ariaQuery).isEmpty();
        if (hasAria) {
            ariaAttrOnSvgOrChild.add(svg);
        }

        final boolean hasDescOrTitle = !svg.select(descOrTitleQuery).isEmpty();
        if (hasDescOrTitle) {
            svgElementsWithDescOrTitleChild.add(svg);
        }

        final boolean hasTitleAttr = svg.hasAttr(TITLE_ELEMENT) || !svg.select("[title]").isEmpty();
        if (hasTitleAttr) {
            titleAttrOnSvgOrChild.add(svg);
        }

        final boolean nothingDetected = !(lacksImgRole || hasAria || hasDescOrTitle || hasTitleAttr);
        if (nothingDetected && wellFormedSvgElements != null) {
            wellFormedSvgElements.add(svg);
        }
    }
}

From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule010306.java

/**
 * Tells whether selecting {@code DESC_ELEMENT} from the given element yields
 * at least one match.
 *
 * @param element the element to search from
 * @return {@code true} when at least one match is found, {@code false} otherwise
 */
private boolean isChildDescExists(Element element) {
    final boolean noDescFound = element.select(DESC_ELEMENT).isEmpty();
    return !noDescFound;
}

From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule010306.java

/**
 * Returns the text of the first match for {@code DESC_ELEMENT} selected from
 * the given element.
 *
 * @param element the element to search from
 * @return the textual content of the desc tag, or an empty string when no desc
 *         tag is found
 */
private String getDescText(Element element) {
    // first() returns null on an empty selection; guard against it rather than
    // failing with a NullPointerException.
    Element desc = element.select(DESC_ELEMENT).first();
    if (desc == null) {
        return "";
    }
    return desc.text();
}

From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule050801.java

/**
 * /*from  www. j a va 2 s .  c  om*/
 * @param sspHandler
 * @param elementHandler 
 * @param elementHandlerWithoutDataTableMarkup
 */
private void extractTableWithDataTableMarkup(ElementHandler<Element> elementHandler,
        ElementHandler<Element> elementHandlerWithoutDataTableMarkup) {

    Elements elementsWithMarkup = new Elements();

    for (Element el : elementHandler.get()) {
        if (el.select(DATA_TABLE_MARKUP_CSS_LIKE_QUERY).size() > 0) {
            elementsWithMarkup.add(el);
        } else if (elementHandlerWithoutDataTableMarkup != null) {
            elementHandlerWithoutDataTableMarkup.add(el);
        }
    }
    elementHandler.clean().addAll(elementsWithMarkup);
}

From source file:org.asqatasun.rules.rgaa30.Rgaa30Rule110102.java

/**
 * This method linked each label which have an input child on a page to its
 * form in a map./*  ww w .java2 s  .  co m*/
 */
private void putLabelElementHandlerIntoTheMap() {
    for (Element el : labelElementHandler.get()) {
        Element tmpElement = el.parent();

        while (tmpElement != null && StringUtils.isNotBlank(tmpElement.tagName())) {
            if (tmpElement.tagName().equals(FORM_ELEMENT)) {
                if (labelFormMap.containsKey(tmpElement)) {
                    Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY);
                    if (!els.isEmpty()) {
                        labelFormMap.get(tmpElement).add(el);
                    }
                } else {
                    Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY);
                    if (!els.isEmpty()) {
                        ElementHandler<Element> labelElement = new ElementHandlerImpl();
                        labelElement.add(el);
                        labelFormMap.put(tmpElement, labelElement);
                    }
                }
                break;
            }
            tmpElement = tmpElement.parent();
        }
    }
}

From source file:org.b3log.wordman.word.Main.java

/**
 * Downloads the word lists of every course of the configured class from
 * word.iciba.com, converts each list item into a {@code Word}, prints the SQL
 * statements produced by {@code Clazz.toSQLs()} and writes them, UTF-8 encoded,
 * to {@code C:\<CLASS_NAME>.sql}.
 *
 * @param args command-line arguments; not used
 * @throws java.lang.Exception if a page cannot be fetched or parsed, if a
 *         spelling is rejected by {@code checkWord}, or if the SQL file cannot
 *         be written
 */
public static void main(final String[] args) throws Exception {
    final Clazz clazz = new Clazz();
    clazz.setId(CLASS_ID);
    clazz.setName(CLASS_NAME);
    final List<Word> classWords = new ArrayList<Word>();
    clazz.setWords(classWords);

    for (int clazzNum = 1; clazzNum <= CLASS_NUM; clazzNum++) {
        final Connection.Response response = Jsoup
                .connect("http://word.iciba.com/?action=words&class=" + clazz.getId() + "&course=" + clazzNum)
                .userAgent("Mozilla").timeout(TIMEOUT).execute();

        final Document document = response.parse();

        int classWordCnt = 0;
        // One course page holds several lists, identified as word_list_1 .. word_list_PAGE.
        for (int i = 1; i <= PAGE; i++) {
            final Elements wordList = document.select("ul#word_list_" + i);
            final Elements wordLi = wordList.select("li");

            for (final Element li : wordLi) {
                final Word word = new Word();
                word.setId(UUID.randomUUID().toString().replaceAll("-", ""));

                final Element w = li.select("div.word_main_list_w").get(0);
                String spell = w.select("span").get(0).attr("title");

                // Strip asterisks, parenthesised parts and backslashes from the spelling.
                spell = spell.replace("*", "").replaceAll("\\(.*\\)", "").replace("\\", "");

                spell = spell.trim();

                word.setWord(spell);
                if (!checkWord(spell)) { // abort on a spelling that checkWord rejects
                    throw new IllegalStateException(" [" + spell + ']');
                }

                final Element y = li.select("div.word_main_list_y").get(0);
                word.setPhon(y.select("strong").get(0).text());
                word.setPron(y.select("a").get(0).id());

                final Element s = li.select("div.word_main_list_s").get(0);
                word.setPara(s.select("span").get(0).text());

                // These two fields are not taken from the page and are left empty.
                word.setBuild("");
                word.setExample("");

                classWords.add(word);
                classWordCnt++;
            }
        }

        System.out.println("? [" + clazzNum + "] ??? [" + classWordCnt + "]");
    }

    final StringBuilder sqlBuilder = new StringBuilder();

    final List<String> sqls = clazz.toSQLs();
    for (final String sql : sqls) {
        System.out.println(sql);
        sqlBuilder.append(sql).append(IOUtils.LINE_SEPARATOR);
    }

    final OutputStream outputStream = new FileOutputStream(new File("C:\\" + CLASS_NAME + ".sql"));
    try {
        IOUtils.write(sqlBuilder.toString(), outputStream, "UTF-8");
    } finally {
        // Release the file handle even when the write fails.
        IOUtils.closeQuietly(outputStream);
    }
}

From source file:org.bigmouth.tfc.v1.PageImpl.java

/**
 * Builds the list of items found in the asynchronously loaded search document.
 *
 * <p>Every {@code .item} element that contains a {@code .J_TGoldData} element
 * becomes an {@code Item} carrying its name, its price when a price element is
 * present, its URL, and the attributes obtained from a {@code DetailPage}
 * created for that URL. Other {@code .item} elements are ignored.
 *
 * @return the captured items, possibly empty
 * @throws IllegalStateException if the search document has not been initialised
 */
@Override
public List<Item> getItems() {
    if (this.asynSearchDoc == null) {
        throw new IllegalStateException("Please do init.");
    }
    List<Item> captured = Lists.newArrayList();
    for (Element row : asynSearchDoc.select("div.J_TItems div")) {
        for (Element candidate : row.select(".item")) {
            Elements goldData = candidate.select(".J_TGoldData");
            if (!CollectionUtils.isNotEmpty(goldData)) {
                // Without gold data the element is not treated as an item.
                continue;
            }

            Item current = new Item();
            String name = candidate.select(".detail a").text();
            if (LOGGER.isInfoEnabled()) {
                LOGGER.info("Captured item: {}", name);
            }
            current.setName(name);

            // The price is optional; it is set only when a price element exists.
            Elements prices = candidate.select(".detail .attribute .cprice-area .c-price");
            if (CollectionUtils.isNotEmpty(prices)) {
                current.setPrice(new BigDecimal(prices.get(0).text()));
            }

            String href = Constants.PROTOCOL_PREFIX + goldData.get(0).attr("href");
            current.setUrl(href);

            DetailPage detailPage = new DetailPage(href);
            current.parseAttributes(detailPage.getAttributes());

            captured.add(current);
        }
    }
    return captured;
}