Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes Element select

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element method select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:net.trustie.model.SFProject_Model.java

/**
 * Extracts project metadata from a page that uses the "bluesteel" layout and assigns it to
 * the corresponding fields (declared outside this method).
 *
 * <p>Each value is located with a CSS selector. Values guarded by an emptiness check (name,
 * stars, download count, last update, platform, license) keep their previous value when the
 * selector matches nothing. The remaining values (maintainers, description, categories,
 * features) are always overwritten with the combined text of all matches.
 *
 * @param doc the parsed project page
 */
private void extractPageBluesteelUser(Document doc) {
    // name
    Elements nameElements = doc.select("div#project-header section#project-title h1[itemprop=name]");
    if (!nameElements.isEmpty()) {
        name = nameElements.get(0).text();
    }

    // maintainers
    maintainers = doc.select("div#project-header section#project-title p#maintainers a").text();

    // stars
    Elements starElements = doc.select(
            "article#project section#main-content section#call-to-action section#counts-sharing section.project-info section.content a[title=Browse reviews]");
    if (!starElements.isEmpty()) {
        // Keep only digits and dots from the link text.
        String strStar = starElements.get(0).text().replaceAll("[^\\d\\.]", "");
        // A link text without any digit leaves an empty string, which parseFloat rejects.
        if (!strStar.isEmpty()) {
            stars = Float.parseFloat(strStar);
        }
    }

    // downloadCount (kept as a digit-only string)
    Elements downloadElements = doc.select(
            "article#project section#main-content section#call-to-action section#counts-sharing section#download-stats section.content a[title=Downloads This Week]");
    if (!downloadElements.isEmpty()) {
        downloadCount = downloadElements.get(0).text().replaceAll("[^\\d]", "");
    }

    // last update (raw value of the datetime attribute)
    Elements lastUpdateElements = doc.select(
            "article#project section#main-content section#call-to-action section#counts-sharing section#last-updated section.content time.dateUpdated");
    if (!lastUpdateElements.isEmpty()) {
        lastUpdate = lastUpdateElements.get(0).attr("datetime");
    }

    // platform
    Elements platformElements = doc.select(
            "article#project section#main-content section#call-to-action section#download_button section.project-info");
    if (!platformElements.isEmpty()) {
        platform = platformElements.text();
    }

    // desc
    desc = doc.select("article#project section#main-content section#project-description p").text();

    // categories
    categories = doc.select(
            "article#project section#main-content section#project-categories-and-license div.project-container section:has(header:contains(Categories)) a")
            .text();

    // license: the :has(...) pseudo-selector must be closed, otherwise the query is malformed
    Elements licenseElements = doc.select(
            "article#project section#main-content section#project-categories-and-license div.project-container section:has(section.project-info header:contains(License))");
    if (!licenseElements.isEmpty()) {
        license = licenseElements.get(0).select("section.project-info section.content").text();
    }

    // feature
    feature = doc.select(
            "article#project section#main-content section#project-features div[class=content editable]")
            .text();

    // language + intended audience + user interface + programming language + registered time:
    // every info section carries a header naming the value stored in its content section
    Elements additionalElements = doc.select(
            "article#project section#main-content section#project-additional-trove div.project-container section.project-info");
    for (Element element : additionalElements) {
        Elements tags = element.select("header");
        if (tags.isEmpty()) {
            continue;
        }
        String tag = tags.text();
        if ("Languages".equals(tag)) {
            language = element.select("section.content").text();
        } else if ("Intended Audience".equals(tag)) {
            intendedAudience = element.select("section.content").text();
        } else if ("User Interface".equals(tag)) {
            userInterface = element.select("section.content").text();
        } else if ("Programming Language".equals(tag)) {
            programmingLanguage = element.select("section.content").text();
        } else if ("Registered".equals(tag)) {
            registeredTime = element.select("section.content").text();
        }
        // any other header is ignored
    }
}

From source file:nl.detoren.ijsco.io.OSBOLoader.java

/**
 * Builds the player list from the first {@code <table>} of the given document.
 *
 * <p>Only rows with more than seven {@code <td>} cells are used. Cell 1 is the name, cell 2
 * the club, cells 3 and 4 the two ratings, cell 6 the year of birth, cell 7 the category and
 * cell 8 the KNSB number. A numeric cell that is missing or not an integer is replaced by a
 * fallback value (0 for the KNSB number, -1 for the others) and the exception is printed to
 * standard output.
 *
 * @param doc the parsed page
 * @return the players found; empty when the document contains no table
 */
private Spelers load(Document doc) {
    Spelers spelers = new Spelers();
    Element table = doc.select("table").first();
    if (table == null) {
        // No table on the page: first() returned null, so there is nothing to read.
        return spelers;
    }
    for (Element row : table.select("tr")) {
        Elements cells = row.select("td");
        if (cells.size() <= 7) {
            continue;
        }
        // Indexes 1, 2 and 7 are guaranteed to exist by the size check above.
        String naam = cells.get(1).text();
        // Cell 8 is absent when the row has exactly eight cells; the helper then yields 0.
        int knsbnummer = parseIntCell(cells, 8, 0);
        int osborating = parseIntCell(cells, 3, -1);
        int knsbrating = parseIntCell(cells, 4, -1);
        String vereniging = cells.get(2).text();
        int geboortejaar = parseIntCell(cells, 6, -1);
        String categorie = cells.get(7).text();

        spelers.add(new Speler(knsbnummer, naam, vereniging, geboortejaar, categorie, osborating,
                knsbrating));
    }
    return spelers;
}

/** Parses the text of cell {@code index} as an int, printing the failure and returning {@code fallback} otherwise. */
private static int parseIntCell(Elements cells, int index, int fallback) {
    try {
        return Integer.parseInt(cells.get(index).text());
    } catch (RuntimeException e) {
        // Covers both a missing cell (index out of bounds) and non-numeric text.
        System.out.println(e);
        return fallback;
    }
}

From source file:org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java

/**
 * ????/*from   ww w.  j  a v  a  2 s. c om*/
 * @param elements
 * @return
 */
protected String getExtractText(Elements elements) {
    if (elements.size() == 0)
        return null;
    String temp = "";

    if (attr.equalsIgnoreCase("tostring")) {
        return temp = elements.toString();
    } else {
        if (index == -1 && StringUtils.isNotBlank(this.regex)) {
            for (Element e : elements) {
                Element element = e;
                if (element.select(this.regex).size() > 0) {
                    return temp = e.text();
                }
            }
            return temp;
        } else {
            if (index > -1 && index < elements.size()) {
                return elements.get(index).text();
            }
        }
        return elements.first().text();
    }

    /*if(attr.equals("tostring")){
       if(index==0 || index>elements.size())
    temp = elements.first().toString();
       else
    temp = elements.get(index).toString();
    }else{
       if(index==0 || index>elements.size())
    temp = elements.first().text();
       else
    temp = elements.get(index).text();
    }
            
    if(null!=pattern){
       Matcher m = pattern.matcher(temp);
       if(m.find()){
    temp = m.group(1);
       }
    }*/
    //return temp;
}

From source file:org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java

/**
 * Extracts an attribute value from the given elements according to this selector's
 * {@code index} and {@code regex} settings.
 *
 * <ul>
 *   <li>{@code attr} equals "tostring" (case-insensitive): the value returned by
 *       {@code elements.attr(attr)}.</li>
 *   <li>{@code index == -1} and {@code regex} is not blank: the attribute of the first
 *       element in which {@code regex}, used as a selector query, matches something; the
 *       empty string when no element qualifies.</li>
 *   <li>{@code index} within bounds: the attribute of the element at that index.</li>
 *   <li>Otherwise: the attribute of the first element, or the empty string when there are no
 *       elements at all.</li>
 * </ul>
 *
 * @param elements the elements to extract from
 * @param attr the attribute name (shadows the field of the same name)
 * @return the attribute value, or the empty string when nothing could be extracted
 */
protected String getExtractAttr(Elements elements, String attr) {
    if (attr.equalsIgnoreCase("tostring")) {
        // NOTE(review): this reads an attribute literally named "tostring"; the text variant
        // of this method returns elements.toString() instead - confirm which one is intended.
        return elements.attr(attr).toString();
    }
    if (index == -1 && StringUtils.isNotBlank(this.regex)) {
        // NOTE(review): despite its name, regex is passed to select() as a CSS query here.
        for (Element element : elements) {
            if (element.select(this.regex).size() > 0) {
                return element.attr(attr);
            }
        }
        return "";
    }
    if (index > -1 && index < elements.size()) {
        return elements.get(index).attr(attr);
    }
    // first() yields null for an empty selection; report "nothing found" instead of failing.
    Element first = elements.first();
    return first == null ? "" : first.attr(attr);
}

From source file:org.aliuge.crawler.jobconf.ExtractConfig.java

/**
 * Populates this configuration from a parsed job document.
 *
 * <p>The job name and index name are read from the {@code job} element, the optional thread
 * count from {@code extract > threadNum}, and one {@code ExtractTemplate} is built and
 * registered for every {@code template} element found below {@code extract}.
 *
 * @param doc the parsed job configuration
 * @return this instance
 */
public ExtractConfig loadConfig(Document doc) {
    Elements extractNodes = doc.select("extract");
    Elements jobNodes = doc.select("job");
    super.setJobName(jobNodes.attr("name"));
    super.setIndexName(jobNodes.attr("indexName"));

    String threadNumText = extractNodes.select("threadNum").text();
    if (StringUtils.isNotBlank(threadNumText)) {
        this.threadNum = Integer.parseInt(threadNumText);
    }

    for (Element templateNode : extractNodes.select("extract").select("template")) {
        ExtractTemplate extractTemplate = new ExtractTemplate();

        // URL patterns this template applies to
        List<Pattern> urlPatterns = Lists.newArrayList();
        for (Element urlNode : templateNode.select("url")) {
            urlPatterns.add(Pattern.compile(urlNode.text()));
        }
        extractTemplate.setUrlPattern(urlPatterns);
        extractTemplate.setName(templateNode.attr("name"));

        // Direct children of the first <elements> node: selectors and conditions
        for (Element child : templateNode.select("elements").first().children()) {
            String tagName = child.tagName();
            if ("element".equals(tagName)) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(child);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(tagName)) {
                IFConditions conditions = IFConditions.create(child);
                extractTemplate.addConditions(conditions);
            }
        }

        // Registered once per template, so it is not called when there are no templates.
        super.setExtractConfig(this);
        this.templates.add(extractTemplate);
    }
    return this;
}

From source file:org.apache.archiva.web.docs.RestDocsServlet.java

/**
 * Serves a documentation resource from the context class loader.
 *
 * <p>Resources ending in {@code .xsd} are written XML-escaped. Any other resource is parsed
 * as HTML and rewritten before being sent: link targets are prefixed with
 * {@code #<first path segment>/}, {@code code} elements get the extra class
 * {@code nice-code}, headings h1/h2/h3 are demoted to h3/h4/h5, and only the
 * {@code div#main} element plus the remaining {@code script} elements are returned.
 *
 * <p>Responds with 404 when the request has no path info or the resource does not exist.
 */
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {

    logger.debug("docs request to path: {}", req.getPathInfo());

    // NOTE(review): the path comes straight from the request and is used as a class-path
    // resource name - confirm that only documentation resources are reachable this way.
    String path = StringUtils.removeStart(req.getPathInfo(), "/");
    InputStream is = path == null ? null
            : Thread.currentThread().getContextClassLoader().getResourceAsStream(path);
    if (is == null) {
        resp.sendError(HttpServletResponse.SC_NOT_FOUND);
        return;
    }

    try {
        if (StringUtils.endsWith(path, ".xsd")) {
            StringEscapeUtils.escapeXml(resp.getWriter(), IOUtils.toString(is));
            return;
        }

        String startPath = StringUtils.substringBefore(path, "/");

        // replace all links !!
        Document document = Jsoup.parse(is, "UTF-8", "");

        Element body = document.body().child(0);

        for (Element link : body.select("a[href]")) {
            link.attr("href", "#" + startPath + "/" + link.attr("href"));
        }

        for (Element link : body.select("[data-href]")) {
            link.attr("data-href", "#" + startPath + "/" + link.attr("data-href"));
        }

        for (Element code : body.select("code")) {
            code.attr("class", code.attr("class") + " nice-code");
        }

        // default generated enunciate use h1/h2/h3 which is quite big so transform to h3/h4/h5.
        // The deepest level is demoted first: renaming h1 to h3 before the h3 pass would make
        // the h3 pass pick up the former h1 elements and turn them into h5 as well.
        for (Element header : body.select("h3")) {
            header.tagName("h5");
        }

        for (Element header : body.select("h2")) {
            header.tagName("h4");
        }

        for (Element header : body.select("h1")) {
            header.tagName("h3");
        }

        Document res = new Document("");
        res.appendChild(body.select("div[id=main]").first());

        // Scripts still attached to body at this point, i.e. those outside div#main.
        for (Element script : body.select("script")) {
            res.appendChild(script);
        }
        // Encode explicitly; the input was parsed as UTF-8 and the platform default may differ.
        resp.getOutputStream().write(res.outerHtml().getBytes("UTF-8"));
    } finally {
        is.close();
    }

}

From source file:org.apdplat.superword.extract.SentenceExtractor.java

/**
 * Collects sentence pairs from an HTML page.
 *
 * <p>For every node matching {@code SENTENCE_CSS_PATH}, the text selected by
 * {@code EN_CSS_PATH} becomes the key and the text selected by {@code CN_CSS_PATH} the
 * value; each is only taken when its selector matches exactly one element. A node whose
 * key text has fewer than two whitespace-separated tokens is skipped. For every stored pair
 * the segments of the key text are counted in {@code WORD_FREQUENCE}.
 *
 * <p>Any exception is logged and ends the scan; pairs collected so far are still returned.
 *
 * @param html the page source
 * @return the collected pairs, possibly empty
 */
public static Map<String, String> parse(String html) {
    Map<String, String> sentences = new HashMap<>();
    try {
        for (Element sentenceNode : Jsoup.parse(html).select(SENTENCE_CSS_PATH)) {
            String en = null;
            Elements enNodes = sentenceNode.select(EN_CSS_PATH);
            if (enNodes.size() == 1) {
                en = enNodes.get(0).text().trim();
                LOGGER.info("???:" + en);
                boolean singleToken = en.split("\\s+").length < 2;
                if (singleToken) {
                    LOGGER.debug("???");
                    continue;
                }
            }

            String cn = null;
            Elements cnNodes = sentenceNode.select(CN_CSS_PATH);
            if (cnNodes.size() == 1) {
                cn = cnNodes.get(0).text().trim();
                LOGGER.info("???:" + cn);
            }

            if (StringUtils.isNotBlank(en) && StringUtils.isNotBlank(cn)) {
                sentences.put(en, cn);
                // count every segment of the key text
                TextAnalyzer.seg(en).forEach(token ->
                        WORD_FREQUENCE.computeIfAbsent(new Word(token, ""), key -> new AtomicInteger())
                                .incrementAndGet());
            }
        }
    } catch (Exception e) {
        LOGGER.error("???", e);
    }
    return sentences;
}

From source file:org.apdplat.superword.extract.SynonymAntonymExtractor.java

/**
 * ????/*w  w w.  j  ava2s  .c o m*/
 * @param html
 * @return
 */
public static SynonymAntonym parseSynonymAntonym(String html, String word) {
    SynonymAntonym synonymAntonym = new SynonymAntonym();
    synonymAntonym.setWord(new Word(word, ""));
    try {
        for (Element element : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)) {
            String type = element.select(TYPE).text().trim();
            LOGGER.debug("type:" + type);
            Elements elements = element.select(WORDS);
            for (Element ele : elements) {
                String w = ele.text().trim();
                LOGGER.debug("word:" + w);
                if (StringUtils.isNotBlank(w)) {
                    switch (type) {
                    case "??":
                        synonymAntonym.addSynonym(new Word(w, ""));
                        break;
                    case "???":
                        synonymAntonym.addAntonym(new Word(w, ""));
                        break;
                    default:
                        LOGGER.error("???????" + type);
                    }
                } else {
                    LOGGER.error("??????" + word);
                }
            }
        }
        LOGGER.info("??????" + synonymAntonym);
    } catch (Exception e) {
        LOGGER.error("??????", e);
    }
    return synonymAntonym;
}

From source file:org.apdplat.superword.tools.Definition.java

/**
 * Extracts definitions from a Webster-style dictionary page.
 *
 * <p>Each result has the form {@code "<part of speech> : <definition>"}, where the
 * definition starts at its first alphabetic character. Definitions shorter than three
 * characters, or without any alphabetic character, are skipped. Any exception is logged and
 * ends the scan; definitions collected so far are still returned.
 *
 * @param html the page source
 * @param cssPath not used by this method; the selectors are fixed
 * @return the extracted definitions, possibly empty
 */
public static List<String> parseDefinitionForWebster(String html, String cssPath) {
    List<String> list = new ArrayList<>();
    try {
        for (Element element : Jsoup.parse(html)
                .select("div.tense-box.quick-def-box.simple-def-box.card-box.def-text div.inner-box-wrapper")) {
            String partOfSpeech = element.select("div.word-attributes span.main-attr em").text().trim();
            for (Element defElement : element.select(
                    "div.definition-block.def-text ul.definition-list.no-count li p.definition-inner-item span")) {
                String def = defElement.text().trim();
                if (def.length() < 3) {
                    continue;
                }
                // Locate the first alphabetic character without running past the end.
                int start = 0;
                while (start < def.length() && !Character.isAlphabetic(def.charAt(start))) {
                    start++;
                }
                if (start == def.length()) {
                    // Nothing alphabetic: skip this entry instead of failing the whole page.
                    continue;
                }
                list.add(partOfSpeech + " " + ": " + def.substring(start));
            }
        }
    } catch (Exception e) {
        LOGGER.error("?", e);
    }
    return list;
}

From source file:org.apdplat.superword.tools.Definition.java

/**
 * Extracts definitions from an Oxford-style dictionary page.
 *
 * <p>Each result has the form {@code "<part of speech> <sequence> <definition>"}, with a
 * single trailing colon removed from the definition. Any exception is logged and ends the
 * scan; definitions collected so far are still returned.
 *
 * @param html the page source
 * @param cssPath not used by this method; the selectors are fixed
 * @return the extracted definitions, possibly empty
 */
public static List<String> parseDefinitionForOxford(String html, String cssPath) {
    List<String> results = new ArrayList<>();
    try {
        for (Element senseGroup : Jsoup.parse(html).select("section.se1.senseGroup")) {
            String partOfSpeech = senseGroup.select("span.partOfSpeech").text().trim();
            for (Element sense : senseGroup.select("div.senseInnerWrapper")) {
                String sequence = sense.select("span.iteration").text().trim();
                String text = sense.select("span.definition").text().trim();
                String cleaned = text.endsWith(":") ? text.substring(0, text.length() - 1) : text;
                results.add(partOfSpeech + " " + sequence + " " + cleaned);
            }
        }
    } catch (Exception e) {
        LOGGER.error("?", e);
    }
    return results;
}