Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes Element select

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:scrapper.BSEScrapper.java

/**
 * Downloads the page at {@code url} and hands the text of three elements,
 * looked up by their id selectors, to the corresponding setters.
 *
 * @param url address of the page to fetch
 * @throws Exception declared by the original signature; fetching the page may fail
 */
public void parse(String url) throws Exception {
    final Element page = Jsoup.connect(url).get();

    // Each value is read and stored before the next lookup, in the original order.
    final String sensexValue = page.select("#ref_15173681_l").text();
    setSensexValue(sensexValue);

    final String change = page.select("#ref_15173681_c").text();
    setChange(change);

    final String changePercentage = page.select("#ref_15173681_cp").text();
    setChangePercentage(changePercentage);
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Scans a hard-coded diary directory under the user's home for {@code .ncx}
 * files (search depth 6), walks the {@code navPoint} elements of each file and
 * collects a {@link DocumentPointer} for every navigation entry seen while
 * {@code startPrinting} is set. The number of files, the number of pointers and
 * each pointer are printed to standard output.
 *
 * <p>I/O and parsing failures are only printed via {@code printStackTrace()};
 * the method then continues with whatever was collected so far.
 */
@Test
public void getURIForAllDiaries() {

    Set<DocumentPointer> uriList = new HashSet<>();
    //String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries";

    // Root directory of the unpacked volume that is searched for .ncx files.

    String letterDirectory = System.getProperty("user.home")
            + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49";

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> results = new ArrayList<>();

    int maxDepth = 6;

    // Files.find returns a lazily populated stream; try-with-resources closes it.
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {

        stream.forEach(results::add);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    try {

        for (Path res : results) {
            // Directory of the .ncx file; content "src" values are appended to it below.
            Path parent = res.getParent();

            //                System.out.println("---------------------------------------------");
            //                System.out.println(parent.toString());
            //use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            // The text of the last child of the last docTitle element wins.
            for (Element element : doc.getElementsByTag("docTitle")) {
                //Letter letter = new Letter();

                // StringBuilder content = new StringBuilder();

                for (Element child : element.children()) {
                    title = child.text();
                    // System.out.println("Title: " + title);
                }
            }

            //  System.out.println("==========================   " + res.toString() + " ==========================");

            // True while navigation entries should be turned into pointers.
            boolean startPrinting = false;

            // Reset for every .ncx file; cleared once the first matching label is seen.
            boolean newFile = true;

            for (Element element : doc.getElementsByTag("navPoint")) {

                //get nav label and content

                Element navLabelElement = element.select("navLabel").first();
                Element srsElement = element.select("content").first();

                String navLabel = "";
                String srs = "";

                if (navLabelElement != null) {
                    // Asterisks are stripped from the label before it is compared.
                    navLabel = navLabelElement.text().replaceAll("\\*", "").trim();
                }

                if (srsElement != null) {
                    srs = srsElement.attr("src");
                }

                // NOTE(review): here the label is the regex and the literal is the input,
                // which looks reversed. The "??" literal presumably held non-ASCII text
                // that was lost to an encoding problem - confirm against the original file.
                if ("??".matches(navLabel))

                {
                    startPrinting = false;

                    // System.out.println("----------------- end of file pointer ---------------");
                }

                // NOTE(review): as written this pattern starts with a dangling '?' quantifier,
                // which java.util.regex rejects with PatternSyntaxException; that exception
                // would be caught by the catch (Exception e) below. Presumably the literal
                // was non-ASCII text originally - confirm.
                if (StringUtils.isNotEmpty(navLabel)
                        && navLabel.matches("??.*|?? ?.*") && newFile) {
                    newFile = false;
                    startPrinting = true;
                    title = navLabel;
                }

                if (startPrinting) {
                    // System.out.println("----------------- file pointer ---------------");
                    //   System.out.println(navLabel + "\t" + srs);

                    // Everything from '#' onwards is removed from the src before the path is built.
                    DocumentPointer documentPointer = new DocumentPointer(
                            parent.toString() + File.separator + srs.replaceAll("#.*", ""), title);

                    uriList.add(documentPointer);
                }

                //                    for (Element child : element.children())
                //                    {
                //                        String label = child.text();
                //
                //                        if (StringUtils.isNotEmpty(label))
                //                        {
                //                            if (label.matches("??\\s\\d{4}.*"))
                //                            {
                //                                System.out.println("------------------");
                //                            }

                //
                //                            String url = child.getElementsByTag("content").attr("src");
                //
                //                            if (label.matches(".*\\d{1,3}.*[?--?]+.*") &&
                //                                    StringUtils.isNotEmpty(url))
                //                            {
                //                                DocumentPointer letterPointer = new DocumentPointer(parent.toString()
                //                                        + File.separator + url.replaceAll("#.*", ""), title);
                //
                //                                uriList.add(letterPointer);
                ////                                System.out.println("nav point: " + label + " src " + parent.toString()
                ////                                        + System.lineSeparator() + url.replaceAll("#.*",""));
                //
                //
                //                            } else if (label.matches(".*\\d{1,3}.*") &&
                //                                    StringUtils.isNotEmpty(url) && useOnlyNumber)
                //                            {
                //                                DocumentPointer letterPointer = new DocumentPointer(parent.toString()
                //                                        + File.separator + url.replaceAll("#.*", ""), title);
                //
                //                                uriList.add(letterPointer);
                ////                                System.out.println("nav point: " + label + " src " + parent.toString()
                ////                                        + System.lineSeparator() + url.replaceAll("#.*",""));
                //
                //
                //                            } else
                //                            {
                //                                // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                //                            }
                //
                //
                //                        }
                //                        }
            }

            //   System.out.println("==========================   END OF FILE ==========================");

        }
    } catch (Exception e) {
        // Any failure aborts the remaining files; pointers collected so far are still printed.
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (DocumentPointer pointer : uriList) {
        //parse and
        System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    }
}

From source file:us.colloquy.util.DiaryParser.java

/**
 * Parses one hard-coded XHTML diary file and splits the children of every
 * element with class {@code section} into diary entries. A child starts a new
 * entry when {@code parseDateAndPlace} yields a date for it; every child whose
 * text is longer than 8 characters is appended to the entry currently being
 * built. All collected entries are printed at the end.
 */
@Test
    public void useJsoup() {
        File input = new File(System.getProperty("user.home")
                + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml");

        String previousYear = "";
        String sourse = "pointer";
        List<DiaryEntry> diaryEntrys = new ArrayList<>();

        try {
            Document doc = Jsoup.parse(input, "UTF-8");

            for (Element section : doc.getElementsByClass("section")) {
                DiaryEntry current = null;
                StringBuilder body = new StringBuilder();

                for (Element child : section.children()) {
                    // Scratch entry that only receives the date/place found in this child, if any.
                    DiaryEntry dateProbe = new DiaryEntry();

                    // A child is offered to the date parser either when it contains an <em>
                    // element or when its text matches the year pattern.
                    Element em = child.select("em").first();

                    if (em != null) {
                        previousYear = parseDateAndPlace(previousYear, dateProbe, child);
                    } else {
                        String rawText = child.text();
                        if (StringUtils.isNotEmpty(rawText)) {
                            Matcher yearMatcher = yearPattern.matcher(rawText);
                            if (yearMatcher.find()) {
                                // The child's text is replaced by the first capture group before parsing.
                                child.text(yearMatcher.group(1));
                                previousYear = parseDateAndPlace(previousYear, dateProbe, child);
                            }
                        }
                    }

                    if (dateProbe.getDate() != null) {
                        // A date marks the beginning of a new entry.
                        System.out.println("Found date: " + dateProbe.getDate());

                        if (current != null) {
                            // Close the entry that was being accumulated.
                            current.setEntry(body.toString());
                            diaryEntrys.add(current);
                        }

                        current = new DiaryEntry();
                        current.setSource(sourse);
                        current.setDate(dateProbe.getDate());
                        current.setPlace(dateProbe.getPlace());

                        body = new StringBuilder();
                    }

                    // Read the text again here: it may have been rewritten above.
                    String childText = child.text();
                    if (StringUtils.isNotEmpty(childText) && childText.length() > 8) {
                        body.append(childText).append("\n");
                    }
                }

                // Flush whatever is left for the last entry of this section.
                if (current != null && body.length() > 0) {
                    current.setEntry(body.toString());
                    diaryEntrys.add(current);
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        }

        for (DiaryEntry collected : diaryEntrys) {
            System.out.println(collected.toString());
        }
    }

From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java

/**
 * Parses the pro and con comment columns of a page into a {@link Debate}.
 *
 * <p>The title is taken from the text of the {@code h2} elements in the body.
 * Every {@code li} whose class starts with {@code comment} inside
 * {@code ul.comments} becomes an argument with the stance of its column, and
 * every nested {@code li} whose class starts with {@code reply} becomes an
 * argument whose parent id is the id of the enclosing comment. Colons in ids
 * are replaced by underscores.
 *
 * <p>Two things differ from the previous version: a reply now gets the id of
 * its own element (it used to be read from the enclosing comment, so it always
 * equalled the parent id), and a missing id is detected through an empty
 * string, because the id is never {@code null} after {@code String.replace}.
 *
 * @param inputStream stream with the HTML of the page, read as UTF-8
 * @return the parsed debate
 * @throws IOException           if the stream cannot be read
 * @throws IllegalStateException if a comment or reply element has no id
 */
@Override
public Debate parseDebate(InputStream inputStream) throws IOException {
    Debate result = new Debate();

    Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/");

    Element body = doc.body();

    Map<String, Elements> proConElements = new HashMap<>();
    proConElements.put("pro", body.select("div[class=column pro]"));
    proConElements.put("con", body.select("div[class=column con]"));

    // title
    result.setTitle(Utils.normalize(body.select("h2").text()));

    for (Map.Entry<String, Elements> entry : proConElements.entrySet()) {
        // stance is the key the column was registered under
        String stance = entry.getKey();

        Elements comments = entry.getValue().select("ul.comments > li[class^=comment]");

        for (Element element : comments) {

            Element divContent = element.select("div.contents").iterator().next();

            // extract argument content
            Argument argument = extractArgumentFromDivContent(divContent);

            // extract ID; a missing attribute shows up as an empty string, not null
            String parentId = element.attr("id").replace(":", "_");
            if (parentId.isEmpty()) {
                throw new IllegalStateException("Parent id must be known");
            }

            argument.setId(parentId);
            argument.setStance(stance);

            result.getArgumentList().add(argument);

            Elements divReplies = element.select("li[class^=reply]");

            for (Element divReply : divReplies) {

                Element replyDivContent = divReply.select("div.contents").iterator().next();

                // extract reply argument
                Argument replyArgument = extractArgumentFromDivContent(replyDivContent);

                // the reply's own id comes from the reply element, not from its parent comment
                String id = divReply.attr("id").replace(":", "_");
                if (id.isEmpty()) {
                    throw new IllegalStateException("Id must be known");
                }

                replyArgument.setId(id);
                replyArgument.setParentId(parentId);

                // add to debate
                result.getArgumentList().add(replyArgument);
            }
        }
    }

    return result;
}

From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java

/**
 * Builds an {@link Argument} from the content element of a comment or reply.
 *
 * <p>The text comes from the first {@code blockquote}, the vote counts from
 * {@code span.votes-up} and {@code span.votes-down} (the down count is stored
 * as an absolute value), the author from {@code span.name} and the timestamp
 * from {@code span.date} in the format {@code "MMM. dd, yyyy"}. When the date
 * text does not match that format, the timestamp is simply not set.
 *
 * @param divContent element holding the blockquote and the metadata spans
 * @return the populated argument
 * @throws java.util.NoSuchElementException if there is no {@code blockquote}
 * @throws NumberFormatException            if a vote text is not an integer
 */
protected static Argument extractArgumentFromDivContent(Element divContent) {
    Argument argument = new Argument();

    Element blockquote = divContent.select("blockquote").iterator().next();
    argument.setText(ProConOrgParser.extractPlainTextFromTextElement(blockquote));

    String votesUpText = divContent.select("span.votes-up").text();
    String votesDownText = divContent.select("span.votes-down").text();

    // parseInt yields the primitive directly; valueOf would box and unbox again
    int votesUp = Integer.parseInt(votesUpText);
    int votesDown = Integer.parseInt(votesDownText);

    argument.setVoteUpCount(votesUp);
    argument.setVoteDownCount(Math.abs(votesDown));

    argument.setAuthor(divContent.select("span.name").text());

    // time
    DateFormat df = new SimpleDateFormat("MMM. dd, yyyy", Locale.ENGLISH);
    String dateText = divContent.select("span.date").text();
    try {
        argument.setTimestamp(df.parse(dateText));
    } catch (ParseException ignored) {
        // Deliberately ignored: an unparseable date leaves the timestamp unset.
    }

    return argument;
}

From source file:xxx.web.comments.debates.impl.ProConOrgParser.java

/**
 * Parses a page with a two-column PRO/CON table into a {@link Debate}.
 *
 * <p>The title is the text of the first {@code h2} in the body. The row with
 * the answers is located relative to the {@code b} element containing
 * "PRO (yes)": it is the next element sibling of that element's second
 * ancestor. The first cell of that row supplies the "pro" arguments and the
 * second cell the "con" arguments. A per-stance count is printed to standard
 * output before returning.
 *
 * @param inputStream stream with the HTML of the page, read as UTF-8
 * @return the parsed debate, or {@code null} when the page has no {@code h2},
 *         no PRO/CON header, no following row, or a row with fewer than two cells
 * @throws IOException if the stream cannot be read
 */
@Override
public Debate parseDebate(InputStream inputStream) throws IOException {
    Debate result = new Debate();

    Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/");

    Element body = doc.body();

    // title
    Element titleElement = body.select("h2").first();
    if (titleElement == null) {
        // not a debate
        return null;
    }
    result.setTitle(Utils.normalize(titleElement.text()));

    Elements proConTr = body.select("tr > td > b:contains(PRO \\(yes\\))");

    // Walk up two levels from the header marker, then step to the next sibling.
    Element firstAncestor = proConTr.parents().first();
    Element secondAncestor = firstAncestor == null ? null : firstAncestor.parents().first();
    Element trAnswers = secondAncestor == null ? null : secondAncestor.nextElementSibling();
    if (trAnswers == null) {
        // not a pro-con debate
        return null;
    }

    // Both sides are required; get(1) on a shorter list would throw.
    Elements answerCells = trAnswers.select("td");
    if (answerCells.size() < 2) {
        // not a pro-con debate
        return null;
    }

    collectEditorTextArguments(result, answerCells.get(0), "pro");
    collectEditorTextArguments(result, answerCells.get(1), "con");

    // show some stats:
    Map<String, Integer> map = new HashMap<>();
    map.put("pro", 0);
    map.put("con", 0);
    for (Argument argument : result.getArgumentList()) {
        map.put(argument.getStance(), map.get(argument.getStance()) + 1);
    }
    System.out.println(map);

    return result;
}

/**
 * Adds one argument with the given stance for every
 * {@code blockquote > div[class=editortext]} inside {@code sideTd}. The id
 * counter advances for every element, including those whose extracted text is
 * empty; such elements are reported on standard error and not added.
 */
private void collectEditorTextArguments(Debate result, Element sideTd, String stance) {
    for (Element text : sideTd.select("blockquote > div[class=editortext]")) {
        Argument argument = new Argument();
        argument.setStance(stance);
        argument.setText(extractPlainTextFromTextElement(text));
        argument.setOriginalHTML(text.html());

        // set ID
        idCounter++;
        argument.setId("pcq_" + idCounter);

        if (!argument.getText().isEmpty()) {
            result.getArgumentList().add(argument);
        } else {
            System.err.println("Failed to extract text from " + text.html());
        }
    }
}

From source file:xxx.web.comments.roomfordebate.NYTimesArticleExtractor.java

/**
 * Extracts an {@link Article} from the HTML of a single article page.
 *
 * <p>The publication date is read from {@code p.pubdate} (a leading "Updated"
 * is stripped) and parsed first as date with time and, failing that, as a
 * plain date. Title, body text, debate title/url/description, document url,
 * author and topic links are read from fixed selectors.
 *
 * @param html markup of the page
 * @return the populated article
 * @throws ParseException if the date text matches neither supported format
 * @throws IOException    declared by the signature
 */
public Article extractArticle(String html) throws ParseException, IOException {
    Article result = new Article();

    Document doc = Jsoup.parse(html, getBaseName());
    Element article = doc.select("article.rfd").iterator().next();

    // time: try "date, time" first and fall back to the date-only form (e.g. June 24, 2015)
    String dateText = article.select("p.pubdate").text().replaceAll("Updated[\\s]+", "");
    Date timestamp;
    try {
        DateFormat withTime = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH);
        timestamp = withTime.parse(dateText);
    } catch (ParseException notDateTime) {
        DateFormat dateOnly = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH);
        timestamp = dateOnly.parse(dateText);
    }
    result.setTimestamp(timestamp);

    // title
    String title = article.select("h1").text();
    result.setTitle(Utils.normalize(title));

    // text: one line per paragraph
    StringBuilder paragraphs = new StringBuilder();
    for (Element paragraph : article.select("div.nytint-post > p")) {
        paragraphs.append(paragraph.text()).append("\n");
    }
    result.setText(Utils.normalize(paragraphs.toString()));

    // debate title
    String debateTitle = doc.select("div.nytint-discussion-overview > h2").text();
    result.setDebateTitle(Utils.normalize(debateTitle));

    // debate url
    Element debateLink = doc.select("div.nytint-discussion-overview > h2 > a").iterator().next();
    result.setDebateUrl(debateLink.attr("href"));

    // document url
    result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content"));

    // debate description: the first child node of the overview paragraph, which must be a text node
    Element overviewParagraph = doc.select("div.nytint-discussion-overview > p").iterator().next();
    TextNode leadingText = (TextNode) overviewParagraph.childNodes().iterator().next();
    result.setDebateDescription(Utils.normalize(leadingText.text()));

    // author
    Element mugshot = article.select("div.nytint-mugshots > img").iterator().next();
    result.setAuthor(mugshot.attr("alt"));

    // topics
    for (Element tagLink : article.select("p.nytint-tags > a")) {
        result.getTopics().add(tagLink.attr("href"));
    }

    return result;
}