Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

On this page you can find example usages of the text() method of org.jsoup.nodes.Element.

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Sandbox test: parses one sample letters file with jsoup, builds a {@code Letter} for every
 * element of class "section" and prints each letter as pretty-printed JSON.
 *
 * <p>Each direct child of a section is dispatched on its CSS class name: "center" is treated as
 * the addressee line, "Data" as the date/place line, "petit" and "Textpetit_otstup" as notes,
 * and anything else as letter content.
 */
@Test
public void useJsoup() {

    String homeDir = System.getProperty("user.home");

    System.out.println(homeDir);

    //JSOUP API allows to extract all  elements of letters in files

    // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml");

    File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields

        // year of the most recently dated letter; passed to the date parser for later letters
        String previousYear = "";

        for (Element element : doc.getElementsByClass("section")) {
            Letter letter = new Letter();

            StringBuilder content = new StringBuilder();

            for (Element child : element.children()) {

                // debug output: dump every attribute of the child
                for (Attribute att : child.attributes()) {
                    System.out.println(att.getKey() + " " + att.getValue());
                }

                if ("center".equalsIgnoreCase(child.className())) {
                    // addressee: prefer the <strong> text, fall back to the whole child text
                    String toWhom = child.getElementsByTag("strong").text();

                    if (StringUtils.isEmpty(toWhom)) {
                        toWhom = child.text();
                        // System.out.println(toWhom);
                    }

                    // several addressees are separated by two whitespace characters or a comma
                    String[] toWhomArray = toWhom.split("(\\s\\s)|(,)");

                    for (String to : toWhomArray) {
                        RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content
                    }

                    //check if there is anything else here and find date and place - it will be replaced if exists below

                    String entireText = child.text();

                    // whatever remains after removing the addressee text (empty when the fallback above was used)
                    String tail = entireText.replace(toWhom, "");

                    if (StringUtils.isNotEmpty(tail)) {
                        RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present
                    }

                    // System.out.println("two whom\t " +  child.getElementsByTag("strong").text() );

                } else if ("Data".equalsIgnoreCase(child.className())) {

                    // NOTE(review): getElementsByTag presumably never returns null, so the null
                    // check looks redundant - confirm against the jsoup version in use
                    if (child.getElementsByTag("em") != null
                            && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) {
                        RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(),
                                previousYear); //most often date and place are enclosed in em tag

                        // remember the year of this letter so it can be handed to the parser for the next ones
                        if (letter.getDate() != null) {
                            LocalDate localDate = letter.getDate().toInstant().atZone(ZoneId.systemDefault())
                                    .toLocalDate();
                            int year = localDate.getYear();
                            previousYear = year + "";
                        }
                    }

                    // System.out.println("when and where\t " + child.getElementsByTag("em").text());

                } else if ("petit".equalsIgnoreCase(child.className())
                        || "Textpetit_otstup".equalsIgnoreCase(child.className())) {
                    letter.getNotes().add(child.text());

                } else {
                    //System.out.println(child.text() );

                    // replace every <sup> marker with a text node holding its text in square brackets
                    Elements elements = child.getElementsByTag("sup");

                    for (Element e : elements) {
                        String value = e.text();

                        e.replaceWith(new TextNode("[" + value + "]", null));
                    }

                    // NOTE(review): getAllElements() presumably yields child itself plus every
                    // descendant, so the text of nested elements looks like it is appended more
                    // than once - confirm this is intended
                    for (Element el : child.getAllElements()) {
                        // System.out.println(el.tagName());
                        if ("sup".equalsIgnoreCase(el.tagName())) {
                            content.append(" [" + el.text() + "] ");
                        } else {
                            content.append(el.text());
                        }

                    }

                    content.append("\n");

                }

                //                  System.out.println(child.tag() + "\n" );
                //                  System.out.println(child.outerHtml() + "\n" + child.text());
            }

            letter.setContent(content.toString());
            letters.add(letter);
        }

        ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter();

        // print every collected letter as JSON
        for (Letter letter : letters) {
            //                if (letter.getDate() == null)
            //                {

            //                        if (StringUtils.isNotEmpty(person.getLastName()))
            //                        {
            String json = ow.writeValueAsString(letter);

            System.out.println(json);
            //                        }

            //}

        }

    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:us.colloquy.util.DiaryParser.java

/**
 * Test that parses one diary XHTML file with jsoup, splits every element of class "section"
 * into {@code DiaryEntry} objects and prints them.
 *
 * <p>A child that yields a date (via {@code parseDateAndPlace}, defined elsewhere) starts a new
 * entry; every other child is treated as a continuation of the current entry.
 */
@Test
    public void useJsoup() {
        //File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries/dnevnik_1893(2)/OEBPS/Text/0001_1006_2001.xhtml");
        //   File input = new File(System.getProperty("user.home") + "/IdeaProjects/ElasticTest/temp/dnevnik_1862(1)/OEBPS/Text/0001_1006_2001.xhtml");

        File input = new File(System.getProperty("user.home")
                + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml");

        // carried from one parsed date to the next through parseDateAndPlace
        String previousYear = "";

        // value stored as the source of every entry created below
        String sourse = "pointer";

        List<DiaryEntry> diaryEntrys = new ArrayList<>();

        try {
            Document doc = Jsoup.parse(input, "UTF-8");

            for (Element element : doc.getElementsByClass("section")) {
                // entry currently being accumulated; null until the first date is found in this section
                DiaryEntry diaryEntry = null;

                StringBuilder contentBuilder = new StringBuilder();

                for (Element child : element.children()) {
                    //                    for (Attribute att : child.attributes())
                    //                    {
                    //                        //   System.out.println(att.getKey() + " " + att.getValue());
                    //                    }
                    //we need to assume that each element is a continuation unless the entry is a date that starts a new entry
                    //the problem is to distinguish between an entry that contains date and place vs date within an entry

                    //lets try to see if element is a date

                    // scratch entry that only receives the date/place parsed from this child
                    DiaryEntry diaryEntryToCollectDate = new DiaryEntry();

                    //we send it in two cases when text matches year or when text has em element
                    Element em = child.select("em").first();

                    if (em == null && StringUtils.isNotEmpty(child.text())) {
                        Matcher m = yearPattern.matcher(child.text());

                        if (m.find()) {
                            // overwrites the child's text with the first capture group before parsing,
                            // so the text appended to the content below is the replaced text
                            child.text(m.group(1));
                            previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                        }
                    }

                    if (em != null) {
                        previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                    }

                    if (diaryEntryToCollectDate.getDate() != null) //this is the beginning of a new entry
                    {
                        System.out.println("Found date: " + diaryEntryToCollectDate.getDate());
                        //create new DiaryEntry
                        // flush the entry accumulated so far before starting the next one
                        if (diaryEntry != null) {
                            diaryEntry.setEntry(contentBuilder.toString()); //add consecutive entries here
                            diaryEntrys.add(diaryEntry);
                        }

                        diaryEntry = new DiaryEntry();
                        diaryEntry.setSource(sourse);
                        diaryEntry.setDate(diaryEntryToCollectDate.getDate());
                        diaryEntry.setPlace(diaryEntryToCollectDate.getPlace());

                        contentBuilder = new StringBuilder();

                    }

                    // children whose text is 8 characters or shorter are not added to the content
                    // NOTE(review): the reason for the threshold of 8 is not visible here - confirm
                    if (StringUtils.isNotEmpty(child.text()) && child.text().length() > 8) {
                        contentBuilder.append(child.text() + "\n");

                    }
                    //
                    //                    System.out.println(child.tag() + "\n");
                    //                    System.out.println(child.outerHtml() + "\n" + child.text());
                }

                //whatever we still have, add here:
                // (a trailing entry with no accumulated content is not added)
                if (StringUtils.isNotEmpty(contentBuilder.toString()) && diaryEntry != null) {
                    diaryEntry.setEntry(contentBuilder.toString());
                    diaryEntrys.add(diaryEntry);
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        }

        for (DiaryEntry diaryEntry : diaryEntrys) {
            System.out.println(diaryEntry.toString());
        }
    }

From source file:us.colloquy.util.DiaryParser.java

/**
 * Replaces every {@code <sup>} element found under {@code child} with a plain text node that
 * holds the element's text wrapped in square brackets.
 */
private static void replaceSupTag(Element child) {
        for (Element sup : child.getElementsByTag("sup")) {
            TextNode bracketed = new TextNode("[" + sup.text() + "]", null);

            sup.replaceWith(bracketed);
        }
    }

From source file:us.colloquy.util.EpubExtractor.java

/**
 * Finds every file whose path ends with ".ncx" under {@code letterDirectory} (searched to a
 * depth of 6), parses each one with jsoup and adds a {@code DocumentPointer} to {@code uriList}
 * for every accepted {@code navPoint} child that has a non-empty {@code content/@src}.
 *
 * <p>I/O and parsing failures are printed to standard error and not rethrown.
 *
 * @param uriList         set that receives the document pointers
 * @param letterDirectory directory to search for ".ncx" files
 * @param useOnlyNumber   when true, labels that only match the 1-3 digit pattern are accepted too
 */
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory,
        boolean useOnlyNumber) {

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> results = new ArrayList<>();

    int maxDepth = 6;

    // collect all table-of-contents (.ncx) files; the stream is closed by try-with-resources
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        stream.forEach(results::add);

        //            String joined = stream
        //                    .sorted()
        //                    .map(String::valueOf)
        //                    .collect(Collectors.joining("; "));
        //
        //            System.out.println("\nFound: " + joined);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    try {

        for (Path res : results) {
            // src values from the file are appended to this directory below
            Path parent = res.getParent();

            //                System.out.println("---------------------------------------------");
            //                System.out.println(parent.toString());
            //use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            // the text of the last docTitle child wins
            for (Element element : doc.getElementsByTag("docTitle")) {
                //Letter letter = new Letter();

                // StringBuilder content = new StringBuilder();

                for (Element child : element.children()) {
                    title = child.text();
                    // System.out.println("Title: " + title);
                }
            }

            for (Element element : doc.getElementsByTag("avantitul")) {

                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): the '?' characters in this and the following patterns look
                        // like non-ASCII text lost to an encoding error - restore from the original source
                        if (label.matches(
                                "  ? ? .*")) {
                            System.out.println("------------------   " + label);
                        }
                    }
                }

            }

            for (Element element : doc.getElementsByTag("navPoint")) {
                //Letter letter = new Letter();

                // StringBuilder content = new StringBuilder();

                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): as written, "?" is not a valid regular expression, so this
                        // call presumably throws PatternSyntaxException, which the catch block below
                        // would swallow and thereby end the whole loop - confirm and fix the literal
                        if (label.matches("?")) {
                            System.out.println("------------------ " + "?" + " -------------------");

                        } else if (label.contains(" ?")) {
                            // stop processing the remaining children of this navPoint
                            break;
                        }

                        String url = child.getElementsByTag("content").attr("src");

                        // label with a 1-3 digit number followed by letters: always accepted
                        // NOTE(review): the character class "[?--?A-Za-z]" also looks encoding-damaged - verify
                        if (label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*") && StringUtils.isNotEmpty(url)) {
                            // the fragment part ("#...") of the src is dropped
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);
                            //                                System.out.println("nav point: " + label + " src " + parent.toString()
                            //                                        + System.lineSeparator() + url.replaceAll("#.*",""));

                        // label with only a 1-3 digit number: accepted when useOnlyNumber is set
                        } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url)
                                && useOnlyNumber) {
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);
                            //                                System.out.println("nav point: " + label + " src " + parent.toString()
                            //                                        + System.lineSeparator() + url.replaceAll("#.*",""));

                        } else {
                            // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                        }

                    }
                }
            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    //        System.out.println("Size: " + uriList.size());

    //        for (DocumentPointer pointer : uriList)
    //        {
    //            //parse and
    //            System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    //        }
}

From source file:webscrap.WebScrap.java

/**
 * Scrapes Met Museum collection pages, following each page's "next" link, and writes every
 * record classified as a painting to {@code Records.json} as one JSON array.
 *
 * <p>Scraping stops once 500 paintings have been collected or when a page offers no usable
 * "next" link. An I/O failure is reported on standard error.
 *
 * @param args the command line arguments (unused)
 */
public static void main(String[] args) {
    // keys of the JSON records
    String nameTag = "Name";
    String artistTag = "Artist";
    String imgURLTag = "imgURL";
    String dateTag = "Date";
    String mediumTag = "Medium";
    String dimTag = "Dimension";
    String classTag = "Classification";
    String credit_line_tag = "Credit_Line";
    String accessNumTag = "Accession_Number";

    // try-with-resources closes the writer even when a page fetch fails part-way through
    try (FileWriter output = new FileWriter(new File("Records.json"))) {
        JSONArray store = new JSONArray();

        String next = "http://www.metmuseum.org/collection/the-collection-online/search/11432?pos=1&rpp=30&pg=1&rndkey=20150123&ft=*&deptids=2";
        int remaining = 500;
        while (remaining != 0) {
            String artistName = "";
            String imgsrc = "";
            String date = "";
            String medium = "";
            String dim = "";
            String classification = "";
            String credit_line = "";
            String accessNum = "";

            Document doc = Jsoup.connect(next).get();

            // the heading text carries the title followed by an "Email" link label; keep the part
            // before it (limit 2 keeps index 0 valid even when the heading is missing or bare)
            String name = doc.getElementsByTag("h2").text().split("Email", 2)[0];

            // the last image inside the container wins
            for (Element e : doc.select("div#inner-image-container img")) {
                imgsrc = e.absUrl("src");
            }

            Elements divchild = doc.select("div.tombstone").select("div");
            int count = 0;
            for (Element div : divchild) {
                // the first match is skipped, as in the original traversal
                if (count++ == 0) {
                    continue;
                }
                // split on the first colon only, so values containing ':' stay intact
                String[] parts = div.text().split(":", 2);
                if (parts.length < 2) {
                    continue; // not a "Label: value" line
                }
                String value = parts[1];

                switch (parts[0]) {
                case "Artist":
                    artistName = value;
                    break;
                case "Date":
                    date = value;
                    break;
                case "Medium":
                    medium = value;
                    break;
                case "Dimensions":
                    dim = value;
                    break;
                case "Classification":
                    classification = value;
                    break;
                case "Credit Line":
                    credit_line = value;
                    break;
                case "Accession Number":
                    accessNum = value;
                    break;
                default:
                    break;
                }
            }

            // the value keeps the space that follows the colon, hence the leading blank
            if (classification.equals(" Paintings")) {
                JSONObject jsonObj = new JSONObject();
                jsonObj.put(nameTag, name);
                jsonObj.put(artistTag, artistName);
                jsonObj.put(imgURLTag, imgsrc);
                jsonObj.put(dateTag, date);
                jsonObj.put(mediumTag, medium);
                jsonObj.put(dimTag, dim);
                jsonObj.put(classTag, classification);
                jsonObj.put(credit_line_tag, credit_line);
                jsonObj.put(accessNumTag, accessNum);

                store.add(jsonObj);
                remaining--;
            }

            // going to next page; stop cleanly when there is none
            Element link = doc.select("a.next").first();
            if (link == null) {
                break;
            }
            next = link.attr("abs:href");
            if (next.isEmpty()) {
                break;
            }
        }

        output.write(store.toJSONString());
        output.write("\n");
        output.flush();

    } catch (IOException e) {
        System.err.println("Scraping failed: " + e);
    }

}

From source file:wo.trade.SearchPageScraper.java

/**
 * Parses {@code page} with jsoup and converts every element of class "item" (looked up inside
 * the element with id "content" when present, otherwise in the whole document) into a
 * {@code TradeItem}.
 *
 * @return the parsed items in document order
 */
public List<TradeItem> parse() {
    List<TradeItem> tradeItems = new LinkedList<>();
    Document doc = Jsoup.parse(page, "UTF-8");

    Element content = doc.getElementById("content");
    Elements items = (content == null) ? doc.getElementsByClass("item") : content.getElementsByClass("item");

    for (Element element : items) {

        TradeItem item = new TradeItem();

        // ----- Plain attributes of the item container ----- //
        item.id = StringUtils.remove(element.attr("id"), "item-container-");
        item.seller = element.attr("data-seller");
        item.thread = element.attr("data-thread");
        item.sellerid = element.attr("data-sellerid");
        item.buyout = element.attr("data-buyout");
        item.ign = element.attr("data-ign");
        item.league = element.attr("data-league");
        item.name = element.attr("data-name");
        item.corrupted = element.getElementsByClass("corrupted").size() > 0;
        item.identified = element.getElementsByClass("item-unid").size() == 0;

        item.socketsRaw = element.getElementsByClass("sockets-raw").get(0).text();

        Elements accountAge = element.getElementsByAttributeValue("title", "account age and highest level");
        if (accountAge != null && !accountAge.isEmpty()) {
            item.ageAndHighLvl = accountAge.get(0).text();
        }

        // ----- Requirements ----- //
        // sample [ Level:&nbsp;37 ,  Strength:&nbsp;42 ,  Intelligence:&nbsp;42 ]
        String separator = ":";
        for (TextNode reqNode : element.getElementsByClass("requirements").get(0).textNodes()) {
            String req = StringUtils.trimToEmpty(reqNode.getWholeText());
            req = req.replaceAll(regex_horizontal_whitespace, "");
            req = Util.removeThoseDamnWhiteSpace(req);
            switch (trim(substringBefore(req, separator))) {
            case "Level":
                item.reqLvl = trim(substringAfter(req, separator));
                break;
            case "Strength":
                item.reqStr = trim(substringAfter(req, separator));
                break;
            case "Intelligence":
                item.reqInt = trim(substringAfter(req, separator));
                break;
            case "Dexterity":
                item.reqDex = trim(substringAfter(req, separator));
                break;
            }
        }

        // digits following "Item quantity:" of the first "mapq" element, or "" when absent
        item.mapQuantity = element.getElementsByAttributeValue("data-name", "mapq").stream()
                .findFirst()
                .map(Element::text)
                .map(s -> substringAfter(s, "Item quantity:"))
                .map(s -> StringUtils.removePattern(s, "[^\\d]"))
                .orElse("")
                .replaceAll(regex_horizontal_whitespace, "")
                .trim();

        // ----- Rarity by checking the item name link class ----- //
        // itemframe0 - normal
        // itemframe1 - magic
        // itemframe2 - rare
        // itemframe3 - unique
        // itemframe4 - gems
        // itemframe5 - currency
        // itemframe6 - divination card
        String titleClasses = element.getElementsByClass("title").stream()
                .findFirst()
                .map(n -> n.attr("class"))
                .orElse(null);
        String frameDigit = Util.regexMatch("itemframe(\\d)", titleClasses, 1);
        item.rarity = (frameDigit != null) ? Rarity.valueOf(Integer.parseInt(frameDigit)) : Rarity.unknown;

        // ----- Verify ----- //
        item.dataHash = element.getElementsByAttributeValue("onclick", "verify_modern(this)").stream()
                .findFirst()
                .map(n -> n.attr("data-hash"))
                .orElse("")
                .trim();

        // ----- Mods ----- //
        Elements modContainers = element.getElementsByClass("item-mods");
        if (modContainers != null && !modContainers.isEmpty()) {
            Elements bulletItems = modContainers.get(0).getElementsByClass("bullet-item");
            if (!bulletItems.isEmpty()) {
                Elements modLists = bulletItems.get(0).getElementsByTag("ul");
                if (modLists.size() == 2) {
                    // two lists: the first one holds the implicit mod (its last entry)
                    Element implicitLi = modLists.get(0).getElementsByTag("li").last();
                    item.implicitMod = new Mod(implicitLi.attr("data-name"), implicitLi.attr("data-value"));
                }
                // the last list always holds the explicit mods
                Element explicitList = modLists.get(modLists.size() - 1);
                for (Element modLi : explicitList.getElementsByTag("li")) {
                    item.explicitMods.add(new Mod(modLi.attr("data-name"), modLi.attr("data-value")));
                }
            }
        }

        // ----- Properties ----- //
        // this is the third column data (the first col is the image, second is the mods, reqs)
        item.quality = dataNameText(element, "q");
        item.physDmgRangeAtMaxQuality = dataNameText(element, "quality_pd");
        item.eleDmgRange = dataNameText(element, "ed");
        item.attackSpeed = dataNameText(element, "aps");
        item.dmgAtMaxQuality = dataNameText(element, "quality_dps");
        item.physDmgAtMaxQuality = dataNameText(element, "quality_pdps");
        item.eleDmg = dataNameText(element, "edps");
        item.armourAtMaxQuality = dataNameText(element, "quality_armour");
        item.evasionAtMaxQuality = dataNameText(element, "quality_evasion");
        item.energyShieldAtMaxQuality = dataNameText(element, "quality_shield");
        item.block = dataNameText(element, "block");
        item.crit = dataNameText(element, "crit");
        item.level = dataNameText(element, "level");
        item.imageUrl = element.getElementsByAttributeValue("alt", "Item icon").get(0).attr("src");
        // the stack size, when present, is a query parameter of the icon URL
        item.stackSize = asList(split(trimToEmpty(item.imageUrl), '&')).stream()
                .filter(t -> t.startsWith("stackSize="))
                .findFirst()
                .map(s -> substringAfter(s, "="))
                .orElse(null);

        item.online = element.getElementsMatchingText("online").isEmpty() ? "" : "Online";

        tradeItems.add(item);
    }

    return tradeItems;
}

/**
 * Returns the text of the first element under {@code element} whose "data-name" attribute
 * equals {@code dataName}, with horizontal whitespace removed and the result trimmed.
 */
private String dataNameText(Element element, String dataName) {
    return element.getElementsByAttributeValue("data-name", dataName).get(0).text()
            .replaceAll(regex_horizontal_whitespace, "").trim();
}

From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java

/**
 * Extracts the plain text of a quote element, keeping paragraph breaks.
 *
 * <p>Each direct {@code <p>} or {@code <span>} child contributes its text followed by a newline,
 * a {@code <br>} child yields a single newline, bare text nodes are appended as they are, and
 * any other child element is ignored. The collected text is passed through
 * {@code Utils.normalize}; then one leading and one trailing double quote are removed.
 *
 * @param textElement text quote element
 * @return plain string with paragraphs kept
 */
protected static String extractPlainTextFromTextElement(Element textElement) {
    StringBuilder sb = new StringBuilder();

    for (Node childNode : textElement.childNodes()) {
        if (childNode instanceof Element) {
            Element childElement = (Element) childNode;

            String tagName = childElement.tagName();

            if ("p".equals(tagName) || "span".equals(tagName)) {
                sb.append(childElement.text());
                sb.append("\n");
            } else if ("br".equals(tagName)) {
                // prevent double newlines: drop surrounding whitespace collected so far
                sb = new StringBuilder(sb.toString().trim());
                sb.append("\n");
            }

        } else if (childNode instanceof TextNode) {
            TextNode textNode = (TextNode) childNode;

            sb.append(textNode.text());
        }
    }

    // remove leading + ending quotes only; anchors must stay outside a character class,
    // otherwise every quote, parenthesis, '^' and '$' inside the text is stripped as well
    return Utils.normalize(sb.toString()).replaceAll("^\"|\"$", "");
}

From source file:xxx.web.comments.debates.impl.ProConOrgParser.java

/**
 * Extracts the plain text of a quote element, keeping paragraph breaks.
 *
 * <p>Each direct {@code <p>} or {@code <span>} child contributes its text followed by a newline,
 * a {@code <br>} child yields a single newline, bare text nodes are appended as they are, and
 * any other child element is ignored. The collected text is passed through
 * {@code Utils.normalize}; then one leading and one trailing double quote are removed.
 *
 * @param textElement text quote element
 * @return plain string with paragraphs kept
 */
public static String extractPlainTextFromTextElement(Element textElement) {
    StringBuilder sb = new StringBuilder();

    for (Node childNode : textElement.childNodes()) {
        if (childNode instanceof Element) {
            Element childElement = (Element) childNode;

            String tagName = childElement.tagName();

            if ("p".equals(tagName) || "span".equals(tagName)) {
                sb.append(childElement.text());
                sb.append("\n");
            } else if ("br".equals(tagName)) {
                // prevent double newlines: drop surrounding whitespace collected so far
                sb = new StringBuilder(sb.toString().trim());
                sb.append("\n");
            }

        } else if (childNode instanceof TextNode) {
            TextNode textNode = (TextNode) childNode;

            sb.append(textNode.text());
        }
    }

    // remove leading + ending quotes only; anchors must stay outside a character class,
    // otherwise every quote, parenthesis, '^' and '$' inside the text is stripped as well
    return Utils.normalize(sb.toString()).replaceAll("^\"|\"$", "");
}

From source file:xxx.web.comments.roomfordebate.NYTimesArticleExtractor.java

/**
 * Builds an {@code Article} from the HTML of a debate contribution page.
 *
 * <p>Reads timestamp, title, body text, author and topic links from the first
 * {@code article.rfd} element, and debate title, debate URL, debate description and document
 * URL from the surrounding document.
 *
 * @param html page markup
 * @return the populated article
 * @throws ParseException if the publication date matches neither supported format
 * @throws IOException    declared by the signature
 */
public Article extractArticle(String html) throws ParseException, IOException {
    Article article = new Article();

    Document page = Jsoup.parse(html, getBaseName());
    Element root = page.select("article.rfd").iterator().next();

    // timestamp: try the date-with-time form first, then the date-only form (e.g. June 24, 2015)
    String rawDate = root.select("p.pubdate").text().replaceAll("Updated[\\s]+", "");
    Date published;
    try {
        published = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH).parse(rawDate);
    } catch (ParseException withoutTime) {
        published = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH).parse(rawDate);
    }
    article.setTimestamp(published);

    // title
    String headline = root.select("h1").text();
    article.setTitle(Utils.normalize(headline));

    // text: one line per paragraph of the post
    StringBuilder body = new StringBuilder();
    for (Element paragraph : root.select("div.nytint-post > p")) {
        body.append(paragraph.text()).append("\n");
    }
    article.setText(Utils.normalize(body.toString()));

    // debate title
    String debateHeading = page.select("div.nytint-discussion-overview > h2").text();
    article.setDebateTitle(Utils.normalize(debateHeading));

    // debate url
    Element debateLink = page.select("div.nytint-discussion-overview > h2 > a").iterator().next();
    article.setDebateUrl(debateLink.attr("href"));

    // document url
    article.setUrl(page.select("meta[name=communityAssetURL]").attr("content"));

    // debate description: the first child node of the overview paragraph, expected to be text
    Element overviewParagraph = page.select("div.nytint-discussion-overview > p").iterator().next();
    TextNode leadingText = (TextNode) overviewParagraph.childNodes().iterator().next();
    article.setDebateDescription(Utils.normalize(leadingText.text()));

    // author
    Element mugshot = root.select("div.nytint-mugshots > img").iterator().next();
    article.setAuthor(mugshot.attr("alt"));

    // topics
    for (Element tagLink : root.select("p.nytint-tags > a")) {
        article.getTopics().add(tagLink.attr("href"));
    }

    return article;
}