List of usage examples for org.jsoup.nodes Element text
public String text()
From source file:us.colloquy.sandbox.TestExtractor.java
@Test public void useJsoup() { String homeDir = System.getProperty("user.home"); System.out.println(homeDir);/* ww w.j av a2 s . co m*/ //JSOUP API allows to extract all elements of letters in files // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml"); File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html"); try { Document doc = Jsoup.parse(input, "UTF-8"); List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields String previousYear = ""; for (Element element : doc.getElementsByClass("section")) { Letter letter = new Letter(); StringBuilder content = new StringBuilder(); for (Element child : element.children()) { for (Attribute att : child.attributes()) { System.out.println(att.getKey() + " " + att.getValue()); } if ("center".equalsIgnoreCase(child.className())) { String toWhom = child.getElementsByTag("strong").text(); if (StringUtils.isEmpty(toWhom)) { toWhom = child.text(); // System.out.println(toWhom); } String[] toWhomArray = toWhom.split("(\\s\\s)|(,)"); for (String to : toWhomArray) { RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content } //check if there is anything else here and find date and place - it will be replaced if exists below String entireText = child.text(); String tail = entireText.replace(toWhom, ""); if (StringUtils.isNotEmpty(tail)) { RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present } // System.out.println("two whom\t " + child.getElementsByTag("strong").text() ); } else if ("Data".equalsIgnoreCase(child.className())) { if (child.getElementsByTag("em") != null && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) { RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(), previousYear); //most often date and place are enclosed in em tag if (letter.getDate() != null) { LocalDate localDate = 
letter.getDate().toInstant().atZone(ZoneId.systemDefault()) .toLocalDate(); int year = localDate.getYear(); previousYear = year + ""; } } // System.out.println("when and where\t " + child.getElementsByTag("em").text()); } else if ("petit".equalsIgnoreCase(child.className()) || "Textpetit_otstup".equalsIgnoreCase(child.className())) { letter.getNotes().add(child.text()); } else { //System.out.println(child.text() ); Elements elements = child.getElementsByTag("sup"); for (Element e : elements) { String value = e.text(); e.replaceWith(new TextNode("[" + value + "]", null)); } for (Element el : child.getAllElements()) { // System.out.println(el.tagName()); if ("sup".equalsIgnoreCase(el.tagName())) { content.append(" [" + el.text() + "] "); } else { content.append(el.text()); } } content.append("\n"); } // System.out.println(child.tag() + "\n" ); // System.out.println(child.outerHtml() + "\n" + child.text()); } letter.setContent(content.toString()); letters.add(letter); } ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter(); for (Letter letter : letters) { // if (letter.getDate() == null) // { // if (StringUtils.isNotEmpty(person.getLastName())) // { String json = ow.writeValueAsString(letter); System.out.println(json); // } //} } } catch (IOException e) { e.printStackTrace(); } }
From source file:us.colloquy.util.DiaryParser.java
@Test public void useJsoup() { //File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries/dnevnik_1893(2)/OEBPS/Text/0001_1006_2001.xhtml"); // File input = new File(System.getProperty("user.home") + "/IdeaProjects/ElasticTest/temp/dnevnik_1862(1)/OEBPS/Text/0001_1006_2001.xhtml"); File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml"); String previousYear = ""; String sourse = "pointer"; List<DiaryEntry> diaryEntrys = new ArrayList<>(); try {/*from ww w. j av a 2s . c o m*/ Document doc = Jsoup.parse(input, "UTF-8"); for (Element element : doc.getElementsByClass("section")) { DiaryEntry diaryEntry = null; StringBuilder contentBuilder = new StringBuilder(); for (Element child : element.children()) { // for (Attribute att : child.attributes()) // { // // System.out.println(att.getKey() + " " + att.getValue()); // } //we need to assume that each element is a continuation unless the entry is a date that starts a new entry //the problem is to distinguish between an entry that contains date and place vs date within an entry //lets try to see if element is a date DiaryEntry diaryEntryToCollectDate = new DiaryEntry(); //we send it in two cases when text matches year or when text has em element Element em = child.select("em").first(); if (em == null && StringUtils.isNotEmpty(child.text())) { Matcher m = yearPattern.matcher(child.text()); if (m.find()) { child.text(m.group(1)); previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child); } } if (em != null) { previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child); } if (diaryEntryToCollectDate.getDate() != null) //this is the begginng of a new entry { System.out.println("Found date: " + diaryEntryToCollectDate.getDate()); //create new DiaryEntry if (diaryEntry != null) { diaryEntry.setEntry(contentBuilder.toString()); //add consecutive entries 
here diaryEntrys.add(diaryEntry); } diaryEntry = new DiaryEntry(); diaryEntry.setSource(sourse); diaryEntry.setDate(diaryEntryToCollectDate.getDate()); diaryEntry.setPlace(diaryEntryToCollectDate.getPlace()); contentBuilder = new StringBuilder(); } if (StringUtils.isNotEmpty(child.text()) && child.text().length() > 8) { contentBuilder.append(child.text() + "\n"); } // // System.out.println(child.tag() + "\n"); // System.out.println(child.outerHtml() + "\n" + child.text()); } //whatever we still have, add here: if (StringUtils.isNotEmpty(contentBuilder.toString()) && diaryEntry != null) { diaryEntry.setEntry(contentBuilder.toString()); diaryEntrys.add(diaryEntry); } } } catch (IOException e) { e.printStackTrace(); } for (DiaryEntry diaryEntry : diaryEntrys) { System.out.println(diaryEntry.toString()); } }
From source file:us.colloquy.util.DiaryParser.java
/**
 * Replaces every {@code sup} element found under (or equal to) {@code child} with a
 * plain text node holding the element's text wrapped in square brackets.
 *
 * @param child element whose {@code sup} tags are rewritten in place
 */
private static void replaceSupTag(Element child) {
    for (Element superscript : child.getElementsByTag("sup")) {
        String bracketed = "[" + superscript.text() + "]";
        superscript.replaceWith(new TextNode(bracketed, null));
    }
}
From source file:us.colloquy.util.EpubExtractor.java
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) { Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> String.valueOf(path).endsWith(".ncx"))) { stream.forEach(results::add);//from w w w . j a v a2 s . co m // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } for (Element element : doc.getElementsByTag("avantitul")) { for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches( " ? ? .*")) { System.out.println("------------------ " + label); } } } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------ " + "?" 
+ " -------------------"); } else if (label.contains(" ?")) { break; } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*") && StringUtils.isNotEmpty(url)) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } // System.out.println("Size: " + uriList.size()); // for (DocumentPointer pointer : uriList) // { // //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); // } }
From source file:webscrap.WebScrap.java
/** * @param args the command line arguments *//*from w w w . ja v a2 s .c o m*/ public static void main(String[] args) { // TODO code application logic here Document doc; try { doc = Jsoup.connect( "http://www.metmuseum.org/collection/the-collection-online/search/15538?pos=1&rpp=30&pg=1&rndkey=20150122&ft=*&deptids=2") .get(); File jsonFile = new File("Records.json"); FileWriter output = new FileWriter(jsonFile); JSONArray store = new JSONArray(); //Declarations for JSON output String nameTag = "Name"; String name; String artistTag = "Artist"; String artistName; String imgURLTag = "imgURL"; String imgsrc; String dateTag = "Date"; String date; String geoTag = "Geography"; String geoVal; String cultureTag = "Culture"; String culture; String mediumTag = "Medium"; String medium; String dimTag = "Dimension"; String dim; String classTag = "Classification"; String classification; String credit_line_tag = "Credit_Line"; String credit_line; String accessNumTag = "Accession_Number"; String accessNum; String RnRTag = "Rights_and_Reproduction"; String RnR; //trying to load the next urls String next = "http://www.metmuseum.org/collection/the-collection-online/search/11432?pos=1&rpp=30&pg=1&rndkey=20150123&ft=*&deptids=2"; int i = 500; while (i != 0) { name = ""; artistName = ""; imgsrc = ""; date = ""; //geoVal = "not available"; //culture = "not available"; medium = ""; dim = ""; classification = ""; credit_line = ""; accessNum = ""; //RnR = "not available"; doc = Jsoup.connect(next).get(); String o_title = doc.getElementsByTag("h2").text(); String[] part_o = o_title.split("Email"); String part_o1 = part_o[0]; String part_o2 = part_o[1]; //System.out.println(o_title); name = part_o1; //String artist = doc.getElementsByTag("h3").text(); //System.out.println(artist); //artistName = artist; Elements imgdiv = doc.select("div#inner-image-container img"); for (Element e : imgdiv) { imgsrc = e.absUrl("src"); } Elements divs; divs = doc.select("div.tombstone"); Elements divchild; 
divchild = divs.select("div"); int count = 0; for (Element div : divchild) { String info = div.text(); if (count != 0) { String[] parts = info.split(":"); String part1 = parts[0]; String part2 = parts[1]; switch (part1) { case "Artist": artistName = part2; break; case "Date": date = part2; break; case "Geography": geoVal = part2; break; case "Culture": culture = part2; break; case "Medium": medium = part2; break; case "Dimensions": dim = part2; break; case "Classification": classification = part2; break; case "Credit Line": credit_line = part2; break; case "Accession Number": accessNum = part2; break; case "Rights and Reproduction": RnR = part2; break; } } count++; } if (classification.equals(" Paintings")) { //System.out.println(nameTag+name); //System.out.println(artistTag+artistName); //System.out.println(imgURLTag+imgsrc); //System.out.println(dateTag+date); //System.out.println(mediumTag+medium); //System.out.println(dimTag+dim); //System.out.println(classTag+classification); //System.out.println(credit_line_tag+credit_line); //System.out.println(accessNumTag+accessNum); //System.out.println(i); //json writing JSONObject jsonObj = new JSONObject(); jsonObj.put(nameTag, name); jsonObj.put(artistTag, artistName); jsonObj.put(imgURLTag, imgsrc); jsonObj.put(dateTag, date); jsonObj.put(mediumTag, medium); jsonObj.put(dimTag, dim); jsonObj.put(classTag, classification); jsonObj.put(credit_line_tag, credit_line); jsonObj.put(accessNumTag, accessNum); store.add(jsonObj); i--; } //going to next page Element link = doc.select("a.next").first(); next = link.attr("abs:href"); } output.write(store.toJSONString()); output.write("\n"); output.flush(); output.close(); } catch (IOException e) { } }
From source file:wo.trade.SearchPageScraper.java
public List<TradeItem> parse() { List<TradeItem> tradeItems = new LinkedList<>(); Document doc = Jsoup.parse(page, "UTF-8"); Element content = doc.getElementById("content"); Elements items = null;//from ww w.j av a2 s . c om if (content == null) { items = doc.getElementsByClass("item"); } else { items = content.getElementsByClass("item"); } for (Element element : items) { TradeItem item = new TradeItem(); item.id = element.attr("id"); item.id = StringUtils.remove(item.id, "item-container-"); item.seller = element.attr("data-seller"); item.thread = element.attr("data-thread"); item.sellerid = element.attr("data-sellerid"); item.buyout = element.attr("data-buyout"); item.ign = element.attr("data-ign"); item.league = element.attr("data-league"); item.name = element.attr("data-name"); item.corrupted = element.getElementsByClass("corrupted").size() > 0; item.identified = element.getElementsByClass("item-unid").size() == 0; // System.out.println(String.format("Now parsing item id %s name %s", item.id, item.name)); Element sockElem = element.getElementsByClass("sockets-raw").get(0); item.socketsRaw = sockElem.text(); Elements accntAgeElement = element.getElementsByAttributeValue("title", "account age and highest level"); if (accntAgeElement != null && !accntAgeElement.isEmpty()) { item.ageAndHighLvl = accntAgeElement.get(0).text(); } // ----- Requirements ----- // Element reqElem = element.getElementsByClass("requirements").get(0); List<TextNode> reqNodes = reqElem.textNodes(); for (TextNode reqNode : reqNodes) { // sample [ Level: 37 , Strength: 42 , Intelligence: 42 ] String req = StringUtils.trimToEmpty(reqNode.getWholeText()); req = req.replaceAll(regex_horizontal_whitespace, ""); req = Util.removeThoseDamnWhiteSpace(req); String separator = ":"; String reqType = trim(substringBefore(req, separator)); switch (reqType) { case "Level": item.reqLvl = trim(substringAfter(req, separator)); break; case "Strength": item.reqStr = trim(substringAfter(req, separator)); break; 
case "Intelligence": item.reqInt = trim(substringAfter(req, separator)); break; case "Dexterity": item.reqDex = trim(substringAfter(req, separator)); break; } } item.mapQuantity = element.getElementsByAttributeValue("data-name", "mapq").stream().findFirst() .map(n -> n.text()).map(s -> substringAfter(s, "Item quantity:")) .map(s -> StringUtils.removePattern(s, "[^\\d]")).orElse("") .replaceAll(regex_horizontal_whitespace, "").trim(); // ----- Rarity by checking the item name link class ----- // // itemframe0 - normal // itemframe1 - magic // itemframe2 - rare // itemframe3 - unique // itemframe4 - gems // itemframe5 - currency // itemframe6 - divination card String itemframeStr = element.getElementsByClass("title").stream().findFirst().map(n -> n.attr("class")) .orElse(null); itemframeStr = Util.regexMatch("itemframe(\\d)", itemframeStr, 1); if (itemframeStr != null) { int frame = Integer.parseInt(itemframeStr); item.rarity = Rarity.valueOf(frame); } else { item.rarity = Rarity.unknown; } // ----- Verify ----- // item.dataHash = element.getElementsByAttributeValue("onclick", "verify_modern(this)").stream() .findFirst().map(n -> n.attr("data-hash")).orElse("").trim(); // ----- Mods ----- // Elements itemModsElements = element.getElementsByClass("item-mods"); if (itemModsElements != null && itemModsElements.size() > 0) { Element itemMods = itemModsElements.get(0); if (itemMods.getElementsByClass("bullet-item").size() != 0) { Element bulletItem = itemMods.getElementsByClass("bullet-item").get(0); Elements ulMods = bulletItem.getElementsByTag("ul"); if (ulMods.size() == 2) { // implicit mod Elements implicitLIs = ulMods.get(0).getElementsByTag("li"); Element implicitLi = implicitLIs.last(); Mod impMod = new Mod(implicitLi.attr("data-name"), implicitLi.attr("data-value")); item.implicitMod = impMod; } int indexOfExplicitMods = ulMods.size() - 1; Elements modsLi = ulMods.get(indexOfExplicitMods).getElementsByTag("li"); for (Element modLi : modsLi) { // explicit mods Mod 
mod = new Mod(modLi.attr("data-name"), modLi.attr("data-value")); item.explicitMods.add(mod); } } } // ----- Properties ----- // // this is the third column data (the first col is the image, second is the mods, reqs) item.quality = element.getElementsByAttributeValue("data-name", "q").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgRangeAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pd").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmgRange = element.getElementsByAttributeValue("data-name", "ed").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.attackSpeed = element.getElementsByAttributeValue("data-name", "aps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.dmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_dps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pdps").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmg = element.getElementsByAttributeValue("data-name", "edps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.armourAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_armour").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.evasionAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_evasion").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.energyShieldAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_shield") .get(0).text().replaceAll(regex_horizontal_whitespace, "").trim(); item.block = element.getElementsByAttributeValue("data-name", "block").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.crit = element.getElementsByAttributeValue("data-name", "crit").get(0).text() .replaceAll(regex_horizontal_whitespace, 
"").trim(); item.level = element.getElementsByAttributeValue("data-name", "level").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.imageUrl = element.getElementsByAttributeValue("alt", "Item icon").get(0).attr("src"); item.stackSize = asList(split(trimToEmpty(item.imageUrl), '&')).stream() .filter(t -> t.startsWith("stackSize=")).findFirst().map(s -> substringAfter(s, "=")) .orElse(null); Elements onlineSpans = element.getElementsMatchingText("online"); if (!onlineSpans.isEmpty()) { item.online = "Online"; } else { item.online = ""; } tradeItems.add(item); } // System.out.println("DONE --- Items"); return tradeItems; }
From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java
/** * Extracts the document of the quote// w ww . j a va2s . c o m * * @param textElement text quote element * @return plain string with paragraphs kept */ protected static String extractPlainTextFromTextElement(Element textElement) { StringBuilder sb = new StringBuilder(); for (Node childNode : textElement.childNodes()) { if (childNode instanceof Element) { Element childElement = (Element) childNode; String tagName = childElement.tagName(); if ("p".equals(tagName) || "span".equals(tagName)) { sb.append(childElement.text()); sb.append("\n"); } else if ("br".equals(tagName)) { // prevent double newlines sb = new StringBuilder(sb.toString().trim()); sb.append("\n"); } } else if (childNode instanceof TextNode) { TextNode textNode = (TextNode) childNode; sb.append(textNode.text()); } } // remove leading + ending quotes return Utils.normalize(sb.toString()).replaceAll("[(^\")(\"$)]", ""); }
From source file:xxx.web.comments.debates.impl.ProConOrgParser.java
/** * Extracts the document of the quote//from w w w .ja va 2 s. c o m * * @param textElement text quote element * @return plain string with paragraphs kept */ public static String extractPlainTextFromTextElement(Element textElement) { StringBuilder sb = new StringBuilder(); for (Node childNode : textElement.childNodes()) { if (childNode instanceof Element) { Element childElement = (Element) childNode; String tagName = childElement.tagName(); if ("p".equals(tagName) || "span".equals(tagName)) { sb.append(childElement.text()); sb.append("\n"); } else if ("br".equals(tagName)) { // prevent double newlines sb = new StringBuilder(sb.toString().trim()); sb.append("\n"); } } else if (childNode instanceof TextNode) { TextNode textNode = (TextNode) childNode; sb.append(textNode.text()); } } // remove leading + ending quotes return Utils.normalize(sb.toString()).replaceAll("[(^\")(\"$)]", ""); }
From source file:xxx.web.comments.roomfordebate.NYTimesArticleExtractor.java
public Article extractArticle(String html) throws ParseException, IOException { Article result = new Article(); Document doc = Jsoup.parse(html, getBaseName()); Element element = doc.select("article.rfd").iterator().next(); // System.out.println(element); String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", ""); // time//w w w. j a va 2s .c om try { DateFormat df = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } catch (ParseException e) { // June 24, 2015 DateFormat df = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } // title result.setTitle(Utils.normalize(element.select("h1").text())); // text StringBuilder sb = new StringBuilder(); for (Element p : element.select("div.nytint-post > p")) { sb.append(p.text()); sb.append("\n"); } result.setText(Utils.normalize(sb.toString())); // debate title result.setDebateTitle(Utils.normalize(doc.select("div.nytint-discussion-overview > h2").text())); // debate url result.setDebateUrl(doc.select("div.nytint-discussion-overview > h2 > a").iterator().next().attr("href")); // document url result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content")); // debate description result.setDebateDescription(Utils.normalize(((TextNode) doc.select("div.nytint-discussion-overview > p") .iterator().next().childNodes().iterator().next()).text())); // aurhor result.setAuthor(element.select("div.nytint-mugshots > img").iterator().next().attr("alt")); // topics for (Element a : element.select("p.nytint-tags > a")) { result.getTopics().add(a.attr("href")); } return result; }