List of usage examples for org.jsoup.nodes Element text
public String text()
From source file:com.jimplush.goose.ContentExtractor.java
/** * checks the density of links within a node, is there not much text and most of it contains linky shit? * if so it's no good// ww w .j a v a 2s . c o m * * @param e * @return */ private static boolean isHighLinkDensity(Element e) { Elements links = e.getElementsByTag("a"); if (links.size() == 0) { return false; } String text = e.text().trim(); String[] words = SPACE_SPLITTER.split(text); float numberOfWords = words.length; // let's loop through all the links and calculate the number of words that make up the links StringBuilder sb = new StringBuilder(); for (Element link : links) { sb.append(link.text()); } String linkText = sb.toString(); String[] linkWords = SPACE_SPLITTER.split(linkText); float numberOfLinkWords = linkWords.length; float numberOfLinks = links.size(); float linkDivisor = numberOfLinkWords / numberOfWords; float score = linkDivisor * numberOfLinks; if (logger.isDebugEnabled()) { String logText; if (e.text().length() >= 51) { logText = e.text().substring(0, 50); } else { logText = e.text(); } logger.debug("Calulated link density score as: " + score + " for node: " + logText); } if (score > 1) { return true; } return false; }
From source file:org.ala.lucene.CreateWordPressIndex.java
/** * Read the Google sitemap file on WP site and load up a list * of page URL./*from w w w .j av a 2 s .c o m*/ * * @throws IOException */ protected void loadSitemap() throws IOException { Document doc = Jsoup.connect(WP_SITEMAP_URI).get(); Elements pages = doc.select("loc"); logger.info("Sitemap file lists " + pages.size() + " pages."); for (Element page : pages) { // add it to list of page urls Field this.pageUrls.add(page.text()); } }
From source file:crawler.AScraper.java
/**
 * Filter between {@code channel2} and {@code channel3}: accepts an element
 * only when its text matches {@code patter}.
 *
 * @param payload the element whose text is tested
 * @return {@code true} when the pattern is found in the element's text,
 *         {@code false} otherwise
 */
@Filter(inputChannel = "channel2", outputChannel = "channel3")
public boolean filter(Element payload) {
    String anchorText = payload.text();
    if (patter.matcher(anchorText).find()) {
        return true;
    }
    // Texts starting with "Beautyleg" are rejected silently; any other mismatch is logged.
    if (!anchorText.startsWith("Beautyleg")) {
        LOG.error(String.format("Anchor text dose not match pattern:[%s]", anchorText));
    }
    return false;
}
From source file:io.seldon.importer.articles.ItemAttributesImporter.java
public static Map<String, String> getAttributes(String url, String existingCategory) { ItemProcessResult itemProcessResult = new ItemProcessResult(); itemProcessResult.client_item_id = url; itemProcessResult.extraction_status = "EXTRACTION_FAILED"; logger.info("Trying to get attributes for " + url); Map<String, String> attributes = null; String title = ""; String category = ""; String subCategory = ""; String img_url = ""; String description = ""; String tags = ""; String leadtext = ""; String link = ""; String publishDate = ""; String domain = ""; try {/* w w w .ja v a 2 s .c om*/ long now = System.currentTimeMillis(); long timeSinceLastRequest = now - lastUrlFetchTime; if (timeSinceLastRequest < minFetchGapMsecs) { long timeToSleep = minFetchGapMsecs - timeSinceLastRequest; logger.info( "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest); Thread.sleep(timeToSleep); } Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get(); lastUrlFetchTime = System.currentTimeMillis(); //get IMAGE URL if (StringUtils.isNotBlank(imageCssSelector)) { Element imageElement = articleDoc.select(imageCssSelector).first(); if (imageElement != null && imageElement.attr("content") != null) { img_url = imageElement.attr("content"); } if (imageElement != null && StringUtils.isBlank(img_url)) { img_url = imageElement.attr("src"); } if (imageElement != null && StringUtils.isBlank(img_url)) { img_url = imageElement.attr("href"); } } if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) { logger.info("Setting image to default: " + defImageUrl); img_url = defImageUrl; } img_url = StringUtils.strip(img_url); //get TITLE if (StringUtils.isNotBlank(titleCssSelector)) { Element titleElement = articleDoc.select(titleCssSelector).first(); if ((titleElement != null) && (titleElement.attr("content") != null)) { title = titleElement.attr("content"); } // if still blank get from text instead if 
(StringUtils.isBlank(title) && (titleElement != null)) { title = titleElement.text(); } } //get LEAD TEXT if (StringUtils.isNotBlank(leadTextCssSelector)) { Element leadElement = articleDoc.select(leadTextCssSelector).first(); if (leadElement != null && leadElement.attr("content") != null) { leadtext = leadElement.attr("content"); } } //get publish date if (StringUtils.isNotBlank(publishDateCssSelector)) { //2013-01-21T10:40:55Z Element pubElement = articleDoc.select(publishDateCssSelector).first(); if (pubElement != null && pubElement.attr("content") != null) { String pubtext = pubElement.attr("content"); SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date withUTC format " + pubtext); } //try a simpler format df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date " + pubtext); } if (result != null) publishDate = dateFormatter.format(result); else logger.error("Failed to parse date " + pubtext); } } //get Link if (StringUtils.isNotBlank(linkCssSelector)) { Element linkElement = articleDoc.select(linkCssSelector).first(); if (linkElement != null && linkElement.attr("content") != null) { link = linkElement.attr("content"); } } //get CONTENT if (StringUtils.isNotBlank(textCssSelector)) { Element descriptionElement = articleDoc.select(textCssSelector).first(); if (descriptionElement != null) description = Jsoup.parse(descriptionElement.html()).text(); } //get TAGS Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title); if (tagSet.size() > 0) tags = CollectionTools.join(tagSet, ","); //get CATEGORY - client specific if (StringUtils.isNotBlank(categoryCssSelector)) { Element categoryElement = 
articleDoc.select(categoryCssSelector).first(); if (categoryElement != null && categoryElement.attr("content") != null) { category = categoryElement.attr("content"); if (StringUtils.isNotBlank(category)) category = category.toUpperCase(); } } else if (StringUtils.isNotBlank(categoryClassPrefix)) { String className = "io.seldon.importer.articles.category." + categoryClassPrefix + "CategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); category = extractor.getCategory(url, articleDoc); } //get Sub CATEGORY - client specific if (StringUtils.isNotBlank(subCategoryCssSelector)) { Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first(); if (subCategoryElement != null && subCategoryElement.attr("content") != null) { subCategory = subCategoryElement.attr("content"); if (StringUtils.isNotBlank(subCategory)) subCategory = category.toUpperCase(); } } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) { String className = "io.seldon.importer.articles.category." 
+ subCategoryClassPrefix + "SubCategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); subCategory = extractor.getCategory(url, articleDoc); } // Get domain if (domainIsNeeded) { domain = getDomain(url); } if ((StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url)) && (categoryNotNeeded || StringUtils.isNotBlank(category)) && (!domainIsNeeded || StringUtils.isNotBlank(domain)))) { attributes = new HashMap<String, String>(); attributes.put(TITLE, title); if (StringUtils.isNotBlank(category)) attributes.put(CATEGORY, category); if (StringUtils.isNotBlank(subCategory)) attributes.put(SUBCATEGORY, subCategory); if (StringUtils.isNotBlank(link)) attributes.put(LINK, link); if (StringUtils.isNotBlank(leadtext)) attributes.put(LEAD_TEXT, leadtext); if (StringUtils.isNotBlank(img_url)) attributes.put(IMG_URL, img_url); if (StringUtils.isNotBlank(tags)) attributes.put(TAGS, tags); attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE); if (StringUtils.isNotBlank(description)) attributes.put(DESCRIPTION, description); if (StringUtils.isNotBlank(publishDate)) attributes.put(PUBLISH_DATE, publishDate); if (StringUtils.isNotBlank(domain)) attributes.put(DOMAIN, domain); System.out.println("Item: " + url + "; Category: " + category + " SubCategory: " + subCategory); itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED"; } else { logger.warn("Failed to get needed attributes for article " + url); logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain=" + domain + "]"); } { // check for failures for the log result if (StringUtils.isBlank(title)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? 
"" : ",") + "title"; } if (!imageNotNeeded && StringUtils.isBlank(img_url)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url"; } if (!categoryNotNeeded && StringUtils.isBlank(category)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "category"; } } } catch (Exception e) { logger.error("Article: " + url + ". Attributes import FAILED", e); itemProcessResult.error = e.toString(); } AttributesImporterUtils.logResult(logger, itemProcessResult); return attributes; }
From source file:de.geeksfactory.opacclient.apis.TouchPoint.java
/**
 * Builds the list of reserved items from the rows matched by {@code .data tr}.
 * The first matched row is skipped.
 *
 * @param doc the parsed page containing the reservation table
 * @return the reserved items, or {@code null} when at most one row is matched
 *         or when a row contains "keine Daten" or has exactly one child
 */
static List<ReservedItem> parse_reslist(Document doc) {
    List<ReservedItem> reservations = new ArrayList<>();
    Elements rows = doc.select(".data tr");
    int rowCount = rows.size();
    if (rowCount <= 1) {
        return null;
    }

    for (Element row : rows.subList(1, rowCount)) {
        ReservedItem item = new ReservedItem();
        boolean noData = row.text().contains("keine Daten");
        if (noData || row.children().size() == 1) {
            return null;
        }

        item.setTitle(row.child(2).select("b, strong").text().trim());
        try {
            // Cells hold several values separated by <br> tags.
            String[] titleCellParts = row.child(2).html().split("<br[ /]*>");
            String[] statusCellParts = row.child(3).html().split("<br[ /]*>");
            if (titleCellParts.length > 1) {
                item.setAuthor(titleCellParts[1].trim());
            }
            if (statusCellParts.length > 2) {
                item.setBranch(statusCellParts[2].trim());
                item.setStatus(statusCellParts[0].trim() + " (" + statusCellParts[1].trim() + ")");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        reservations.add(item);
    }
    return reservations;
}
From source file:fr.arlefebvre.pronostics.controller.UEFATeamsController.java
@RequestMapping("/uefa/teams") public List<Team> teams() { if (pseudoCache != null && !pseudoCache.isEmpty()) return pseudoCache; ArrayList<Team> result = new ArrayList<Team>(); String uri = "http://fr.fifa.com/fifa-world-ranking/ranking-table/men/uefa.html"; //On se connecte au site et on charge le document html Document doc;//from ww w. ja v a 2s . c o m try { doc = Jsoup.connect(uri).get(); Elements elements = doc.getElementsByClass("table"); for (Element element : elements) { Element tbody = element.getElementsByTag("tbody").first(); for (Element child : tbody.children()) { Element teamNameElement = child.getElementsByClass("tbl-teamname").first(); String name = teamNameElement.text(); String countryCode = child.getElementsByClass("tbl-countrycode").first().text(); String imgUrl = teamNameElement.select("img").first().absUrl("src"); Team team = new Team(); team.setName(name); team.setCountryCode(countryCode); team.setImgUrl(imgUrl); team.setNationalTeam(true); result.add(team); } } //String titre = element.text(); } catch (IOException e) { e.printStackTrace(); } // RestTemplate restTemplate = new RestTemplate(); // ResponseEntity<ChampionListDto> response = restTemplate.getForEntity( // uri, // ChampionListDto.class); // // List<ChampionDto> champions = response.getBody().getChampions(); // return champions.stream().map(c -> getChampionById(c.getId()).getName()).collect(Collectors.toList()); result.sort((t1, t2) -> t1.getName().compareTo(t2.getName())); if (pseudoCache == null) pseudoCache = result; return result; }
From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java
public Article extractArticle(String html) throws ParseException, IOException { Article result = new Article(); Document doc = Jsoup.parse(html, getBaseName()); Element element;// w w w . ja va 2 s .c o m try { element = doc.select("article.rfd").iterator().next(); } catch (NoSuchElementException exception) { throw new IOException("Cannot find article.rfd element"); } // System.out.println(element); String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", ""); // time try { DateFormat df = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } catch (ParseException e) { // June 24, 2015 DateFormat df = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } // title result.setTitle(TextCleaningUtils.normalizeWithParagraphs(element.select("h1").text())); // text StringBuilder sb = new StringBuilder(); for (Element p : element.select("div.nytint-post > p")) { sb.append(p.text()); sb.append("\n"); } result.setText(TextCleaningUtils.normalizeWithParagraphs(sb.toString())); // debate title result.setDebateTitle(TextCleaningUtils .normalizeWithParagraphs(doc.select("div.nytint-discussion-overview > h2").text())); // debate url result.setDebateUrl(doc.select("div.nytint-discussion-overview > h2 > a").iterator().next().attr("href")); // document url result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content")); // debate description result.setDebateDescription(TextCleaningUtils .normalizeWithParagraphs(((TextNode) doc.select("div.nytint-discussion-overview > p").iterator() .next().childNodes().iterator().next()).text())); // aurhor result.setAuthor(element.select("div.nytint-mugshots > img").iterator().next().attr("alt")); // topics for (Element a : element.select("p.nytint-tags > a")) { result.getTopics().add(a.attr("href")); } return result; }
From source file:org.talend.license.LicenseRetriver.java
public Collection<File> updateLicense(final String version, final File file) { logger.info("start to update {} license ", version); String url = String.format(Configer.getBuildURL() + Configer.getLicenseURL(), version); Document doc = connector.getPage(url); if (null == doc) { logger.error("no {} license page url:{}", version, url); return null; }//from w w w.j av a 2 s . co m String regex = String.format(Configer.getLicenseItem(), version); Elements eles = doc.getElementsMatchingOwnText(regex); if (eles.isEmpty()) { logger.error("no {} license page url:{}", version, url); return null; } final Pattern pattern = Pattern.compile(regex); SortedSet<String> set = new TreeSet<String>(new Comparator<String>() { public int compare(String o1, String o2) { String m1; String m2; Matcher matcher = pattern.matcher(o1); if (matcher.find()) { m1 = matcher.group(2); } else { return 1; } matcher = pattern.matcher(o2); if (matcher.find()) { m2 = matcher.group(2); } else { return -1; } return m2.compareTo(m1); } }); logger.info("there are {} license build", eles.size()); for (Element ele : eles) { String text = ele.text(); set.add(text); } if (set.isEmpty()) { return null; } Iterator<String> ite = set.iterator(); while (ite.hasNext()) { String target = ite.next(); url = url + target; logger.info("retrive from newest build {}", url); Collection<File> fs = checkout(version, file, url); if (!fs.isEmpty()) { return fs; } logger.info("no available license in build"); } logger.error("retrive license failed"); return null; }
From source file:de.geeksfactory.opacclient.apis.TouchPoint.java
static List<LentItem> parse_medialist(Document doc) { List<LentItem> media = new ArrayList<>(); Elements copytrs = doc.select(".data tr"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); int trs = copytrs.size(); if (trs == 1) { return null; }/*ww w .java 2 s .c o m*/ assert (trs > 0); for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); LentItem item = new LentItem(); if (tr.text().contains("keine Daten")) { return null; } item.setTitle(tr.select(".account-display-title").select("b, strong").text().trim()); try { item.setAuthor(tr.select(".account-display-title").html().split("<br[ /]*>")[1].trim()); String[] col3split = tr.select(".account-display-state").html().split("<br[ /]*>"); String deadline = Jsoup.parse(col3split[0].trim()).text().trim(); if (deadline.contains(":")) { // BSB Munich: <span class="hidden-sm hidden-md hidden-lg">Flligkeitsdatum : </span>26.02.2016<br> deadline = deadline.split(":")[1].trim(); } if (deadline.contains("-")) { // Chemnitz: 22.07.2015 - 20.10.2015<br> deadline = deadline.split("-")[1].trim(); } try { item.setDeadline(fmt.parseLocalDate(deadline).toString()); } catch (IllegalArgumentException e1) { e1.printStackTrace(); } if (col3split.length > 1) item.setHomeBranch(col3split[1].trim()); if (tr.select("a").size() > 0) { for (Element link : tr.select("a")) { String href = link.attr("abs:href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (hrefq.get("methodToCall").equals("renewal")) { item.setProlongData(href.split("\\?")[1]); item.setRenewable(true); break; } } } } catch (Exception ex) { ex.printStackTrace(); } media.add(item); } return media; }