List of usage examples for the org.jsoup.nodes.Document method select
public Elements select(String cssQuery)
From source file:org.cellcore.code.engine.page.extractor.mtgf.MTGFPageDataExtractor.java
/**
 * Reads the price shown on the page.
 *
 * <p>Takes the text of the first {@code .price} element inside the first
 * {@code .card-buy} element, passes it through {@code cleanPriceString} and
 * parses the result as a float.
 *
 * @param doc parsed product page
 * @return the parsed price
 */
@Override
protected float getPrice(Document doc) {
    String rawPrice = doc.select(".card-buy").get(0)
            .select(".price").get(0)
            .text();
    String cleaned = cleanPriceString(rawPrice);
    return Float.parseFloat(cleaned);
}
From source file:org.cellcore.code.engine.page.extractor.pkg.PKGPageDataExtractor.java
/**
 * Reads the stock count from the page.
 *
 * <p>The text of the last {@code option} element in the document is parsed as an integer.
 *
 * @param doc parsed product page
 * @return the integer value of the last {@code option} element's text
 */
@Override
protected int getStock(Document doc) {
    return Integer.parseInt(doc.select("option").last().text());
}
From source file:io.seldon.importer.articles.ItemAttributesImporter.java
public static Map<String, String> getAttributes(String url, String existingCategory) { ItemProcessResult itemProcessResult = new ItemProcessResult(); itemProcessResult.client_item_id = url; itemProcessResult.extraction_status = "EXTRACTION_FAILED"; logger.info("Trying to get attributes for " + url); Map<String, String> attributes = null; String title = ""; String category = ""; String subCategory = ""; String img_url = ""; String description = ""; String tags = ""; String leadtext = ""; String link = ""; String publishDate = ""; String domain = ""; try {/*from w w w.j a va 2 s . c o m*/ long now = System.currentTimeMillis(); long timeSinceLastRequest = now - lastUrlFetchTime; if (timeSinceLastRequest < minFetchGapMsecs) { long timeToSleep = minFetchGapMsecs - timeSinceLastRequest; logger.info( "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest); Thread.sleep(timeToSleep); } Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get(); lastUrlFetchTime = System.currentTimeMillis(); //get IMAGE URL if (StringUtils.isNotBlank(imageCssSelector)) { Element imageElement = articleDoc.select(imageCssSelector).first(); if (imageElement != null && imageElement.attr("content") != null) { img_url = imageElement.attr("content"); } if (imageElement != null && StringUtils.isBlank(img_url)) { img_url = imageElement.attr("src"); } if (imageElement != null && StringUtils.isBlank(img_url)) { img_url = imageElement.attr("href"); } } if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) { logger.info("Setting image to default: " + defImageUrl); img_url = defImageUrl; } img_url = StringUtils.strip(img_url); //get TITLE if (StringUtils.isNotBlank(titleCssSelector)) { Element titleElement = articleDoc.select(titleCssSelector).first(); if ((titleElement != null) && (titleElement.attr("content") != null)) { title = titleElement.attr("content"); } // if still blank get from text instead if 
(StringUtils.isBlank(title) && (titleElement != null)) { title = titleElement.text(); } } //get LEAD TEXT if (StringUtils.isNotBlank(leadTextCssSelector)) { Element leadElement = articleDoc.select(leadTextCssSelector).first(); if (leadElement != null && leadElement.attr("content") != null) { leadtext = leadElement.attr("content"); } } //get publish date if (StringUtils.isNotBlank(publishDateCssSelector)) { //2013-01-21T10:40:55Z Element pubElement = articleDoc.select(publishDateCssSelector).first(); if (pubElement != null && pubElement.attr("content") != null) { String pubtext = pubElement.attr("content"); SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date withUTC format " + pubtext); } //try a simpler format df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date " + pubtext); } if (result != null) publishDate = dateFormatter.format(result); else logger.error("Failed to parse date " + pubtext); } } //get Link if (StringUtils.isNotBlank(linkCssSelector)) { Element linkElement = articleDoc.select(linkCssSelector).first(); if (linkElement != null && linkElement.attr("content") != null) { link = linkElement.attr("content"); } } //get CONTENT if (StringUtils.isNotBlank(textCssSelector)) { Element descriptionElement = articleDoc.select(textCssSelector).first(); if (descriptionElement != null) description = Jsoup.parse(descriptionElement.html()).text(); } //get TAGS Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title); if (tagSet.size() > 0) tags = CollectionTools.join(tagSet, ","); //get CATEGORY - client specific if (StringUtils.isNotBlank(categoryCssSelector)) { Element categoryElement = 
articleDoc.select(categoryCssSelector).first(); if (categoryElement != null && categoryElement.attr("content") != null) { category = categoryElement.attr("content"); if (StringUtils.isNotBlank(category)) category = category.toUpperCase(); } } else if (StringUtils.isNotBlank(categoryClassPrefix)) { String className = "io.seldon.importer.articles.category." + categoryClassPrefix + "CategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); category = extractor.getCategory(url, articleDoc); } //get Sub CATEGORY - client specific if (StringUtils.isNotBlank(subCategoryCssSelector)) { Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first(); if (subCategoryElement != null && subCategoryElement.attr("content") != null) { subCategory = subCategoryElement.attr("content"); if (StringUtils.isNotBlank(subCategory)) subCategory = category.toUpperCase(); } } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) { String className = "io.seldon.importer.articles.category." 
+ subCategoryClassPrefix + "SubCategoryExtractor"; Class<?> clazz = Class.forName(className); Constructor<?> ctor = clazz.getConstructor(); CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance(); subCategory = extractor.getCategory(url, articleDoc); } // Get domain if (domainIsNeeded) { domain = getDomain(url); } if ((StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url)) && (categoryNotNeeded || StringUtils.isNotBlank(category)) && (!domainIsNeeded || StringUtils.isNotBlank(domain)))) { attributes = new HashMap<String, String>(); attributes.put(TITLE, title); if (StringUtils.isNotBlank(category)) attributes.put(CATEGORY, category); if (StringUtils.isNotBlank(subCategory)) attributes.put(SUBCATEGORY, subCategory); if (StringUtils.isNotBlank(link)) attributes.put(LINK, link); if (StringUtils.isNotBlank(leadtext)) attributes.put(LEAD_TEXT, leadtext); if (StringUtils.isNotBlank(img_url)) attributes.put(IMG_URL, img_url); if (StringUtils.isNotBlank(tags)) attributes.put(TAGS, tags); attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE); if (StringUtils.isNotBlank(description)) attributes.put(DESCRIPTION, description); if (StringUtils.isNotBlank(publishDate)) attributes.put(PUBLISH_DATE, publishDate); if (StringUtils.isNotBlank(domain)) attributes.put(DOMAIN, domain); System.out.println("Item: " + url + "; Category: " + category + " SubCategory: " + subCategory); itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED"; } else { logger.warn("Failed to get needed attributes for article " + url); logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain=" + domain + "]"); } { // check for failures for the log result if (StringUtils.isBlank(title)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? 
"" : ",") + "title"; } if (!imageNotNeeded && StringUtils.isBlank(img_url)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url"; } if (!categoryNotNeeded && StringUtils.isBlank(category)) { itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "category"; } } } catch (Exception e) { logger.error("Article: " + url + ". Attributes import FAILED", e); itemProcessResult.error = e.toString(); } AttributesImporterUtils.logResult(logger, itemProcessResult); return attributes; }
From source file:org.cellcore.code.engine.page.extractor.mcc.MCCPageDataExtractor.java
/**
 * Reads the card name from the page.
 *
 * <p>Looks at the fourth {@code #blockContent} match (index 3), takes its first
 * {@code <b>} element and returns the {@code "text"} attribute of that element's
 * first child node.
 *
 * @param doc parsed product page
 * @return the {@code "text"} attribute value of the selected node
 */
@Override
protected String getName(Document doc) {
    String cardName = doc.select("#blockContent").get(3)
            .select("b").get(0)
            .childNodes().get(0)
            .attr("text");
    return cardName;
}
From source file:de.geeksfactory.opacclient.apis.IOpac.java
static void parseMediaList(List<LentItem> media, Document doc, JSONObject data) { if (doc.select("a[name=AUS]").size() == 0) return;//from w w w .j a va2s . co m Elements copytrs = doc.select("a[name=AUS] ~ table, a[name=AUS] ~ form table").first().select("tr"); doc.setBaseUri(data.optString("baseurl")); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); int trs = copytrs.size(); if (trs < 2) { return; } assert (trs > 0); JSONObject copymap = new JSONObject(); try { if (data.has("accounttable")) { copymap = data.getJSONObject("accounttable"); } } catch (JSONException e) { } Pattern datePattern = Pattern.compile("\\d{2}\\.\\d{2}\\.\\d{4}"); for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); LentItem item = new LentItem(); if (copymap.optInt("title", 0) >= 0) { item.setTitle(tr.child(copymap.optInt("title", 0)).text().trim().replace("\u00a0", "")); } if (copymap.optInt("author", 1) >= 0) { item.setAuthor(tr.child(copymap.optInt("author", 1)).text().trim().replace("\u00a0", "")); } if (copymap.optInt("format", 2) >= 0) { item.setFormat(tr.child(copymap.optInt("format", 2)).text().trim().replace("\u00a0", "")); } int prolongCount = 0; if (copymap.optInt("prolongcount", 3) >= 0) { prolongCount = Integer .parseInt(tr.child(copymap.optInt("prolongcount", 3)).text().trim().replace("\u00a0", "")); item.setStatus(String.valueOf(prolongCount) + "x verl."); } if (data.optInt("maxprolongcount", -1) != -1) { item.setRenewable(prolongCount < data.optInt("maxprolongcount", -1)); } if (copymap.optInt("returndate", 4) >= 0) { String value = tr.child(copymap.optInt("returndate", 4)).text().trim().replace("\u00a0", ""); Matcher matcher = datePattern.matcher(value); if (matcher.find()) { try { item.setDeadline(fmt.parseLocalDate(matcher.group())); } catch (IllegalArgumentException e1) { e1.printStackTrace(); } } } if (copymap.optInt("prolongurl", 5) >= 0) { if (tr.children().size() > copymap.optInt("prolongurl", 5)) { Element cell = 
tr.child(copymap.optInt("prolongurl", 5)); if (cell.select("input[name=MedNrVerlAll]").size() > 0) { // new iOPAC Version 1.45 - checkboxes to prolong multiple items // internal convention: We add "NEW" to the media ID to show that we have // the new iOPAC version Element input = cell.select("input[name=MedNrVerlAll]").first(); String value = input.val(); item.setProlongData("NEW" + value); item.setId(value.split(";")[0]); if (input.hasAttr("disabled")) item.setRenewable(false); } else { // previous versions - link for prolonging on every medium String link = cell.select("a").attr("href"); item.setProlongData(link); // find media number with regex Pattern pattern = Pattern.compile("mednr=([^&]*)&"); Matcher matcher = pattern.matcher(link); if (matcher.find() && matcher.group() != null) item.setId(matcher.group(1)); } } } media.add(item); } assert (media.size() == trs - 1); }
From source file:org.cellcore.code.engine.page.extractor.mb.MBPageDataExtractor.java
/**
 * Reads the alternative card name from the page.
 *
 * <p>Uses the second {@code .text} match (index 1) and returns the trimmed
 * {@code "text"} attribute of its first child node as a one-element array.
 *
 * @param doc parsed product page
 * @return a single-element array holding the trimmed value
 */
protected String[] getOtherNames(Document doc) {
    String rawName = doc.select(".text").get(1)
            .childNodes().get(0)
            .attr("text");
    String[] names = new String[1];
    names[0] = rawName.trim();
    return names;
}
From source file:org.cellcore.code.engine.page.extractor.mfrag.MFRAGPageDataExtractor.java
@Override protected int getStock(Document doc) { Elements trs = doc.select("#Tableau").get(0).children().get(0).children(); float iPrice = Float.MAX_VALUE; int iStock = 0; for (int i = 1; i < trs.size(); i++) { Element tr = trs.get(i);//from w ww. jav a 2 s. c om String val = tr.select("td").get(3).select("strong").get(0).childNodes().get(0).attr("text"); String stockV = tr.select("td").get(4).select("option").last().childNodes().get(0).attr("text"); val = cleanPriceString(val); float price = Float.parseFloat(val); if (price < iPrice) { iPrice = price; iStock = Integer.parseInt(stockV.replaceAll("\\(", "").replaceAll("\\)", "")); } } return iStock; }
From source file:ddf.catalog.transformer.html.HtmlMetacardTransformerTest.java
/**
 * Transforms an empty metacard with an empty category list and checks that the
 * resulting HTML contains exactly one element matching {@code METACARD_CLASS}.
 */
@Test
public void testMetacardTransform() throws CatalogTransformerException, IOException {
    Metacard emptyMetacard = new MetacardImpl();
    HtmlMetacardTransformer transformer = new HtmlMetacardTransformer(EMPTY_CATEGORY_LIST);

    BinaryContent rendered = transformer.transform(emptyMetacard, Collections.emptyMap());

    assertThat(getHtmlDocument(rendered).select(METACARD_CLASS), hasSize(1));
}
From source file:org.cellcore.code.engine.page.extractor.mtgf.MTGFPageDataExtractor.java
@Override protected String getName(Document doc) throws UnsupportedCardException { String name = doc.select(".name").get(0).select("h1").text(); return name;//from w ww .j a v a 2s. com }
From source file:org.cellcore.code.engine.page.extractor.mcc.MCCPageDataExtractor.java
/**
 * Reads the alternative card name from the page.
 *
 * <p>Looks at the fifth {@code #blockContent} match (index 4), takes its first
 * {@code <b>} element and wraps the {@code "text"} attribute of that element's
 * first child node in a one-element array.
 *
 * @param doc parsed product page
 * @return a single-element array holding the value
 */
@Override
protected String[] getOtherNames(Document doc) {
    String[] names = new String[1];
    names[0] = doc.select("#blockContent").get(4)
            .select("b").get(0)
            .childNodes().get(0)
            .attr("text");
    return names;
}