List of usage examples for org.jsoup.nodes Document select
public Elements select(String cssQuery)
From source file:dk.dma.msiproxy.common.provider.AbstractProviderService.java
/** * If the given element attribute references a message repo folder, add the message ID to the ids list. * @param ids the message ID list/*from w w w . j a v a2 s . c o m*/ * @param doc the HTML document * @param tag the HTML tag to process * @param attr the attribute of the HTML tag to process */ private void computeReferencedMessageIds(Set<Integer> ids, Document doc, String tag, String attr) { doc.select(String.format("%s[%s]", tag, attr)).stream().filter(e -> e.attr(tag) != null).forEach(e -> { Matcher m = MESSAGE_REPO_FILE_PATTERN.matcher(e.attr(attr)); if (m.matches()) { ids.add(Integer.valueOf(m.group("id"))); } }); }
From source file:io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDateDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;// w w w. j av a 2 s . c o m if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) { String cssSelector = attributeDetail.extractor_args.get(0); Element element = articleDoc.select(cssSelector).first(); if (StringUtils.isNotBlank(cssSelector)) { if (element != null) { attrib_value = element.text(); } } } if (attrib_value != null) { String pubtext = attrib_value; SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat("dd/mm/yyyy hh:mm", Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date withUTC format " + pubtext); } // try a simpler format df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date " + pubtext); } if (result != null) { attrib_value = dateFormatter.format(result); } else { logger.error("Failed to parse date " + pubtext); } } return attrib_value; }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
/**
 * Reads the text of the first {@code div#video_length span.text} element, converts it to an
 * {@link Integer} and passes it to {@code video.setDurationMinutes}. Does nothing when the
 * selector matches no element.
 *
 * @param doc   the parsed page
 * @param video the video to update
 */
private void setVideoDuration(Document doc, Video video) {
    Elements lengthNodes = doc.select("div#video_length span.text");
    if (!CollectionUtils.isNotEmpty(lengthNodes)) {
        return;
    }

    String minutesText = lengthNodes.first().text();
    Integer minutes = Integer.valueOf(minutesText);
    video.setDurationMinutes(minutes);
}
From source file:ru.org.linux.user.AddPhotoWebTest.java
@Test /**/*from w w w . j a va2 s.c o m*/ * ? */ public void testInvalid2Image() throws IOException { String auth = WebHelper.doLogin(resource, "JB", "passwd"); ClientResponse cr = WebHelper.addPhoto(resource, "src/main/webapp/img/tux.png", auth); assertEquals(HttpStatus.SC_BAD_REQUEST, cr.getStatus()); Document doc = Jsoup.parse(cr.getEntityInputStream(), "UTF-8", resource.getURI().toString()); assertEquals( "! ?: ? ", doc.select(".error").text()); // ? }
From source file:io.seldon.importer.articles.dynamicextractors.CategoryFromKeywordsDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;//from w ww. j a v a 2s. c om String[] tags = null; if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 3)) { String cssSelector = attributeDetail.extractor_args.get(0); Element element = articleDoc.select(cssSelector).first(); if (StringUtils.isNotBlank(cssSelector)) { String value_name = attributeDetail.extractor_args.get(1); if (element != null && element.attr(value_name) != null) { String rawList = element.attr(value_name); if (StringUtils.isNotBlank(rawList)) { tags = rawList.split(","); for (int i = 0; i < tags.length; i++) { tags[i] = tags[i].trim().toLowerCase(); } attrib_value = StringUtils.join(tags, ','); } } } } if (StringUtils.isNotBlank(attrib_value)) { String[] categories = attributeDetail.extractor_args.get(2).split(","); for (String category : categories) { for (String tag : tags) if (category.equals(tag)) return tag; } } return null; }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
/**
 * Reads the text of the first {@code div#video_date td.text} element, parses it with the
 * {@code yyyy-MM-dd} pattern and passes the result to {@code video.setReleaseDate}. Does nothing
 * when the selector matches no element or the text cannot be parsed.
 *
 * @param doc   the parsed page
 * @param video the video to update
 */
private void setVideoReleaseDate(Document doc, Video video) {
    Elements dateCells = doc.select("div#video_date td.text");
    if (!CollectionUtils.isNotEmpty(dateCells)) {
        return;
    }

    String rawDate = dateCells.first().text();
    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    try {
        Date parsed = dayFormat.parse(rawDate);
        video.setReleaseDate(parsed);
    } catch (ParseException ignored) {
        // Unparseable text: the release date is simply left untouched.
    }
}
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
private void setVideoScore(Document doc, Video video) { Elements sElements = doc.select("div#video_review td.text span.score"); if (CollectionUtils.isNotEmpty(sElements)) { String score = sElements.first().text().toString(); score = StringUtils.replace(score, "(", ""); score = StringUtils.replace(score, ")", ""); if (StringUtils.isNotBlank(score)) { try { video.setScore(Float.valueOf(score)); } catch (Exception e) { }//from w ww. ja v a2 s. c om } } }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
/**
 * Counts the {@code div#video_cast span.star} elements and sets the single-female flag on the
 * video: {@code true} when there is at most one, {@code false} otherwise. Does nothing when the
 * selector matches no element.
 *
 * @param doc   the parsed page
 * @param video the video to update
 */
private void setVideoSingleFemaleFlag(Document doc, Video video) {
    Elements castNodes = doc.select("div#video_cast span.star");
    if (!CollectionUtils.isNotEmpty(castNodes)) {
        return;
    }

    boolean single = castNodes.size() <= 1;
    video.setSingleFemaleFlag(single);
}
From source file:org.cellcore.code.engine.page.extractor.pkg.PKGPageDataExtractor.java
@Override protected float getPrice(Document doc) { String price = ((Element) doc.getElementsByAttributeValueStarting("src", "./docs/illustrations/").get(0) .parent().parent().childNodes().get(5)).select("tbody").get(1).select("font").get(0).select("b") .text();//from w w w .ja v a2 s .c om if (price == null || price.equals("")) { price = doc.select("form").get(2).select("b").get(0).text(); } price = cleanPriceString(price); return Float.parseFloat(price); }
From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java
/**
 * Copies director, producer and distributor from the page onto the video.
 * <p>
 * Each value is the text of the first element matching, respectively,
 * {@code div#video_director td.text}, {@code div#video_maker td.text} and
 * {@code div#video_label td.text}. A field is left untouched when its selector matches nothing.
 *
 * @param doc   the parsed page
 * @param video the video to update
 */
private void setVideoPeople(Document doc, Video video) {
    Elements directorCells = doc.select("div#video_director td.text");
    if (CollectionUtils.isNotEmpty(directorCells)) {
        video.setDirector(directorCells.first().text());
    }

    Elements makerCells = doc.select("div#video_maker td.text");
    if (CollectionUtils.isNotEmpty(makerCells)) {
        video.setProducer(makerCells.first().text());
    }

    Elements labelCells = doc.select("div#video_label td.text");
    if (CollectionUtils.isNotEmpty(labelCells)) {
        video.setDistributor(labelCells.first().text());
    }
}