Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes Document select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:dk.dma.msiproxy.common.provider.AbstractProviderService.java

/**
 * If the given element attribute references a message repo folder, add the message ID to the ids list.
 * @param ids the message ID list/*from  w  w w  . j  a v  a2 s  .  c  o  m*/
 * @param doc the HTML document
 * @param tag the HTML tag to process
 * @param attr the attribute of the HTML tag to process
 */
private void computeReferencedMessageIds(Set<Integer> ids, Document doc, String tag, String attr) {
    doc.select(String.format("%s[%s]", tag, attr)).stream().filter(e -> e.attr(tag) != null).forEach(e -> {
        Matcher m = MESSAGE_REPO_FILE_PATTERN.matcher(e.attr(attr));
        if (m.matches()) {
            ids.add(Integer.valueOf(m.group("id")));
        }
    });
}

From source file:io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDateDynamicExtractor.java

/**
 * Extracts the text of the first element matching the CSS selector given as the first extractor
 * argument and, when that text is a recognised date, re-formats it as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * <p>Two input formats are tried in order: {@code dd/MM/yyyy HH:mm} and {@code yyyy-MM-dd HH:mm:ss}.
 * If neither matches, an error is logged and the raw element text is returned unchanged.
 *
 * @param attributeDetail carries the extractor arguments; the first argument is the CSS selector
 * @param url not used by this extractor
 * @param articleDoc the parsed article document to query
 * @return the formatted date, the raw element text when it cannot be parsed, or {@code null} when
 *         there is no usable selector or no matching element
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Check the selector BEFORE querying: jsoup rejects an empty query with an exception,
        // so the blank check is useless if it runs after select().
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                attrib_value = element.text();
            }
        }
    }

    if (attrib_value != null) {
        String pubtext = attrib_value;
        Date result = null;
        try {
            // MM = month and HH = 24-hour clock; the previous "dd/mm/yyyy hh:mm" parsed the month
            // position as minutes and read the hour on a 12-hour clock without an AM/PM marker.
            result = new SimpleDateFormat("dd/MM/yyyy HH:mm", Locale.ENGLISH).parse(pubtext);
        } catch (ParseException e) {
            logger.info("Failed to parse date withUTC format " + pubtext);
        }

        if (result == null) {
            // Fall back to the simpler format only when the first one did not match, so a
            // successful first parse no longer produces a spurious failure log entry.
            try {
                result = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH).parse(pubtext);
            } catch (ParseException e) {
                logger.info("Failed to parse date " + pubtext);
            }
        }

        if (result != null) {
            attrib_value = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(result);
        } else {
            logger.error("Failed to parse date " + pubtext);
        }
    }

    return attrib_value;
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Stores the video's duration in minutes, read from the text of the first element matching
 * {@code div#video_length span.text}. Does nothing when no element matches.
 *
 * @param doc the parsed video page
 * @param video the video to update
 */
private void setVideoDuration(Document doc, Video video) {
    Elements lengthSpans = doc.select("div#video_length span.text");
    if (!CollectionUtils.isNotEmpty(lengthSpans)) {
        return;
    }

    String minutesText = lengthSpans.first().text();
    video.setDurationMinutes(Integer.valueOf(minutesText));
}

From source file:ru.org.linux.user.AddPhotoWebTest.java

@Test
/**/*from w w w .  j  a va2 s.c o  m*/
 * ?  
 */
public void testInvalid2Image() throws IOException {
    String auth = WebHelper.doLogin(resource, "JB", "passwd");
    ClientResponse cr = WebHelper.addPhoto(resource, "src/main/webapp/img/tux.png", auth);
    assertEquals(HttpStatus.SC_BAD_REQUEST, cr.getStatus());
    Document doc = Jsoup.parse(cr.getEntityInputStream(), "UTF-8", resource.getURI().toString());
    assertEquals(
            "!   ?: ?  ",
            doc.select(".error").text()); // ?  
}

From source file:io.seldon.importer.articles.dynamicextractors.CategoryFromKeywordsDynamicExtractor.java

/**
 * Picks a category from a comma-separated keyword list stored in an attribute of the first
 * element matching a CSS selector.
 *
 * <p>Extractor arguments: (0) the CSS selector, (1) the name of the attribute holding the
 * comma-separated keywords, (2) a comma-separated list of accepted categories. Keywords are
 * trimmed and lower-cased before comparison; categories are compared exactly as configured.
 *
 * @param attributeDetail carries the three extractor arguments described above
 * @param url not used by this extractor
 * @param articleDoc the parsed article document to query
 * @return the first keyword equal to a configured category, trying categories in their
 *         configured order, or {@code null} when nothing matches or the arguments are unusable
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;

    String[] tags = null;
    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 3)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Check the selector BEFORE querying: jsoup rejects an empty query with an exception,
        // so the blank check is useless if it runs after select().
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            String value_name = attributeDetail.extractor_args.get(1);
            // attr() yields an empty string for a missing attribute, so only the element itself
            // needs a null check; the blank test below covers the missing-attribute case.
            if (element != null) {
                String rawList = element.attr(value_name);
                if (StringUtils.isNotBlank(rawList)) {
                    tags = rawList.split(",");
                    for (int i = 0; i < tags.length; i++) {
                        tags[i] = tags[i].trim().toLowerCase();
                    }
                    attrib_value = StringUtils.join(tags, ',');
                }
            }
        }
    }

    // A non-blank joined value implies tags was populated above.
    if (StringUtils.isNotBlank(attrib_value)) {
        String[] categories = attributeDetail.extractor_args.get(2).split(",");
        for (String category : categories) {
            for (String tag : tags) {
                if (category.equals(tag)) {
                    return tag;
                }
            }
        }
    }

    return null;
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Stores the video's release date, parsed with the pattern {@code yyyy-MM-dd} from the text of
 * the first element matching {@code div#video_date td.text}. Does nothing when no element
 * matches or when the text cannot be parsed.
 *
 * @param doc the parsed video page
 * @param video the video to update
 */
private void setVideoReleaseDate(Document doc, Video video) {
    Elements dateCells = doc.select("div#video_date td.text");
    if (!CollectionUtils.isNotEmpty(dateCells)) {
        return;
    }

    String rawDate = dateCells.first().text();
    SimpleDateFormat isoDay = new SimpleDateFormat("yyyy-MM-dd");
    try {
        video.setReleaseDate(isoDay.parse(rawDate));
    } catch (ParseException ignored) {
        // Best effort: an unparseable date leaves the video untouched.
    }
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Stores the video's score, read from the text of the first element matching
 * {@code div#video_review td.text span.score} with any parentheses removed. Does nothing when
 * no element matches, when the remaining text is blank, or when converting or storing the
 * value throws.
 *
 * @param doc the parsed video page
 * @param video the video to update
 */
private void setVideoScore(Document doc, Video video) {
    Elements scoreSpans = doc.select("div#video_review td.text span.score");
    if (!CollectionUtils.isNotEmpty(scoreSpans)) {
        return;
    }

    String rawScore = scoreSpans.first().text();
    String bareScore = StringUtils.replace(StringUtils.replace(rawScore, "(", ""), ")", "");
    if (StringUtils.isNotBlank(bareScore)) {
        try {
            video.setScore(Float.valueOf(bareScore));
        } catch (Exception ignored) {
            // Best effort: any failure here leaves the score as it was.
        }
    }
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Sets the video's single-female flag from the number of elements matching
 * {@code div#video_cast span.star}: {@code true} when there is at most one, {@code false}
 * otherwise. Does nothing when no element matches.
 *
 * @param doc the parsed video page
 * @param video the video to update
 */
private void setVideoSingleFemaleFlag(Document doc, Video video) {
    Elements castStars = doc.select("div#video_cast span.star");
    if (!CollectionUtils.isNotEmpty(castStars)) {
        return;
    }

    boolean atMostOne = castStars.size() <= 1;
    video.setSingleFemaleFlag(atMostOne);
}

From source file:org.cellcore.code.engine.page.extractor.pkg.PKGPageDataExtractor.java

/**
 * Reads the price from the page by position: starting at the first element whose {@code src}
 * begins with {@code ./docs/illustrations/}, it walks two parents up, takes the sixth child
 * node, and reads the bold text inside the first {@code font} of its second {@code tbody}.
 * When that text is empty, the first bold element of the third {@code form} is used instead.
 * The text is passed through {@code cleanPriceString} and parsed as a float.
 *
 * @param doc the parsed product page
 * @return the parsed price
 */
@Override
protected float getPrice(Document doc) {
    Element illustration = doc.getElementsByAttributeValueStarting("src", "./docs/illustrations/").get(0);
    Element detailsNode = (Element) illustration.parent().parent().childNodes().get(5);
    Element priceFont = detailsNode.select("tbody").get(1).select("font").get(0);

    String price = priceFont.select("b").text();
    if (price == null || price.equals("")) {
        // Fallback location for pages where the primary spot holds no bold price text.
        Element thirdForm = doc.select("form").get(2);
        price = thirdForm.select("b").get(0).text();
    }

    return Float.parseFloat(cleanPriceString(price));
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Copies director, producer and distributor onto the video, each taken from the text of the
 * first element matching {@code div#video_director td.text}, {@code div#video_maker td.text}
 * and {@code div#video_label td.text} respectively. A field is left untouched when its selector
 * matches nothing.
 *
 * @param doc the parsed video page
 * @param video the video to update
 */
private void setVideoPeople(Document doc, Video video) {
    Elements directorCells = doc.select("div#video_director td.text");
    if (CollectionUtils.isNotEmpty(directorCells)) {
        video.setDirector(directorCells.first().text());
    }

    Elements makerCells = doc.select("div#video_maker td.text");
    if (CollectionUtils.isNotEmpty(makerCells)) {
        video.setProducer(makerCells.first().text());
    }

    Elements labelCells = doc.select("div#video_label td.text");
    if (CollectionUtils.isNotEmpty(labelCells)) {
        video.setDistributor(labelCells.first().text());
    }
}