Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:de.geeksfactory.opacclient.apis.Zones.java

/**
 * Returns the absolute URL of the "next page" link of a result page.
 *
 * <p>The link is found through the CSS class {@code pageNavLink} and a {@code title}
 * attribute containing the German word for "next" (spelled with an a-umlaut). The umlaut is
 * written as a Unicode escape so the selector does not depend on the encoding of this source
 * file; without the umlaut the substring cannot occur in the word it is meant to match.
 *
 * @param doc the parsed result page
 * @return the value of {@code absUrl("href")} of the first matching link, or {@code null}
 *         when the page contains no such link
 */
static String findNextPageUrl(Document doc) {
    // first() yields null for an empty result, so one query covers both cases.
    Element link = doc.select(".pageNavLink[title*=n\u00e4chsten]").first();
    return link == null ? null : link.absUrl("href");
}

From source file:io.seldon.importer.articles.FileItemAttributesImporter.java

/**
 * Fetches the article at {@code url} and extracts its item attributes using the CSS selectors
 * and flags configured on this class.
 *
 * <p>Consecutive fetches are throttled: if less than {@code minFetchGapMsecs} has passed since
 * the previous fetch, the calling thread sleeps for the remainder first. Image, title, lead
 * text, publish date, link, description, tags, category, sub-category and domain are then
 * extracted. The attribute map is only built when the title is non-blank and every field
 * flagged as required (image, category, domain) is non-blank as well.
 *
 * <p>Any exception raised while fetching or extracting is logged and recorded in the process
 * result; it is not propagated. If the thread was interrupted while sleeping, its interrupt
 * flag is restored before returning.
 *
 * @param url              the article URL to fetch; also used as the client item id in the log result
 * @param existingCategory not read by this method
 * @return the extracted attributes, or {@code null} when a required attribute is missing or
 *         an exception occurred
 */
public static Map<String, String> getAttributes(String url, String existingCategory) {
    ItemProcessResult itemProcessResult = new ItemProcessResult();
    itemProcessResult.client_item_id = url;
    itemProcessResult.extraction_status = "EXTRACTION_FAILED";

    logger.info("Trying to get attributes for " + url);
    Map<String, String> attributes = null;
    String title = "";
    String category = "";
    String subCategory = "";
    String img_url = "";
    String description = "";
    String tags = "";
    String leadtext = "";
    String link = "";
    String publishDate = "";
    String domain = "";
    try {
        // Throttle: keep at least minFetchGapMsecs between two consecutive fetches.
        long now = System.currentTimeMillis();
        long timeSinceLastRequest = now - lastUrlFetchTime;
        if (timeSinceLastRequest < minFetchGapMsecs) {
            long timeToSleep = minFetchGapMsecs - timeSinceLastRequest;
            logger.info(
                    "Sleeping " + timeToSleep + "msecs as time since last fetch is " + timeSinceLastRequest);
            Thread.sleep(timeToSleep);
        }
        Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get();
        lastUrlFetchTime = System.currentTimeMillis();

        //get IMAGE URL - first non-blank of the content, src and href attributes
        if (StringUtils.isNotBlank(imageCssSelector)) {
            Element imageElement = articleDoc.select(imageCssSelector).first();
            if (imageElement != null) {
                if (imageElement.attr("content") != null) {
                    img_url = imageElement.attr("content");
                }
                if (StringUtils.isBlank(img_url) && imageElement.attr("src") != null) {
                    img_url = imageElement.attr("src");
                }
                if (StringUtils.isBlank(img_url) && imageElement.attr("href") != null) {
                    img_url = imageElement.attr("href");
                }
            }
        }
        if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl)) {
            logger.info("Setting image to default: " + defImageUrl);
            img_url = defImageUrl;
        }
        img_url = StringUtils.strip(img_url);

        //get TITLE
        if (StringUtils.isNotBlank(titleCssSelector)) {
            Element titleElement = articleDoc.select(titleCssSelector).first();
            if (titleElement != null && titleElement.attr("content") != null) {
                title = titleElement.attr("content");
            }
        }

        //get Lead Text
        if (StringUtils.isNotBlank(leadTextCssSelector)) {
            Element leadElement = articleDoc.select(leadTextCssSelector).first();
            if (leadElement != null && leadElement.attr("content") != null) {
                leadtext = leadElement.attr("content");
            }
        }

        //get publish date
        if (StringUtils.isNotBlank(publishDateCssSelector)) {
            //2013-01-21T10:40:55Z
            Element pubElement = articleDoc.select(publishDateCssSelector).first();
            if (pubElement != null && pubElement.attr("content") != null) {
                String pubtext = pubElement.attr("content");
                SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
                Date result = null;
                try {
                    result = df.parse(pubtext);
                } catch (ParseException e) {
                    logger.info("Failed to parse date withUTC format " + pubtext);
                }
                if (result == null) {
                    // Only fall back to the simpler format when the first one did not match;
                    // otherwise every successfully parsed date would also log a failure.
                    df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
                    try {
                        result = df.parse(pubtext);
                    } catch (ParseException e) {
                        logger.info("Failed to parse date " + pubtext);
                    }
                }

                if (result != null) {
                    publishDate = dateFormatter.format(result);
                } else {
                    logger.error("Failed to parse date " + pubtext);
                }
            }
        }

        //get Link
        if (StringUtils.isNotBlank(linkCssSelector)) {
            Element linkElement = articleDoc.select(linkCssSelector).first();
            if (linkElement != null && linkElement.attr("content") != null) {
                link = linkElement.attr("content");
            }
        }

        //get CONTENT
        if (StringUtils.isNotBlank(textCssSelector)) {
            Element descriptionElement = articleDoc.select(textCssSelector).first();
            if (descriptionElement != null) {
                description = Jsoup.parse(descriptionElement.html()).text();
            }
        }

        //get TAGS
        Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title);

        if (tagSet.size() > 0) {
            tags = CollectionTools.join(tagSet, ",");
        }

        //get CATEGORY - client specific
        if (StringUtils.isNotBlank(categoryCssSelector)) {
            Element categoryElement = articleDoc.select(categoryCssSelector).first();
            if (categoryElement != null && categoryElement.attr("content") != null) {
                category = categoryElement.attr("content");
                if (StringUtils.isNotBlank(category)) {
                    category = category.toUpperCase();
                }
            }
        } else if (StringUtils.isNotBlank(categoryClassPrefix)) {
            // No selector configured: load the client-specific extractor class by name.
            String className = "io.seldon.importer.articles.category." + categoryClassPrefix
                    + "CategoryExtractor";
            Class<?> clazz = Class.forName(className);
            Constructor<?> ctor = clazz.getConstructor();
            CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
            category = extractor.getCategory(url, articleDoc);
        }

        //get Sub CATEGORY - client specific
        if (StringUtils.isNotBlank(subCategoryCssSelector)) {
            Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first();
            if (subCategoryElement != null && subCategoryElement.attr("content") != null) {
                subCategory = subCategoryElement.attr("content");
                if (StringUtils.isNotBlank(subCategory)) {
                    // Upper-case the sub-category itself (previously this overwrote it with
                    // the upper-cased category).
                    subCategory = subCategory.toUpperCase();
                }
            }
        } else if (StringUtils.isNotBlank(subCategoryClassPrefix)) {
            String className = "io.seldon.importer.articles.category." + subCategoryClassPrefix
                    + "SubCategoryExtractor";
            Class<?> clazz = Class.forName(className);
            Constructor<?> ctor = clazz.getConstructor();
            CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
            subCategory = extractor.getCategory(url, articleDoc);
        }

        // Get domain
        if (domainIsNeeded) {
            domain = getDomain(url);
        }

        // The item is only accepted when the title and every required field are present.
        if (StringUtils.isNotBlank(title) && (imageNotNeeded || StringUtils.isNotBlank(img_url))
                && (categoryNotNeeded || StringUtils.isNotBlank(category))
                && (!domainIsNeeded || StringUtils.isNotBlank(domain))) {
            attributes = new HashMap<String, String>();
            attributes.put(TITLE, title);
            if (StringUtils.isNotBlank(category)) {
                attributes.put(CATEGORY, category);
            }
            if (StringUtils.isNotBlank(subCategory)) {
                attributes.put(SUBCATEGORY, subCategory);
            }
            if (StringUtils.isNotBlank(link)) {
                attributes.put(LINK, link);
            }
            if (StringUtils.isNotBlank(leadtext)) {
                attributes.put(LEAD_TEXT, leadtext);
            }
            if (StringUtils.isNotBlank(img_url)) {
                attributes.put(IMG_URL, img_url);
            }
            if (StringUtils.isNotBlank(tags)) {
                attributes.put(TAGS, tags);
            }
            attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE);
            if (StringUtils.isNotBlank(description)) {
                attributes.put(DESCRIPTION, description);
            }
            if (StringUtils.isNotBlank(publishDate)) {
                attributes.put(PUBLISH_DATE, publishDate);
            }
            if (StringUtils.isNotBlank(domain)) {
                attributes.put(DOMAIN, domain);
            }
            System.out.println("Item: " + url + "; Category: " + category);
            itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED";
        } else {
            logger.warn("Failed to get title for article " + url);
            logger.warn("[title=" + title + ", img_url=" + img_url + ", category=" + category + ", domain="
                    + domain + "]");
        }

        { // check for failures for the log result
            if (StringUtils.isBlank(title)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "title";
            }
            if (!imageNotNeeded && StringUtils.isBlank(img_url)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",") + "img_url";
            }
            if (!categoryNotNeeded && StringUtils.isBlank(category)) {
                itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list
                        + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list)) ? "" : ",")
                        + "category";
            }
        }
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            // Thread.sleep cleared the interrupt flag when it threw; restore it so the
            // caller can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        logger.warn("Article: " + url + ". Attributes import FAILED", e);
        itemProcessResult.error = e.toString();
    }

    AttributesImporterUtils.logResult(logger, itemProcessResult);

    return attributes;
}

From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java

/**
 * Rewrites an XHTML fragment: mapped anchors are replaced by their Confluence link markup,
 * then the heading, page-break and spacing helpers are applied.
 *
 * @param inputXhtml        the XHTML text to transform
 * @param confluenceLinkMap lookup from an anchor's original {@code href} to its Confluence link
 * @return the transformed document serialized via {@code Document.html()}
 */
private static String reformatXHtml(final String inputXhtml,
        final Map<String, ConfluenceLink> confluenceLinkMap) {
    // NOTE(review): in jsoup's parse(String, String, Parser) the second argument is the
    // base URI, so "utf-8" is presumably not acting as a charset here - confirm intent.
    final Document xhtmlDoc = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser());
    xhtmlDoc.outputSettings().prettyPrint(false).escapeMode(xhtml).charset("UTF-8");

    for (final Element anchor : xhtmlDoc.select("a")) {
        final String href = anchor.attr("href");
        final ConfluenceLink mapping = confluenceLinkMap.get(href);

        if (mapping != null) {
            final String markup = mapping.getConfluenceLinkMarkup();
            LOG.debug("LINK CONVERSION: {} -> {}", href, markup);

            // Put the Confluence markup in front of the anchor, then empty the anchor and
            // unwrap it so nothing of the original link remains.
            anchor.before(markup);
            anchor.html("");
            anchor.unwrap();
        } else {
            LOG.debug("NO LINK MAPPING FOUND TO COVERT LINK: {}", href);
        }
    }

    for (final String headingSelector : new String[] { "h2", "h3", "#toctitle" }) {
        reformatXHtmlHeadings(xhtmlDoc, headingSelector);
    }

    final SwaggerConfluenceConfig config = SWAGGER_CONFLUENCE_CONFIG.get();
    final boolean singlePage = config.getPaginationMode() == PaginationMode.SINGLE_PAGE;

    // Break-after handling only applies in single-page mode; the table of contents gets
    // one only when it is included on that page.
    if (singlePage && config.isIncludeTableOfContentsOnSinglePage()) {
        reformatXHtmlBreakAfterElements(xhtmlDoc, "#toc");
    }
    if (singlePage) {
        reformatXHtmlBreakAfterElements(xhtmlDoc, ".sect1");
    }

    reformatXHtmlSpacing(xhtmlDoc.select(".sect2"));
    reformatXHtmlSpacing(xhtmlDoc.select(".sect3"));

    return xhtmlDoc.html();
}

From source file:de.luhmer.owncloudnewsreader.reader.GoogleReaderApi.GoogleReaderMethods.java

/**
 * Fetches the tag list and returns the tags whose text matches the user's label prefix
 * {@code "user/<google user id>/label/"}.
 *
 * <p>Each returned entry is a two-element array: the tag text from offset 32 onward, and the
 * full tag text. If the user id cannot be determined or the tag list cannot be fetched, the
 * failure is logged and an empty list is returned.
 *
 * @param _USERNAME the account name used for authentication
 * @param _PASSWORD the account password used for authentication
 * @return the matching tags; never {@code null}, possibly empty
 */
public static ArrayList<String[]> getTagList(String _USERNAME, String _PASSWORD) {
    Log.d(GoogleReaderConstants.APP_NAME, "METHOD: getTagList()");
    ArrayList<String[]> tagTitles = new ArrayList<String[]>();

    String tagLabel;
    try {
        tagLabel = "user/" + AuthenticationManager.getGoogleUserID(_USERNAME, _PASSWORD) + "/label/";
    } catch (IOException e) {
        // Without the label prefix no tag can be matched, so there is nothing to return.
        Log.e(GoogleReaderConstants.APP_NAME, "getTagList(): could not determine user id", e);
        return tagTitles;
    }

    Document doc;
    try {
        doc = Jsoup.connect(GoogleReaderConstants._TAG_LIST_URL)
                .header("Authorization",
                        GoogleReaderConstants._AUTHPARAMS
                                + AuthenticationManager.getGoogleAuthKey(_USERNAME, _PASSWORD))
                .userAgent(GoogleReaderConstants.APP_NAME).timeout(6000).get();
    } catch (IOException e) {
        // Previously execution continued with a null document and failed with a
        // NullPointerException on the select below.
        Log.e(GoogleReaderConstants.APP_NAME, "getTagList(): could not fetch tag list", e);
        return tagTitles;
    }

    for (Element link : doc.select("string")) {
        String tagText = link.text();
        if (Func_Strings.FindWordInString(tagText, tagLabel)) {
            // NOTE(review): offset 32 presumably equals the label prefix length for a
            // 20-character user id - confirm before relying on it for other id lengths.
            tagTitles.add(new String[] { tagText.substring(32), tagText });
        }
    }

    return tagTitles;
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Parses the reservations table that follows the {@code a[name=RES]} anchor and appends one
 * {@link ReservedItem} per data row to {@code media}.
 *
 * <p>The first table row is skipped. Nothing is added when the anchor, the table, or any row
 * beyond the first is missing. Column 0 is read as the title, column 1 as the author, and
 * column 4 either as the ready date (format {@code dd.MM.yyyy}) or, when it does not parse as
 * a date, as the status text. The {@code href} of the last link in a row, if any, becomes
 * the cancel data.
 *
 * @param media the list the parsed items are appended to
 * @param doc   the parsed account page; its base URI is set from {@code data}'s "baseurl"
 * @param data  configuration object providing the "baseurl" entry
 */
static void parseResList(List<ReservedItem> media, Document doc, JSONObject data) {
    if (doc.select("a[name=RES]").size() == 0) {
        return;
    }
    // first() returns null when no sibling table contains "Titel"; treat that like a
    // missing reservations section instead of failing with a NullPointerException.
    Element table = doc.select("a[name=RES] ~ table:contains(Titel)").first();
    if (table == null) {
        return;
    }
    Elements copytrs = table.select("tr");
    doc.setBaseUri(data.optString("baseurl"));
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs < 2) {
        return;
    }
    // Remember the starting size so the sanity check below also holds when the caller
    // passes a list that already contains items.
    final int sizeBefore = media.size();
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        ReservedItem item = new ReservedItem();

        item.setTitle(tr.child(0).text().trim().replace("\u00a0", ""));
        item.setAuthor(tr.child(1).text().trim().replace("\u00a0", ""));
        String dateOrStatus = tr.child(4).text().trim().replace("\u00a0", "");
        try {
            item.setReadyDate(fmt.parseLocalDate(dateOrStatus));
        } catch (IllegalArgumentException e) {
            // Not a date: the cell carries a status text instead.
            item.setStatus(dateOrStatus);
        }
        Element cancelLink = tr.select("a").last();
        if (cancelLink != null) {
            item.setCancelData(cancelLink.attr("href"));
        }

        media.add(item);
    }
    assert (media.size() - sizeBefore == trs - 1);

}

From source file:com.astamuse.asta4d.render.RenderUtil.java

/**
 * Applies {@code renderer} to {@code renderTarget} and then marks the snippet element as
 * finished.
 *
 * <p>If the given snippet element no longer has an owner document after rendering (it was
 * replaced completely by another element), the replacement is looked up in {@code doc} by
 * its snippet reference id. When no such element is found, no status is written.
 *
 * @param doc            the document searched for the replacement snippet element
 * @param snippetRefId   the snippet reference id used for that search
 * @param snippetElement the snippet element as known before rendering
 * @param renderTarget   the element the renderer is applied to
 * @param renderer       the renderer to apply
 */
private final static void applySnippetResultToElement(Document doc, String snippetRefId, Element snippetElement,
        Element renderTarget, Renderer renderer) {
    apply(renderTarget, renderer);

    Element statusTarget = snippetElement;
    if (statusTarget.ownerDocument() == null) {
        // The original node was swapped out during rendering; find its stand-in.
        String reSelector = SelectorUtil.attr(ExtNodeConstants.SNIPPET_NODE_TAG_SELECTOR,
                ExtNodeConstants.ATTR_SNIPPET_REF, snippetRefId);
        Elements replacements = doc.select(reSelector);
        statusTarget = replacements.isEmpty() ? null : replacements.get(0);
    }

    if (statusTarget == null) {
        return;
    }
    statusTarget.attr(ExtNodeConstants.SNIPPET_NODE_ATTR_STATUS,
            ExtNodeConstants.SNIPPET_NODE_ATTR_STATUS_FINISHED);
}

From source file:de.geeksfactory.opacclient.apis.Zones.java

/**
 * Parses the reservations listed on an account page.
 *
 * <p>Every table inside a {@code .MessageBrowseItemDetailsCell} or
 * {@code .MessageBrowseItemDetailsCellStripe} element yields one {@link ReservedItem}. Within
 * a table, each row's field-name cell decides which property its data cell fills: title,
 * format, branch or status. Items whose status is the German word for "deleted" are left out.
 *
 * @param doc the parsed account page
 * @return the reservations found, in document order; never {@code null}
 */
static List<ReservedItem> parseResList(Document doc) {
    List<ReservedItem> reservations = new ArrayList<>();
    for (Element table : doc.select(
            ".MessageBrowseItemDetailsCell table, .MessageBrowseItemDetailsCellStripe table")) {
        ReservedItem item = new ReservedItem();

        for (Element tr : table.select("tr")) {
            String desc = tr.select(".MessageBrowseFieldNameCell").text().trim();
            String value = tr.select(".MessageBrowseFieldDataCell").text().trim();
            switch (desc) {
            case "Titel":
                item.setTitle(value);
                break;
            case "Publikationsform":
                item.setFormat(value);
                break;
            case "Liefern an":
                item.setBranch(value);
                break;
            case "Status":
                item.setStatus(value);
                break;
            default:
                // Other rows carry no information needed for a reservation.
                break;
            }
        }
        // The o-umlaut is written as a Unicode escape so the comparison does not depend on
        // the source file encoding; without the umlaut the literal is not a word the status
        // cell could contain and the filter would never apply.
        if ("Gel\u00f6scht".equals(item.getStatus())) {
            continue;
        }
        reservations.add(item);
    }
    return reservations;
}

From source file:org.cellcore.code.engine.page.extractor.mtgf.MTGFPageDataExtractor.java

/**
 * Reads the stock from the {@code option} elements found under {@code .card-buy}.
 *
 * @param doc the parsed product page
 * @return the text of the last such option parsed as an integer, or 0 when there is none
 */
@Override
protected int getStock(Document doc) {
    // No options at all: report zero stock.
    if (doc.select(".card-buy").select("option").isEmpty()) {
        return 0;
    }
    String lastOptionText = doc.select(".card-buy").select("option").last().text();
    return Integer.parseInt(lastOptionText);
}

From source file:org.cellcore.code.engine.page.extractor.mfrag.MFRAGPageDataExtractor.java

/**
 * Returns the alternative names of the product.
 *
 * @param doc the parsed product page
 * @return a one-element array holding the combined text of the elements matching
 *         {@code .prod-det_s-titre}
 */
@Override
protected String[] getOtherNames(Document doc) {
    final String[] otherNames = new String[1];
    otherNames[0] = doc.select(".prod-det_s-titre").text();
    return otherNames;
}

From source file:org.cellcore.code.engine.page.extractor.mfrag.MFRAGPageDataExtractor.java

/**
 * Reads the product price from the elements matching {@code .prod-det_prix}.
 *
 * @param doc the parsed product page
 * @return the price text, passed through {@code cleanPriceString} and parsed as a float
 */
@Override
protected float getPrice(Document doc) {
    final String rawPrice = doc.select(".prod-det_prix").text();
    final String cleanedPrice = this.cleanPriceString(rawPrice);
    return Float.parseFloat(cleanedPrice);
}