Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes Element select

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element.select method.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:org.opens.tanaguru.rules.rgaa30.Rgaa30Rule110102.java

/**
 * This method linked each label which have an input child on a page to its
 * form in a map./*from w ww  .  j  av a 2 s  .c o m*/
 */
private void putLabelElementHandlerIntoTheMap() {
    for (Element el : labelElementHandler.get()) {
        Element tmpElement = el.parent();
        while (StringUtils.isNotBlank(tmpElement.tagName())) {
            if (tmpElement.tagName().equals(FORM_TAG)) {
                if (labelFormMap.containsKey(tmpElement)) {
                    Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY);
                    if (!els.isEmpty()) {
                        labelFormMap.get(tmpElement).add(el);
                    }
                } else {
                    Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY);
                    if (!els.isEmpty()) {
                        ElementHandler<Element> labelElement = new ElementHandlerImpl();
                        labelElement.add(el);
                        labelFormMap.put(tmpElement, labelElement);
                    }
                }
                break;
            }
            tmpElement = tmpElement.parent();
        }
    }
}

From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java

/**
 * Reports whether an HTML element carries no content worth keeping.
 *
 * <p>Note that the argument is modified: every descendant matching
 * {@code .mceEditor} is removed before the check is made.
 *
 * @param htmlElement the element to inspect (mutated in place)
 * @return {@code true} when, after the removal, {@code hasText()} is false
 *         and the element contains none of {@code img}, {@code iframe},
 *         {@code frame}, {@code input}, {@code select} or {@code option}
 */
boolean isEmpty(Element htmlElement) {
    // filter out TinyMCE instances
    htmlElement.select(".mceEditor").remove();
    if (htmlElement.hasText()) {
        return false;
    }
    String[] elementNames = { "img", "iframe", "frame", "input", "select", "option" };
    for (String elementName : elementNames) {
        // one hit is enough; no need to run the remaining selectors
        if (!htmlElement.select(elementName).isEmpty()) {
            return false;
        }
    }
    return true;
}

From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java

/**
 * Builds a new page object (a JSON object with a "rows" array) from the
 * old-style entry stored under {@code ref} in {@code originalStructure}.
 *
 * <p>The entry's "page" HTML is parsed (or {@code EMPTY_DIV} when the entry
 * has no "page" key). The top-level children of the parsed body are then
 * visited in order: children containing a {@code .widget_inline} element are
 * handed to {@code extractWidget}, all other children are accumulated in a
 * scratch HTML block that is passed to {@code addRowToPage}.
 *
 * @param originalStructure structure holding the entry under {@code ref}; also passed to {@code extractWidget}
 * @param contentId         passed through to {@code extractWidget}
 * @param widgetsUsed       passed through to {@code extractWidget}
 * @param ref               key of the entry to migrate
 * @return the newly built page object
 * @throws JSONException propagated from the JSON accessors used here
 */
JSONObject migratePage(JSONObject originalStructure, String contentId, Set<String> widgetsUsed, String ref)
        throws JSONException {
    Document page;
    JSONObject oldFashionedWidget = originalStructure.getJSONObject(ref);
    if (oldFashionedWidget.has("page")) {
        page = Jsoup.parse(oldFashionedWidget.getString("page"));
        if (originalStructure.has("resources")) {
            LOGGER.debug("This may be an IMS-CP document. Looking for images...");
            Elements images = page.select("img");
            LOGGER.debug("Found {} images to process.", images.size());
            for (Element image : images) {
                String imgSrc = image.attr("src");
                LOGGER.debug("Looking at image path '{}'", imgSrc);
                if (imgSrc.startsWith("p/") && hasImageExtension(imgSrc)) {
                    // Keeps everything up to and including the last '.', i.e. the
                    // extension is dropped but the dot stays.
                    // NOTE(review): confirm the trailing dot is intended.
                    image.attr("src", imgSrc.substring(0, imgSrc.lastIndexOf(".") + 1));
                }
            }
        }
    } else {
        // no "page" content on this entry: continue with an empty placeholder document
        page = Jsoup.parse(EMPTY_DIV);
    }
    // scratch document that collects consecutive non-widget elements
    Document currentHtmlBlock = Jsoup.parse(EMPTY_DIV);
    JSONObject currentPage = new JSONObject();
    currentPage.put("rows", new JSONArray());
    JSONObject currentRow = generateEmptyRow(1);
    Elements topLevelElements = page.select("body").first().children();
    for (Element topLevelElement : topLevelElements) {
        if (topLevelElement.select(".widget_inline").size() > 0) {
            // Hand the HTML accumulated so far to addRowToPage, then start a fresh scratch block.
            // NOTE(review): the return value is discarded here, unlike the calls below — confirm intended.
            addRowToPage(currentRow, currentPage, 0, currentHtmlBlock.select("body").first());
            currentHtmlBlock = Jsoup.parse(EMPTY_DIV);
            // one base column, plus one for each side that has a block_image_left/right widget
            int numColumns = 1;
            int leftSideColumn = topLevelElement.select(".widget_inline.block_image_left").size() > 0 ? 1 : 0;
            numColumns += leftSideColumn;
            int rightSideColumn = topLevelElement.select(".widget_inline.block_image_right").size() > 0 ? 1 : 0;
            numColumns += rightSideColumn;
            if (numColumns > 1) {
                currentRow = addRowToPage(currentRow, currentPage, numColumns,
                        currentHtmlBlock.select("body").first());
            }
            for (Element widgetElement : topLevelElement.select(".widget_inline")) {
                extractWidget(originalStructure, contentId, widgetsUsed, ref, currentPage, currentRow,
                        leftSideColumn, widgetElement);
            }

            if (numColumns > 1) {
                // after a multi-column row, continue with a column count of 1
                currentRow = addRowToPage(currentRow, currentPage, 1, currentHtmlBlock.select("body").first());
            }

        } else {
            // non-widget content: append it to the first div of the scratch block
            currentHtmlBlock.select("div").first().appendChild(topLevelElement);
        }
    }
    // pass whatever is left in the scratch block to addRowToPage
    addRowToPage(currentRow, currentPage, 1, currentHtmlBlock.select("body").first());
    ensureRowPresent(currentPage);

    return currentPage;
}

From source file:org.sbs.goodcrawler.jobconf.ExtractConfig.java

/**
 * ????/*  www .  j a v  a2 s .  c om*/
 * @param doc
 * @return
 * @throws ConfigurationException
 */
public ExtractConfig loadConfig(Document doc) throws ConfigurationException {
    Elements extractElement = doc.select("extract");
    super.jobName = doc.select("job").attr("name");
    super.indexName = doc.select("job").attr("indexName");
    String temp = extractElement.select("threadNum").text();
    if (StringUtils.isNotBlank(temp)) {
        this.threadNum = Integer.parseInt(temp);
    }

    Elements templateElement = extractElement.select("extract").select("template");
    Iterator<Element> it = templateElement.iterator();
    while (it.hasNext()) {
        Element template = it.next();
        ExtractTemplate extractTemplate = new ExtractTemplate();
        // ?Url????
        Elements urlPatternElement = template.select("url");
        List<Pattern> patterns = Lists.newArrayList();
        for (Element urlElement : urlPatternElement) {
            patterns.add(Pattern.compile(urlElement.text()));
        }
        extractTemplate.setUrlPattern(patterns);
        extractTemplate.setName(template.attr("name"));
        // ???
        Elements selectElement = template.select("elements").first().children();
        for (Element element : selectElement) {
            if ("element".equals(element.tagName())) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(element.tagName())) {
                IFConditions ifConditions = IFConditions.create(element);
                extractTemplate.addConditions(ifConditions);
            }
        }
        this.templates.add(extractTemplate);
    }
    return this;
}

From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java

/**
 * Parses a fetched page, queues its outgoing links and builds an extracted-page record.
 *
 * <p>Links accepted by {@code filterUrls} are added to {@code pendingUrls}. A result map is
 * filled with "movie", "type" and "url", plus "img", "jq", "download" and whatever
 * {@code getInfoName} adds, and is submitted to {@code pendingStore} as an {@code ExtractedPage}.
 *
 * <p>NOTE(review): several string literals in this method are empty ({@code ""}) or {@code "?"}.
 * They look like non-ASCII text lost in an encoding conversion — confirm against the original
 * source before relying on the branches that use them.
 *
 * @param page the fetched page; may be {@code null}
 * @return the extracted page, or {@code null} when {@code page} is null, its URL contains
 *         "game/", no {@code #Zoom} element exists, a configured selector matches nothing,
 *         or the content charset is unsupported
 */
@Override
public ExtractedPage<?, ?> onExtract(Page page) {
    if (null != page) {
        try {

            Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                    urlUtils.getBaseUrl(page.getWebURL().getURL()));
            // pages whose URL contains "game/" are not extracted
            if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/"))
                return null;
            // collect the page's links and queue those accepted by filterUrls
            Elements links = doc.getElementsByTag("a");
            if (!links.isEmpty()) {
                for (Element link : links) {
                    String linkHref = link.absUrl("href");
                    if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) {
                        try {
                            WebURL url = new WebURL();

                            url.setURL(linkHref);
                            url.setJobName(conf.jobName);
                            pendingUrls.addUrl(url);
                        } catch (QueueException e) {
                            log.error(e.getMessage());
                        } catch (Exception e) {
                            log.error(e.getMessage());
                        }
                    }
                }
            }
            // extraction of the page content
            //            Map<String, String> selects = conf.getSelects();
            // NOTE(review): selects is never assigned a non-null value in this method, so
            // selects.entrySet() below throws NullPointerException; the commented-out line
            // above appears to be the intended source — confirm.
            Map<String, String> selects = null;
            ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>();
            epage.setUrl(page.getWebURL());
            HashMap<String, Object> result = new HashMap<>();
            Elements text = doc.select("#Zoom");
            if (null == text || text.size() == 0) {
                return null;
            }
            String name = doc.select("h1").text();
            // NOTE(review): replace("", "") is a no-op as written; the first argument was
            // presumably a non-ASCII bracket character — confirm.
            name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", "");
            result.put("movie", name);
            //            result.put("_id", name);
            // the second space-separated token of the "h2 a" text is stored as the type
            String ts[] = doc.select("h2 a").text().split(" ");
            if (ts.length >= 2) {
                result.put("type", ts[1].trim());
            } else {
                result.put("type", "unknow");
            }
            result.put("url", page.getWebURL().getURL());
            for (Entry<String, String> entry : selects.entrySet()) {
                Elements elements = doc.select(entry.getValue());
                if (elements.isEmpty())
                    return null;
                else {
                    if ("content".equals(entry.getKey())) {

                        for (Element element : elements) {
                            // image sources, joined with ';'
                            Elements imgs = element.select("img[src]");
                            StringBuilder sb = new StringBuilder();
                            for (Element img : imgs) {
                                sb.append(img.attr("src")).append(";");
                            }
                            result.put("img", sb.toString());
                            // per-paragraph info parsing
                            Elements movieInfos = element.select("p");
                            for (Element info : movieInfos) {
                                String infotext = info.text();
                                try {
                                    String infotext_ = info.html();
                                    int start, end = 0;
                                    // NOTE(review): indexOf("") returns 0, so "start > 0" is never
                                    // true as written; the search strings appear to be lost — confirm.
                                    start = infotext_.indexOf("");
                                    if (start > 0) {
                                        end = infotext_.lastIndexOf("");
                                        if (end > 0 && start < end) {
                                            result.put("jq", infotext_.substring(start, end));
                                        } else {
                                            end = infotext_.lastIndexOf(".");
                                            if (end > 0 && start < end) {
                                                result.put("jq", infotext_.substring(start, end));
                                            }
                                        }
                                    }
                                    infotext_ = null;
                                } catch (Exception e) {
                                    e.printStackTrace();
                                }

                                // NOTE(review): startsWith("") is always true as written, so the
                                // else-if branches below are unreachable; the marker literals
                                // appear to be lost — confirm. In every branch the result of
                                // s.trim() is discarded, so s reaches getInfoName untrimmed.
                                if (infotext.startsWith("")) {
                                    String ss[] = infotext.split("");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.startsWith("?")) {
                                    // NOTE(review): "?" alone is not a valid regular expression for split().
                                    String ss[] = infotext.split("?");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains("")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains(":")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                }
                            }

                            //                        if(result.size()<5){
                            //                           result.put("content", value)
                            //                        }

                            // text of the table cells, joined with ';'
                            // NOTE(review): this selects from the whole `elements` set rather than
                            // the current `element` — confirm intended.
                            Elements elements2 = elements.select("td");
                            sb.setLength(0);
                            for (Element download : elements2) {
                                sb.append(download.text()).append(";");
                            }
                            result.put("download", sb.toString());
                        }
                    }
                }
                //               result.put(entry.getKey(), elements.html());
            }
            // store "nd" as an Integer when it is present and non-blank
            if (StringUtils.isNotBlank((String) result.get("nd"))) {
                result.put("nd", Integer.parseInt((String) result.get("nd")));
            }
            epage.setMessages(result);
            try {
                pendingStore.addExtracedPage(epage);
            } catch (QueueException e) {
                log.error(e.getMessage());
            }
            return epage;
        } catch (UnsupportedEncodingException e) {
            log.error(e.getMessage());
            e.printStackTrace();
        }
    }
    return null;
}

From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java

/**
 * Reads the like count from the like button of the parsed page.
 *
 * @return the number parsed from the button's content span, or {@code -1} when the
 *         button or its content span is missing (likes/dislikes are disabled)
 * @throws ParsingException if the span text contains no parsable integer, or on any
 *                          other failure while reading the page
 */
@Override
public int getLikeCount() throws ParsingException {
    String likesString = "";
    try {
        Element button = doc.select("button.like-button-renderer-like-button").first();
        // Explicit null checks instead of catching NullPointerException as control flow.
        Element content = button == null ? null : button.select("span.yt-uix-button-content").first();
        if (content == null) {
            // the button has no content, therefore likes/dislikes are disabled
            return -1;
        }
        likesString = content.text();
        return Integer.parseInt(likesString.replaceAll("[^\\d]", ""));
    } catch (NumberFormatException nfe) {
        throw new ParsingException("failed to parse likesString \"" + likesString + "\" as integers", nfe);
    } catch (Exception e) {
        throw new ParsingException("Could not get like count", e);
    }
}

From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java

/**
 * Reads the dislike count from the dislike button of the parsed page.
 *
 * @return the number parsed from the button's content span, or {@code -1} when the
 *         button or its content span is missing (likes/dislikes are disabled)
 * @throws ParsingException if the span text contains no parsable integer, or on any
 *                          other failure while reading the page
 */
@Override
public int getDislikeCount() throws ParsingException {
    String dislikesString = "";
    try {
        Element button = doc.select("button.like-button-renderer-dislike-button").first();
        // Explicit null checks instead of catching NullPointerException as control flow.
        Element content = button == null ? null : button.select("span.yt-uix-button-content").first();
        if (content == null) {
            // the button has no content, therefore likes/dislikes are disabled
            return -1;
        }
        dislikesString = content.text();
        return Integer.parseInt(dislikesString.replaceAll("[^\\d]", ""));
    } catch (NumberFormatException nfe) {
        throw new ParsingException("failed to parse dislikesString \"" + dislikesString + "\" as integers",
                nfe);
    } catch (Exception e) {
        throw new ParsingException("Could not get dislike count", e);
    }
}

From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java

/**
 * Collects preview information for the entries of the {@code watch-related} list.
 *
 * @return one {@link StreamPreviewInfo} per list entry that contains a content link
 * @throws ParsingException if the list is missing or any entry cannot be parsed
 */
@Override
public Vector<StreamPreviewInfo> getRelatedVideos() throws ParsingException {
    try {
        final Vector<StreamPreviewInfo> previews = new Vector<>();
        final Element relatedList = doc.select("ul[id=\"watch-related\"]").first();
        for (Element entry : relatedList.children()) {
            final Element contentLink = entry.select("a[class*=\"content-link\"]").first();
            if (contentLink == null) {
                // entries without a content link are playlists; leave them out
                continue;
            }
            previews.add(extractVideoPreviewInfo(entry));
        }
        return previews;
    } catch (Exception e) {
        throw new ParsingException("Could not get related videos", e);
    }
}

From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java

/**
 * Builds a {@link StreamPreviewInfo} from one list entry of the video page,
 * such as a related video. The preview carries a subset of the fields of a
 * full StreamInfo.
 *
 * @param li the list entry to read from
 * @return the populated preview
 * @throws ParsingException if any required part of the entry cannot be read
 */
private StreamPreviewInfo extractVideoPreviewInfo(Element li) throws ParsingException {
    final StreamPreviewInfo preview = new StreamPreviewInfo();

    try {
        preview.webpage_url = li.select("a.content-link").first().attr("abs:href");
        preview.id = Parser.matchGroup1("v=([0-9a-zA-Z-]*)", preview.webpage_url);

        // todo: check what causes the NullPointerException here; it was seen on
        // https://www.youtube.com/watch?v=Uqg0aEhLFAg (found by searching for "tjvg")
        preview.title = li.select("span.title").first().text();

        // related videos sometimes have no view count; fall back to 0 then
        long views;
        try {
            final String viewDigits = li.select("span.view-count").first().text().replaceAll("[^\\d]", "");
            views = Long.parseLong(viewDigits);
        } catch (Exception e) {
            views = 0;
        }
        preview.view_count = views;

        preview.uploader = li.select("span.g-hovercard").first().text();

        final String durationText = li.select("span.video-time").first().text();
        preview.duration = YoutubeParsingHelper.parseDurationString(durationText);

        final Element thumbnail = li.select("img").first();
        String thumbnailUrl = thumbnail.attr("abs:src");
        if (thumbnailUrl.contains(".gif")) {
            // YouTube sometimes links gif files that no longer seem to exist; such
            // items also offer a secondary image source, which is used instead.
            thumbnailUrl = thumbnail.attr("data-thumb");
        }
        if (thumbnailUrl.startsWith("//")) {
            thumbnailUrl = "https:" + thumbnailUrl;
        }
        preview.thumbnail_url = thumbnailUrl;
    } catch (Exception e) {
        throw new ParsingException("Could not get video preview info", e);
    }
    return preview;
}

From source file:org.schabi.newpipe.services.youtube.YoutubeVideoExtractor.java

/**
 * Extends the video info produced by the superclass with data read from the
 * player arguments and the HTML page: id, age limit, average rating,
 * availability status, like/dislike counts, next video and related videos.
 *
 * @return the populated {@code videoInfo} field
 */
@Override
public VideoInfo getVideoInfo() {
    videoInfo = super.getVideoInfo();
    //todo: replace this with a call to getVideoId, if possible
    videoInfo.id = matchGroup1("v=([0-9a-zA-Z_-]{11})", pageUrl);
    videoInfo.age_limit = 0;

    // average rating comes from the player arguments
    try {
        videoInfo.average_rating = playerArgs.getString("avg_rating");
    } catch (JSONException e) {
        e.printStackTrace();
    }

    // ---- everything below is read from the html page ----

    // Determine what went wrong when the video is not available
    if (videoInfo.videoAvailableStatus == VideoInfo.VIDEO_UNAVAILABLE
            && doc.select("h1[id=\"unavailable-message\"]").first().text().contains("GEMA")) {
        videoInfo.videoAvailableStatus = VideoInfo.VIDEO_UNAVAILABLE_GEMA;
    }

    String likesString = "";
    String dislikesString = "";
    try {
        final Element likeButton = doc.select("button.like-button-renderer-like-button").first();
        likesString = likeButton.select("span.yt-uix-button-content").first().text();
        videoInfo.like_count = Integer.parseInt(likesString.replaceAll("[^\\d]", ""));

        final Element dislikeButton = doc.select("button.like-button-renderer-dislike-button").first();
        dislikesString = dislikeButton.select("span.yt-uix-button-content").first().text();
        videoInfo.dislike_count = Integer.parseInt(dislikesString.replaceAll("[^\\d]", ""));
    } catch (NumberFormatException nfe) {
        Log.e(TAG, "failed to parse likesString \"" + likesString + "\" and dislikesString \"" + dislikesString
                + "\" as integers");
    } catch (Exception e) {
        // if it fails we know that the video does not offer dislikes.
        e.printStackTrace();
        videoInfo.like_count = 0;
        videoInfo.dislike_count = 0;
    }

    // next video: first list item of the first sidebar section
    final Element sidebarSection = doc.select("div[class=\"watch-sidebar-section\"]").first();
    videoInfo.nextVideo = extractVideoPreviewInfo(sidebarSection.select("li").first());

    // related videos
    final Vector<VideoPreviewInfo> related = new Vector<>();
    final Element relatedList = doc.select("ul[id=\"watch-related\"]").first();
    for (Element entry : relatedList.children()) {
        // entries without a content link are playlists; leave them out
        if (entry.select("a[class*=\"content-link\"]").first() == null) {
            continue;
        }
        related.add(extractVideoPreviewInfo(entry));
    }
    //todo: replace conversion
    videoInfo.relatedVideos = related;
    return videoInfo;
}