Example usage for org.jsoup.nodes Element html

List of usage examples for org.jsoup.nodes Element html

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.html().

Prototype

public String html() 

Source Link

Document

Retrieves the element's inner HTML.

Usage

From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java

/**
 * Parses a crawled page, queues the links found on it, and builds an
 * {@code ExtractedPage} holding a map of values scraped from the document.
 *
 * <p>Returns {@code null} when {@code page} is null, when the page URL contains
 * {@code "game/"}, when the document has no {@code #Zoom} element, when one of the
 * configured selectors matches nothing, or when the page charset is unsupported.
 *
 * <p>NOTE(review): several string literals below are empty ({@code ""}) or a lone
 * {@code "?"}. They look like non-ASCII text that was lost to an encoding problem -
 * confirm against the original source before relying on the branches that use them.
 *
 * @param page the fetched page; may be null
 * @return the extracted page that was also handed to {@code pendingStore}, or null
 */
@Override
public ExtractedPage<?, ?> onExtract(Page page) {
    if (null != page) {
        try {

            // Decode the raw bytes with the page's own charset and resolve relative links
            // against the base URL derived from the page URL.
            Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                    urlUtils.getBaseUrl(page.getWebURL().getURL()));
            // Pages whose URL contains "game/" are skipped entirely.
            if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/"))
                return null;
            // Queue every anchor whose absolute href is non-blank and accepted by filterUrls.
            Elements links = doc.getElementsByTag("a");
            if (!links.isEmpty()) {
                for (Element link : links) {
                    String linkHref = link.absUrl("href");
                    if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) {
                        try {
                            WebURL url = new WebURL();

                            url.setURL(linkHref);
                            url.setJobName(conf.jobName);
                            pendingUrls.addUrl(url);
                        } catch (QueueException e) {
                            // A link that cannot be queued is logged and skipped.
                            log.error(e.getMessage());
                        } catch (Exception e) {
                            log.error(e.getMessage());
                        }
                    }
                }
            }
            // Content extraction starts here.
            //            Map<String, String> selects = conf.getSelects();
            // NOTE(review): selects is assigned null here and dereferenced in the for loop
            // further down, which throws a NullPointerException that the catch block at the
            // end of this method does not handle. The commented-out line above suggests it
            // was meant to come from conf - confirm.
            Map<String, String> selects = null;
            ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>();
            epage.setUrl(page.getWebURL());
            HashMap<String, Object> result = new HashMap<>();
            // Without a #Zoom element there is nothing to extract.
            Elements text = doc.select("#Zoom");
            if (null == text || text.size() == 0) {
                return null;
            }
            // The movie name is the <h1> text with "<<" and ">>" stripped.
            // NOTE(review): replace("", "") is a no-op as written; presumably these were
            // non-ASCII bracket characters - confirm.
            String name = doc.select("h1").text();
            name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", "");
            result.put("movie", name);
            //            result.put("_id", name);
            // The second space-separated token of the "h2 a" text is stored as the type;
            // "unknow" is stored when there are fewer than two tokens.
            String ts[] = doc.select("h2 a").text().split(" ");
            if (ts.length >= 2) {
                result.put("type", ts[1].trim());
            } else {
                result.put("type", "unknow");
            }
            result.put("url", page.getWebURL().getURL());
            // Each entry maps a field name to a CSS selector; only the "content" entry is
            // processed, but any selector that matches nothing aborts the extraction.
            for (Entry<String, String> entry : selects.entrySet()) {
                Elements elements = doc.select(entry.getValue());
                if (elements.isEmpty())
                    return null;
                else {
                    if ("content".equals(entry.getKey())) {

                        for (Element element : elements) {
                            // Image sources, joined with ';' (overwritten for each matched element).
                            Elements imgs = element.select("img[src]");
                            StringBuilder sb = new StringBuilder();
                            for (Element img : imgs) {
                                sb.append(img.attr("src")).append(";");
                            }
                            result.put("img", sb.toString());
                            // Per-paragraph movie details.
                            Elements movieInfos = element.select("p");
                            for (Element info : movieInfos) {
                                String infotext = info.text();
                                try {
                                    // Stores a slice of the paragraph HTML under "jq".
                                    // NOTE(review): indexOf("") always returns 0, so the
                                    // start > 0 test below is never true as written; the
                                    // marker literals appear to have been lost - confirm.
                                    String infotext_ = info.html();
                                    int start, end = 0;
                                    start = infotext_.indexOf("");
                                    if (start > 0) {
                                        end = infotext_.lastIndexOf("");
                                        if (end > 0 && start < end) {
                                            result.put("jq", infotext_.substring(start, end));
                                        } else {
                                            // Fall back to the last '.' as the end of the slice.
                                            end = infotext_.lastIndexOf(".");
                                            if (end > 0 && start < end) {
                                                result.put("jq", infotext_.substring(start, end));
                                            }
                                        }
                                    }
                                    infotext_ = null;
                                } catch (Exception e) {
                                    e.printStackTrace();
                                }

                                // Split the paragraph into pieces and pass each to getInfoName.
                                // NOTE(review): startsWith("") is true for every string, so as
                                // written only this first branch ever runs, and split("") breaks
                                // the text into single characters. In every branch s.trim()
                                // discards its result (String is immutable), so getInfoName
                                // receives the untrimmed piece.
                                if (infotext.startsWith("")) {
                                    String ss[] = infotext.split("");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.startsWith("?")) {
                                    // NOTE(review): "?" alone is not a valid regular expression;
                                    // split("?") would throw PatternSyntaxException if reached.
                                    String ss[] = infotext.split("?");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains("")) {
                                    // Works on the HTML so that <br /> can serve as the separator.
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains(":")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                }
                            }

                            //                        if(result.size()<5){
                            //                           result.put("content", value)
                            //                        }

                            // Text of every table cell, joined with ';', stored as "download".
                            // This selects from all matched elements, not only the current
                            // one - NOTE(review): confirm that is intended.
                            Elements elements2 = elements.select("td");
                            sb.setLength(0);
                            for (Element download : elements2) {
                                sb.append(download.text()).append(";");
                            }
                            result.put("download", sb.toString());
                        }
                    }
                }
                //               result.put(entry.getKey(), elements.html());
            }
            // A non-blank "nd" value is replaced by its integer form. "nd" is presumably set
            // by getInfoName - verify. Integer.parseInt can throw NumberFormatException,
            // which is not caught in this method.
            if (StringUtils.isNotBlank((String) result.get("nd"))) {
                result.put("nd", Integer.parseInt((String) result.get("nd")));
            }
            epage.setMessages(result);
            try {
                pendingStore.addExtracedPage(epage);
            } catch (QueueException e) {
                // The page is still returned even when it could not be stored.
                log.error(e.getMessage());
            }
            return epage;
        } catch (UnsupportedEncodingException e) {
            // Thrown by new String(bytes, charsetName) for an unknown page charset.
            log.error(e.getMessage());
            e.printStackTrace();
        }
    }
    return null;
}

From source file:org.structr.web.importer.Importer.java

/**
 * Creates DOM child nodes from the parsed document, picking the source element in
 * this order: a non-empty head, then a non-empty body, then the document itself.
 *
 * <p>When the head is used, a new "head" element is created on {@code page}, the
 * head's children are appended to it, and that new element is returned instead of
 * anything attached to {@code parent}.
 *
 * @param parent node that receives the children in the body and document cases
 * @param page   page used to create the new elements
 * @return the created head element, or the result of {@code createChildNodes}
 * @throws FrameworkException propagated from node creation
 */
public DOMNode createComponentChildNodes(final DOMNode parent, final Page page) throws FrameworkException {

    final Element headPart = parsedDocument.head();
    final Element bodyPart = parsedDocument.body();

    final boolean headHasContent = headPart != null && !headPart.html().isEmpty();

    if (!headHasContent) {

        // no usable head: prefer the body, otherwise fall back to the whole document
        if (bodyPart != null && !bodyPart.html().isEmpty()) {
            return createChildNodes(bodyPart, parent, page);
        }

        return createChildNodes(parsedDocument, parent, page);
    }

    // head is a special case: it gets its own freshly created container element
    final Head headContainer = (Head) page.createElement("head");
    createChildNodes(headPart, headContainer, page);

    return headContainer;
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Searches the provider's search page ({@code BASE_URL + "/suche/index.php3"}) for
 * movies matching the query or, failing that, the title given in {@code options}.
 *
 * <p>The result page is scanned for links whose href starts with {@code "hit.php"}.
 * Links sharing the same movie id are merged into one result. When no such link
 * exists and the page title does not contain "Suche nach", the response is treated
 * as a redirect to a detail page and a single result is built from it.
 *
 * @param options search options; QUERY takes precedence over TITLE
 * @return results sorted descending by their natural order; empty when no search
 *         term was given, the page could not be fetched, or nothing matched
 * @throws Exception declared by the interface; encoding the search term may throw
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception {
    LOGGER.debug("search() " + options.toString());
    List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>();
    String searchUrl = "";
    String searchTerm = "";
    // Never populated in this method; kept because the scoring logic below compares against it.
    String imdb = "";

    // only title search
    if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search for everything: " + searchTerm);
    } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search with title: " + searchTerm);
    } else {
        LOGGER.debug("empty searchString");
        return resultList;
    }

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    Document doc = null;
    InputStream in = null;
    try {
        Url url = new CachedUrl(searchUrl);
        in = url.getInputStream();
        doc = Jsoup.parse(in, PAGE_ENCODING, "");
    } catch (Exception e) {
        LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage());

        // clear cache
        CachedUrl.removeCachedFileForUrl(searchUrl);
    } finally {
        // Close in finally so the stream is released even when parsing fails.
        if (in != null) {
            try {
                in.close();
            } catch (Exception e) {
                LOGGER.warn("could not close search result stream: " + e.getMessage());
            }
        }
    }

    if (doc == null) {
        return resultList;
    }

    // only look for movie links
    Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php");
    LOGGER.debug("found " + filme.size() + " search results");
    if (filme.isEmpty()) {
        if (!doc.getElementsByTag("title").text().contains("Suche nach")) {
            // redirected to detail page
            MediaSearchResult msr = new MediaSearchResult(providerInfo.getId());
            Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id=");
            if (el.size() > 0) {
                msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)"));
            }
            msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim());
            // the year is only taken when exactly one year link is present
            el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            if (el.size() == 1) {
                msr.setYear(el.get(0).text());
            }
            resultList.add(msr);
        }
        return resultList;
    }

    // <a
    // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1"
    // class="normLight">Avatar - Aufbruch nach Pandora</B>
    // <nobr>(2009)</nobr><br /><span class="smallLight"
    // style="color:#ccc;">Avatar</span></a>

    // map to merge 2 results :/
    Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>();

    for (Element a : filme) {
        try {
            String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-");
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            if (res.containsKey(id)) {
                LOGGER.debug("dupe found; merging with previous searchresult");
                sr = res.get(id);
            }

            // Each field is only filled when still empty, so a merged duplicate
            // never overwrites what the first link already provided.
            if (StringUtils.isNotEmpty(imdb)) {
                sr.setIMDBId(imdb);
            }
            if (StringUtils.isEmpty(sr.getId())) {
                sr.setId(id);
            }
            if (StringUtils.isEmpty(sr.getTitle())) {
                // with a <nobr> year inside the link, only the link's own text is the title
                if (a.html().contains("nobr")) {
                    sr.setTitle(a.ownText());
                } else {
                    sr.setTitle(a.text());
                }
            }
            LOGGER.debug("found movie " + sr.getTitle());
            if (StringUtils.isEmpty(sr.getOriginalTitle())) {
                sr.setOriginalTitle(a.getElementsByTag("span").text());
            }
            if (StringUtils.isEmpty(sr.getYear())) {
                // any 4 digit number inside the <nobr> text
                sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*"));
            }
            sr.setMediaType(MediaType.MOVIE);
            sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id);
            // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(),
            // "images(.*?)\\&quot"));

            if (imdb.equals(sr.getIMDBId())) {
                // perfect match
                sr.setScore(1);
            } else {
                // compare score based on names
                sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle()));
            }

            // populate extra args
            MetadataUtil.copySearchQueryToSearchResult(options, sr);
            res.put(id, sr);
        } catch (Exception e) {
            // one broken entry must not abort the whole result list
            LOGGER.warn("error parsing movie result: " + e.getMessage());
        }
    }
    resultList.addAll(res.values());
    Collections.sort(resultList);
    Collections.reverse(resultList);
    return resultList;
}

From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java

/**
 * Converts one bookmark category element into topics.
 *
 * <p>First pass: every direct {@code h3} child creates (or fetches) a topic that is
 * made a subclass of {@code parent}. The topic of the last {@code h3} seen becomes
 * the category topic; without any {@code h3}, {@code parent} itself is used.
 *
 * <p>Second pass: inside every direct {@code dl} child, each {@code dt} is scanned.
 * An {@code a} element is handed to {@code parseItem}; a nested {@code dl} triggers
 * a recursive call on the enclosing {@code dt}.
 *
 * @param c      element whose children describe the category
 * @param parent topic the category is attached to
 * @param t      topic map that receives the topics
 */
private void parseCategory(Element c, Topic parent, TopicMap t) throws TopicMapException, ParseException {

    final Elements kids = c.children();
    Topic categoryTopic = parent;

    // Pass 1: headings define the category topic.
    for (Element kid : kids) {
        if (!kid.tagName().equals("h3")) {
            continue;
        }

        String locator = parent.getSubjectLocator().toString();
        locator = locator + "/" + urlEncode(kid.html());
        final String label = kid.ownText();

        categoryTopic = getOrCreateTopic(t, locator);
        categoryTopic.setSubjectLocator(new Locator(locator));
        categoryTopic.setBaseName(label + " (Bookmark)");
        categoryTopic.setDisplayName(LANG, label);
        makeSubclassOf(t, categoryTopic, parent);
    }

    // Pass 2: definition lists hold the bookmarks and the nested categories.
    for (Element kid : kids) {
        if (kid.tagName().equals("dl")) {
            for (Element entry : kid.children()) {
                if (entry.tagName().equals("dt")) {
                    for (Element part : entry.children()) {
                        final String tag = part.tagName();
                        if (tag.equals("a")) {
                            parseItem(part, categoryTopic, t);
                        } else if (tag.equals("dl")) {
                            // recurse on the dt, which carries both the heading and the list
                            parseCategory(entry, categoryTopic, t);
                        }
                    }
                }
            }
        }
    }
}

From source file:webcralwerproject1.Webcrawler.java

/**
 * Strips boilerplate markup from every file in the current crawl directory and
 * appends the remaining text of each document to a single content file.
 *
 * <p>For each regular file, the elements matched by a fixed list of selectors are
 * removed, then short or lone links inside the body are dropped, and the document
 * text is written out. A {@code "<br>"} separator is written after every directory
 * entry, including entries that are not regular files.
 *
 * @return the path of the content file, or {@code null} when the crawl directory
 *         does not exist; the path is returned even when processing failed part-way
 */
public String contentprocessor() {
    File folder = new File(DirectoryName + "/" + crawlcount);
    String contentprocessfile = "./crawler" + crawlcount + "content.html";
    if (!folder.exists()) {
        return null;
    }

    // Removed in this order before the text is extracted.
    String[] boilerplateSelectors = { "nav", "head", "link", "style", "meta", "script", "figure", "img",
            "footer", "input[type = search]", "form", "button", "video", "div:empty", "div#footer", "div#id",
            "div#nav", "div#navigation", "div.footer", "div.header", "li > a[href]" };

    FileWriter f_write = null;
    try {
        // listFiles() returns null when the path is not a readable directory
        File[] listOfFiles = folder.listFiles();
        f_write = new FileWriter(contentprocessfile, true);

        if (listOfFiles != null) {
            //Open repo directory and loop through all files
            for (File file : listOfFiles) {
                if (file.isFile()) {
                    Document doc = Jsoup.parse(file, "UTF-8");
                    for (String selector : boilerplateSelectors) {
                        doc.select(selector).remove();
                    }

                    Elements linksOnPage = doc.select("body a[href]");
                    for (Element link : linksOnPage) {
                        // links with almost no content carry no title text
                        boolean tooShort = link.html().length() <= 4;
                        // a link that is the only child of its parent is dropped too
                        if (tooShort || link.parentNode().childNodeSize() == 1) {
                            link.remove();
                        }
                    }
                    f_write.write(doc.text());
                }
                f_write.write("<br>");
            }
        }
    } catch (Exception e) {
        System.out.println("Inside Contentprocessor" + e);
    } finally {
        // Close in finally so the writer is flushed and released even after a failure.
        if (f_write != null) {
            try {
                f_write.close();
            } catch (Exception e) {
                System.out.println("Inside Contentprocessor" + e);
            }
        }
    }

    return contentprocessfile;
}

From source file:xxx.web.comments.debates.impl.ProConOrgParser.java

/**
 * Parses a debate page into a {@link Debate} with its pro and con arguments.
 *
 * <p>The title is the text of the first {@code h2}. The arguments are read from the
 * table row that follows the row containing the bold "PRO (yes)" header: the first
 * cell of that row holds the pro side, the second cell the con side.
 *
 * @param inputStream HTML of the page, read as UTF-8; not closed by this method
 * @return the parsed debate, or {@code null} when the page has no {@code h2}, no
 *         "PRO (yes)" header with a following row, or fewer than two answer cells
 * @throws IOException if the stream cannot be read
 */
@Override
public Debate parseDebate(InputStream inputStream) throws IOException {
    Debate result = new Debate();

    Document doc = Jsoup.parse(inputStream, "UTF-8", "http://www.procon.org/");

    // title
    Element body = doc.body();
    Element titleElement = body.select("h2").first();
    //        Elements debateTitleElements = body.select("p[class=title]").select("p[style]");

    if (titleElement == null) {
        // not a debate
        return null;
    }

    result.setTitle(Utils.normalize(titleElement.text()));

    Elements proConTr = body.select("tr > td > b:contains(PRO \\(yes\\))");

    // Walk header <b> -> its cell -> its row -> the row after it. jsoup returns empty
    // collections rather than null, so only first() and the sibling need checking.
    Element headerCell = proConTr.parents().first();
    Element headerRow = headerCell == null ? null : headerCell.parents().first();
    Element trAnswers = headerRow == null ? null : headerRow.nextElementSibling();

    if (trAnswers == null) {
        // not a pro-con debate
        return null;
    }

    Elements answerCells = trAnswers.select("td");
    if (answerCells.size() < 2) {
        // the answers row lacks a pro or a con cell: not a pro-con debate either
        return null;
    }

    collectArguments(answerCells.get(0), "pro", result);
    collectArguments(answerCells.get(1), "con", result);

    // show some stats:
    Map<String, Integer> map = new HashMap<>();
    map.put("pro", 0);
    map.put("con", 0);
    for (Argument argument : result.getArgumentList()) {
        map.put(argument.getStance(), map.get(argument.getStance()) + 1);
    }
    System.out.println(map);

    return result;
}

/**
 * Adds one argument per {@code blockquote > div[class=editortext]} found in the
 * given cell. Every candidate consumes an id, even when its extracted text is empty
 * and it is reported on stderr instead of being added.
 */
private void collectArguments(Element sideCell, String stance, Debate result) {
    for (Element text : sideCell.select("blockquote > div[class=editortext]")) {
        Argument argument = new Argument();
        argument.setStance(stance);
        argument.setText(extractPlainTextFromTextElement(text));
        argument.setOriginalHTML(text.html());

        // set ID
        idCounter++;
        argument.setId("pcq_" + idCounter);

        if (!argument.getText().isEmpty()) {
            result.getArgumentList().add(argument);
        } else {
            System.err.println("Failed to extract text from " + text.html());
        }
    }
}