Example usage for org.jsoup.select Elements toString

List of usage examples for org.jsoup.select Elements toString

Introduction

On this page you can find example usages of org.jsoup.select.Elements.toString().

Prototype

@Override
public String toString() 

Source Link

Documentation

Get the combined outer HTML of all matched elements.

Usage

From source file:org.sbs.goodcrawler.extractor.selector.IntegerElementCssSelector.java

/**
 * Selects elements from the current document with this selector's expression,
 * keeps only the digits of the extracted text (or attribute value) and parses
 * them as an integer.
 *
 * @return the cached content when one exists and {@code newDoc} is false;
 *         otherwise the freshly extracted value (after applying any actions);
 *         {@code null} when there is no document, nothing matches, or no
 *         digits were extracted
 * @throws ExtractException if anything fails while selecting or parsing
 */
@Override
public Integer getContent() throws ExtractException {
    Elements elements = null;
    try {
        // Reuse the previously extracted value unless newDoc is set.
        if (null != content && !newDoc) {
            return content;
        }
        if (null != document) {
            elements = super.document.select(value);
            if (elements.isEmpty())
                return null;
            String temp;
            switch ($Attr) {
            case text:
                temp = CharMatcher.DIGIT.retainFrom(getExtractText(elements));
                break;
            default:
                temp = CharMatcher.DIGIT.retainFrom(getExtractAttr(elements, attr));
                break;
            }

            if (StringUtils.isNotBlank(temp)) {
                Integer integer = Integer.parseInt(temp);
                if (null != actions && actions.size() > 0) {
                    // Every action receives the parsed integer; the result of the
                    // last action is what ends up stored in content.
                    for (IntegerSelectorAction action : actions) {
                        this.content = action.doAction(integer);
                    }
                } else {
                    this.content = integer;
                }
                newDoc = false;
                return content;
            }
        }
    } catch (Exception e) {
        // elements is still null when the failure happened before select()
        // returned; String.valueOf avoids a NullPointerException here that
        // would otherwise replace the ExtractException thrown below.
        log.error(String.valueOf(elements), e);
        throw new ExtractException("????:" + e.getMessage());
    }
    return null;
}

From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java

/**
 * Get movie meta data from aebn.net.
 * <p>
 * Returns the metadata already attached to the search result when present.
 * Otherwise the AEBN id is taken from the previous search result or from the
 * options, the movie detail page is downloaded and title, artwork URLs,
 * runtime, year, collection, studio, genres, certification, plot/tagline,
 * actors and director are scraped from it.
 *
 * @param options scrape options carrying the previous search result and/or the AEBN id
 * @return the collected metadata; only partially filled when no valid id is
 *         available or when downloading/parsing fails (such failures are logged)
 * @throws Exception if something fails outside the download/parse section,
 *                   e.g. an id that cannot be parsed as an integer
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("AEBN: getMetadata() {}", options);

    // check if there is already meta data present in the result
    if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) {
        LOGGER.debug("AEBN: return metadata from cache");
        return options.getResult().getMediaMetadata();
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    Elements elements = null;
    Element element = null;
    Integer aebnId = 0;

    // get AebnId from previous search result
    if ((options.getResult() != null) && (options.getResult().getId() != null)) {
        aebnId = Integer.parseInt(options.getResult().getId());
        LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId);
        // preset some values from search result (if there is one)
        // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy".
        md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle()));
        md.storeMetadata(MediaMetadata.TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getTitle()));
    }

    // or get AebnId from options
    if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) {
        LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID));
        aebnId = Integer.parseInt(options.getId(AEBNID));
    }

    if (!isValidAebnId(aebnId)) {
        LOGGER.warn("AEBN: no or incorrect aebnId, aborting");
        return md;
    }

    // ID
    md.setId(providerInfo.getId(), aebnId);
    LOGGER.debug("AEBN: aebnId({})", aebnId);

    // Base download url for data scraping
    String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId;
    String locale = options.getLanguage().name();
    if (!StringUtils.isBlank(locale)) {
        downloadUrl = downloadUrl + "&locale=" + locale;
        LOGGER.debug("AEBN: used locale({})", locale);
    }

    // begin download and scrape
    try {
        LOGGER.debug("AEBN: download movie detail page");
        Url url = new Url(downloadUrl);
        InputStream in = url.getInputStream();
        Document document;
        try {
            document = Jsoup.parse(in, "UTF-8", "");
        } finally {
            // close the stream even when parsing fails
            in.close();
        }

        // Title
        // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1>
        LOGGER.debug("AEBN: parse title");
        elements = document.getElementsByAttributeValue("class", "md-movieTitle");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieTitle = cleanString(element.text());
            LOGGER.debug("AEBN: title({})", movieTitle);
            md.storeMetadata(MediaMetadata.TITLE, movieTitle);
        }

        // Poster
        // front cover:
        // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg
        String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg";
        md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl);

        // Fanart/Background
        // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg
        // <img class="sceneThumbnail" alt="Scene Thumbnail" title="Scene Thumbnail" onError="..."
        // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." />
        LOGGER.debug("AEBN: parse fanart / scene thumbs");
        elements = document.getElementsByAttributeValue("class", "SceneThumbnail");
        LOGGER.debug("AEBN: {} elements found", elements.size());
        int i = 1;
        for (Element anchor : elements) {
            String backgroundUrl = anchor.attr("src");
            LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl);
            md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl);
            i++;
        }

        // Runtime
        LOGGER.debug("AEBN: parse runtime");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieRuntime = cleanString(element.attr("content"));
            movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M");
            LOGGER.debug("AEBN: runtime({})", movieRuntime);
            md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime);
        }

        // Year
        LOGGER.debug("AEBN: parse year");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieYear = cleanString(element.attr("content"));
            movieYear = StrgUtils.substr(movieYear, "(\\d+)-");
            LOGGER.debug("AEBN: year({})", movieYear);
            md.storeMetadata(MediaMetadata.YEAR, movieYear);
        }

        // Series (Collection)
        LOGGER.debug("AEBN: parse collection");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieCollection = cleanString(element.text());

            // Fake a TMDB_SET based on the hash value of the collection name
            int movieCollectionHash = movieCollection.hashCode();

            md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection);
            md.storeMetadata(MediaMetadata.TMDB_SET, movieCollectionHash);
            LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash);
        }

        // Studio
        LOGGER.debug("AEBN: parse studio");
        elements = document.getElementsByAttributeValue("id", "md-details")
                .select("[itemprop=productionCompany]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String movieStudio = cleanString(elements.first().text());
            LOGGER.debug("AEBN: studio({})", movieStudio);
            md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio);
        }

        // Genre
        LOGGER.debug("AEBN: parse genre");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]");
        for (Element g : elements) {
            md.addGenre(getTmmGenre(g.text()));
        }
        // add basic genre, since all genres at AEBN could be summarised
        // into this one
        md.addGenre(MediaGenres.EROTIC);

        // Certification
        // no data scrapeable---but obviously it's adult only, so simply
        // generate it
        String movieCertification = null;
        Certification certification = null;
        String country = options.getCountry().getAlpha2();
        LOGGER.debug("AEBN: generate certification for {}", country);
        // @formatter:off
        if (country.equals("DE")) {
            movieCertification = "FSK 18";
        }
        if (country.equals("US")) {
            movieCertification = "NC-17";
        }
        if (country.equals("GB")) {
            movieCertification = "R18";
        }
        if (country.equals("FR")) {
            movieCertification = "18";
        }
        if (country.equals("ES")) {
            movieCertification = "PX";
        }
        if (country.equals("JP")) {
            movieCertification = "R18+";
        }
        if (country.equals("IT")) {
            movieCertification = "V.M.18";
        }
        if (country.equals("NL")) {
            movieCertification = "16";
        }
        // @formatter:on
        certification = Certification.getCertification(options.getCountry(), movieCertification);
        if (certification != null) {
            LOGGER.debug("AEBN: certification({})", certification);
            md.addCertification(certification);
        }

        // Plot and Tagline
        LOGGER.debug("AEBN: parse plot");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=about]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String moviePlot = cleanString(elements.first().text());
            md.storeMetadata(MediaMetadata.PLOT, moviePlot);
            // no separate tagline available, so extract the first sentence
            // from the movie plot
            String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])");
            LOGGER.debug("AEBN: tagline({})", movieTagline);
            md.storeMetadata(MediaMetadata.TAGLINE, movieTagline);
        }

        // Actors
        LOGGER.debug("AEBN: parse actors");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]");
        LOGGER.debug("AEBN: {} actors found", elements.size());
        for (Element anchor : elements) {
            String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)");
            // first() returns null when the actor entry has no name element;
            // treat that as an empty name so the entry is skipped instead of
            // aborting the whole scrape with a NullPointerException
            Element actorNameElement = anchor.select("[itemprop=name]").first();
            String actorname = (actorNameElement == null) ? "" : cleanString(actorNameElement.text());
            String actordetailsurl = BASE_DATAURL + anchor.attr("href");
            if (!actorname.isEmpty()) {
                LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname,
                        actordetailsurl);
                MediaCastMember cm = new MediaCastMember();
                cm.setType(MediaCastMember.CastType.ACTOR);
                cm.setName(actorname);
                if (!actorid.isEmpty()) {
                    cm.setId(actorid);
                }

                // Actor detail page
                try {
                    Url starurl = new Url(actordetailsurl);
                    InputStream starurlstream = starurl.getInputStream();
                    Document stardocument;
                    try {
                        stardocument = Jsoup.parse(starurlstream, "UTF-8", "");
                    } finally {
                        // close the stream even when parsing fails
                        starurlstream.close();
                    }
                    Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo");
                    if (elements2.size() == 0) {
                        LOGGER.debug("AEBN: no additional actor details found");
                    } else {
                        // Actor image (may be missing; the gallery is still processed then)
                        Element actorImageElement = elements2.select("[itemprop=image]").first();
                        String actorimage = (actorImageElement == null) ? "" : actorImageElement.attr("src");
                        LOGGER.debug("AEBN: actor image({})", actorimage);
                        if (!actorimage.isEmpty()) {
                            cm.setImageUrl(actorimage);
                        }
                        // Actor 'fanart' images
                        // unsure if this is ever shown in tmm
                        elements2 = stardocument.getElementsByAttributeValue("class", "StarDetailGallery")
                                .select("a");
                        LOGGER.debug("AEBN: {} gallery images found", elements2.size());
                        for (Element thumbnail : elements2) {
                            LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href"));
                            cm.addFanart(thumbnail.attr("href"));
                        }
                    }
                } catch (Exception e) {
                    // best effort: the cast member is still added without detail data
                    LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e);
                }

                md.addCastMember(cm);
            }
        }

        // Director
        LOGGER.debug("AEBN: parse director");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)");
            // a missing name element is handled like an empty name
            Element directorNameElement = elements.select("[itemprop=name]").first();
            String directorname = (directorNameElement == null) ? ""
                    : cleanString(directorNameElement.text());
            if (!directorname.isEmpty()) {
                MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR);
                cm.setName(directorname);
                if (!directorid.isEmpty()) {
                    cm.setId(directorid);
                }
                cm.setImageUrl("");
                md.addCastMember(cm);
                LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname);
            }
        }

        // Original Title
        // if we have no original title, just copy the title
        if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
        }
    } catch (Exception e) {
        // options.getResult() is null when the id came from options.getId(AEBNID),
        // so report the URL that was actually requested instead of dereferencing it
        LOGGER.error("AEBN: Error parsing {}: {}", downloadUrl, e);
    }

    return md;
}

From source file:poe.trade.assist.UniquesListSearchGenerator.java

/**
 * Downloads every wiki page named in {@code lists}, reads each sortable wiki
 * table on it and writes one tab-separated line per item row (name, image
 * formula, required level, base, mods, search links, wiki link) to
 * {@code uniqueslist.txt}.
 *
 * @param args not used
 * @throws Exception if a download fails, a row has more than two mod groups,
 *                   or the output file cannot be written
 */
public static void main(String[] args) throws Exception {
    final String wikiRoot = "http://pathofexile.gamepedia.com/";
    // Spreadsheet expression used to join several mod lines inside one cell.
    final String cellBreak = "\"&CHAR(10)&\"";

    List<String> outputLines = new LinkedList<>();
    outputLines.add(
            "Name   Art   Req.Level   Base   Mods   TaslismanSC   TalismanHC   Standard   Hardcore   poewiki");

    for (String list : lists) {
        HttpResponse<String> response = Unirest.get(wikiRoot + list)
                .header("User-Agent",
                        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0")
                .asString();
        Document page = Jsoup.parse(response.getBody());

        for (Element table : page.select("table.wikitable.sortable")) {
            boolean headerSeen = false;
            boolean hasRequiredLevel = false;

            for (Element row : table.select("tr")) {
                // The first row of each table holds the column headers.
                if (!headerSeen) {
                    hasRequiredLevel = !row.select("abbr[title=\"Required Level\"]").isEmpty();
                    headerSeen = true;
                    continue;
                }

                String name = row.child(0).child(0).attr("title");
                System.out.println("Now processing: " + name);
                String imgurl = "=IMAGE(\"" + row.select("img").attr("src") + "\", 3)";
                String base = row.child(1).child(0).attr("title");

                String reqLvl = "0";
                if (hasRequiredLevel) {
                    String levelText = row.child(2).text();
                    if (!levelText.equalsIgnoreCase("n/a")) {
                        reqLvl = levelText;
                    }
                }

                StringBuilder modCell = new StringBuilder("=\"");
                Elements mods = row.select("span.itemboxstatsgroup.text-mod");
                if (!mods.isEmpty()) {
                    if (mods.size() > 2) {
                        throw new Exception("mods.size() is > 2. " + name + " - " + mods.toString());
                    }
                    // With two groups the first is the implicit mod, the second the explicit ones.
                    boolean hasImplicit = mods.size() > 1;
                    String implicitText = "";
                    if (hasImplicit) {
                        implicitText = mods.get(0).text();
                    }
                    Element explicitGroup = mods.get(hasImplicit ? 1 : 0);

                    String explicitText = explicitGroup.textNodes().stream()
                            .map(node -> node.text().trim())
                            .filter(line -> !line.isEmpty())
                            .collect(Collectors.joining(cellBreak));
                    String childText = explicitGroup.children().stream()
                            .filter(Element::hasText)
                            .map(child -> child.text().trim())
                            .collect(Collectors.joining(cellBreak));
                    if (!childText.isEmpty()) {
                        explicitText = explicitText + cellBreak + childText;
                    }

                    modCell.append(implicitText);
                    if (hasImplicit) {
                        modCell.append(cellBreak).append("--------------").append(cellBreak);
                    }
                    modCell.append(explicitText);
                }
                modCell.append("\"");
                String mod = modCell.toString();

                String nameenc = URLEncoder.encode(name, "UTF-8");
                String sc = hyperlink(getSearchURL("Standard", nameenc));
                String hc = hyperlink(getSearchURL("Hardcore", nameenc));
                String tsc = hyperlink(getSearchURL("Talisman", nameenc));
                String thc = hyperlink(getSearchURL("Talisman+Hardcore", nameenc));
                String poewikiurl = hyperlink(wikiRoot + name.replace(' ', '_'));

                outputLines.add(format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", name, imgurl, reqLvl, base,
                        mod, tsc, thc, sc, hc, poewikiurl));
                Thread.sleep(1000);
            }
        }
    }
    FileUtils.writeLines(new File("uniqueslist.txt"), outputLines);
}

From source file:ru.dmitry.mamishev.URLParse.HtmlString.java

/**
 * Parses the stored HTML string and builds a {@code GazInfo} from it.
 * <p>
 * The bill number is the text of the second {@code <ul>} element. The
 * remaining values come from the {@code <b>} elements found inside the
 * {@code <ul>} elements: the first two are passed through as-is and the
 * third is split with {@code SPLIT} into date and payment.
 *
 * @return the extracted bill information; all fields are empty strings when
 *         fewer than three {@code <b>} elements are present
 */
public GazInfo getInfoBill() {
    String html = this.htmlString;
    Document doc = Jsoup.parse(html);
    Elements ul = doc.getElementsByTag("ul");
    String numBill = ul.eq(1).text();
    // Re-parse the combined outer HTML of all <ul> elements so that only
    // <b> tags located inside lists are considered.
    Document bElements = Jsoup.parseBodyFragment(ul.toString());
    Elements b = bElements.getElementsByTag("b");
    String date = "";
    String pay = "";
    if (b.size() <= 2) {
        return new GazInfo("", "", date, pay, "");
    }
    String[] ss = SPLIT.split(b.get(2).text());
    // Guard each index on its own: a split producing a single token used to
    // raise ArrayIndexOutOfBoundsException when reading ss[1].
    if (ss.length > 0) {
        date = ss[0];
    }
    if (ss.length > 1) {
        pay = ss[1];
    }
    return new GazInfo(b.get(0).text(), b.get(1).text(), date, pay, numBill);
}