Example usage for org.jsoup.nodes Element ownText

List of usage examples for org.jsoup.nodes Element ownText

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.ownText().

Prototype

public String ownText() 

Source Link

Documentation

Gets the text owned by this element only; does not get the combined text of all children.

Usage

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Scrapes the metadata of one movie from IMDb and optionally merges data from TMDB into it.
 * <p>
 * If the search result carried in {@code options} already holds a metadata object, that object is returned
 * unchanged. Otherwise the IMDb id is taken from the search result or, failing that, from the options; when
 * no valid id is available the freshly created (empty) metadata object is returned.
 * <p>
 * The /reference, /plotsummary and /releaseinfo pages are fetched through {@link ImdbWorker}s and handed to
 * the corresponding parse methods. The TMDB request is only submitted when the "useTmdb" or
 * "scrapeCollectionInfo" option is set; merging its result is best effort and never fails the scrape.
 *
 * @param options
 *          the scrape options (search result, IMDb id, language, country)
 * @return the metadata for the movie
 * @throws Exception
 *           any exception raised while waiting for or parsing the IMDb pages
 */
MediaMetadata getMovieMetadata(MediaScrapeOptions options) throws Exception {
    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    // check if there is a md in the result
    if (options.getResult() != null && options.getResult().getMediaMetadata() != null) {
        LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult());
        return options.getResult().getMediaMetadata();
    }

    String imdbId = "";

    // imdbId from searchResult
    if (options.getResult() != null) {
        imdbId = options.getResult().getIMDBId();
    }

    // imdbid from scraper option
    if (!MetadataUtil.isValidImdbId(imdbId)) {
        imdbId = options.getImdbId();
    }

    if (!MetadataUtil.isValidImdbId(imdbId)) {
        return md;
    }

    LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId);
    md.setId(providerInfo.getId(), imdbId);

    ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<>(executor);
    ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<>(executor);

    // worker for imdb request (/reference) (everytime from www.imdb.com)
    // StringBuilder sb = new StringBuilder(imdbSite.getSite());
    StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/reference");
    Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    Future<Document> futureReference = compSvcImdb.submit(worker);

    // worker for imdb request (/plotsummary) (from chosen site)
    Future<Document> futurePlotsummary;
    sb = new StringBuilder(imdbSite.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/plotsummary");

    worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    futurePlotsummary = compSvcImdb.submit(worker);

    // worker for tmdb request
    Future<MediaMetadata> futureTmdb = null;
    if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")
            || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) {
        Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry());
        futureTmdb = compSvcTmdb.submit(worker2);
    }

    Document doc;
    doc = futureReference.get();
    parseReferencePage(doc, options, md);

    /*
     * plot from /plotsummary
     */
    doc = futurePlotsummary.get();
    parsePlotsummaryPage(doc, options, md);

    // title also from the chosen site if that site is not IMDB_COM
    if (imdbSite != ImdbSiteDefinition.IMDB_COM) {
        Element title = doc.getElementById("tn15title");
        if (title != null) {
            Element element;
            // title
            Elements elements = title.getElementsByClass("main");
            if (elements.size() > 0) {
                element = elements.first();
                String movieTitle = cleanString(element.ownText());
                md.setTitle(movieTitle);
            }
        }
    }

    // get the release info page
    Future<Document> futureReleaseinfo;
    sb = new StringBuilder(imdbSite.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/releaseinfo");
    worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    futureReleaseinfo = compSvcImdb.submit(worker);
    doc = futureReleaseinfo.get();
    // parse original title here!!
    parseReleaseinfoPageAKAs(doc, options, md);

    // did we get a release date?
    if (md.getReleaseDate() == null
            || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("localReleaseDate")) {
        parseReleaseinfoPage(doc, options, md);
    }

    // get data from tmdb?
    if (futureTmdb != null && (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")
            || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo"))) {
        try {
            MediaMetadata tmdbMd = futureTmdb.get();
            // the worker may deliver no result at all; every access below needs a non-null object
            if (tmdbMd != null) {
                if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) {
                    // tmdbid
                    md.setId(MediaMetadata.TMDB, tmdbMd.getId(MediaMetadata.TMDB));
                    // title
                    if (StringUtils.isNotBlank(tmdbMd.getTitle())) {
                        md.setTitle(tmdbMd.getTitle());
                    }
                    // original title
                    if (StringUtils.isNotBlank(tmdbMd.getOriginalTitle())) {
                        md.setOriginalTitle(tmdbMd.getOriginalTitle());
                    }
                    // tagline
                    if (StringUtils.isNotBlank(tmdbMd.getTagline())) {
                        md.setTagline(tmdbMd.getTagline());
                    }
                    // plot
                    if (StringUtils.isNotBlank(tmdbMd.getPlot())) {
                        md.setPlot(tmdbMd.getPlot());
                    }
                    // collection info
                    if (StringUtils.isNotBlank(tmdbMd.getCollectionName())) {
                        md.setCollectionName(tmdbMd.getCollectionName());
                        md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET));
                    }
                }
                if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) {
                    md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET));
                    md.setCollectionName(tmdbMd.getCollectionName());
                }
                md.setId(tmdbMd.getProviderId(), tmdbMd.getId(tmdbMd.getProviderId()));
            }
        } catch (InterruptedException e) {
            // keep the interrupt visible to the caller's thread handling; TMDB data stays optional
            Thread.currentThread().interrupt();
            LOGGER.warn("IMDB: interrupted while waiting for TMDB data");
        } catch (Exception e) {
            // TMDB data is an optional add-on: report the problem but keep the IMDb result
            LOGGER.warn("IMDB: could not merge data from TMDB: " + e.getMessage());
        }
    }

    // if we have still no original title, take the title
    if (StringUtils.isBlank(md.getOriginalTitle())) {
        md.setOriginalTitle(md.getTitle());
    }

    // populate id
    md.setId(ImdbMetadataProvider.providerInfo.getId(), imdbId);

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Parses the IMDb /reference page of a title and copies everything it finds into {@code md}.
 * <p>
 * Read from the page: title, year, poster, rating and vote count, top 250 rank, release date, tagline (only
 * when the "useTmdb" option is off), genres, runtime, countries, spoken languages, certification, directors,
 * actors, writers, producers and production companies. Each item is optional; an item that is missing or
 * cannot be parsed is skipped.
 *
 * @param doc
 *          the parsed /reference page
 * @param options
 *          the scrape options; IMDb id, language and country are read from it
 * @param md
 *          the metadata object to fill
 * @return the {@code md} instance that was passed in
 */
protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    /*
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // title
    Element title = doc.getElementsByAttributeValue("name", "title").first();
    if (title != null) {
        String movieTitle = cleanString(title.attr("content"));
        int yearStart = movieTitle.lastIndexOf("(");
        if (yearStart > 0) {
            movieTitle = movieTitle.substring(0, yearStart - 1).trim();
            md.setTitle(movieTitle);
        }
    }

    // original title and year
    Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first();
    if (originalTitleYear != null) {
        String content = originalTitleYear.attr("content");
        int startOfYear = content.lastIndexOf("(");
        if (startOfYear > 0) {
            // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page...
            // String originalTitle = content.substring(0, startOfYear - 1).trim();
            // md.setOriginalTitle(originalTitle);

            String yearText = content.substring(startOfYear);

            // search year
            Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}");
            Matcher matcher = yearPattern.matcher(yearText);
            while (matcher.find()) {
                if (matcher.group(0) != null) {
                    String movieYear = matcher.group(0);
                    try {
                        md.setYear(Integer.parseInt(movieYear));
                        break;
                    } catch (Exception ignored) {
                        // best effort: try the next match
                    }
                }
            }
        }
    }

    // poster
    Element poster = doc.getElementsByAttributeValue("property", "og:image").first();
    if (poster != null) {
        String posterUrl = poster.attr("content");

        // cut everything between the first "_" of the file name and the extension
        int fileStart = posterUrl.lastIndexOf("/");
        if (fileStart > 0) {
            int parameterStart = posterUrl.indexOf("_", fileStart);
            if (parameterStart > 0) {
                int startOfExtension = posterUrl.lastIndexOf(".");
                if (startOfExtension > parameterStart) {
                    posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension);

                }
            }
        }
        processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count
    Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first();
    if (ratingElement != null) {
        String ratingAsString = ratingElement.ownText().replace(",", ".");
        try {
            md.setRating(Float.valueOf(ratingAsString));
        } catch (Exception ignored) {
            // best effort: leave the rating untouched
        }

        Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first();
        if (votesElement != null) {
            String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim();
            try {
                md.setVoteCount(Integer.parseInt(countAsString));
            } catch (Exception ignored) {
                // best effort: leave the vote count untouched
            }
        }
    }
    // top250
    Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first();
    if (topRatedElement != null) {
        Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})");
        Matcher matcher = topPattern.matcher(topRatedElement.ownText());
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                try {
                    String top250Text = matcher.group(1);
                    md.setTop250(Integer.parseInt(top250Text));
                } catch (Exception ignored) {
                    // best effort: leave the rank untouched
                }
            }
        }
    }

    // releasedate - the link can only be located when the options carry an IMDb id
    String imdbId = options.getImdbId();
    Element releaseDateElement = null;
    if (imdbId != null) {
        releaseDateElement = doc
                .getElementsByAttributeValue("href", "/title/" + imdbId.toLowerCase(Locale.ROOT) + "/releaseinfo")
                .first();
    }
    if (releaseDateElement != null) {
        String releaseDateText = releaseDateElement.ownText();
        int startOfCountry = releaseDateText.indexOf("(");
        if (startOfCountry > 0) {
            releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim();
        }
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US);
            Date parsedDate = sdf.parse(releaseDateText);
            md.setReleaseDate(parsedDate);
        } catch (ParseException otherformat) {
            // second attempt: the date may come without a day
            try {
                SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US);
                Date parsedDate = sdf.parse(releaseDateText);
                md.setReleaseDate(parsedDate);
            } catch (ParseException ignored) {
                // neither format matched: no release date from this page
            }
        }
    }

    Elements elements = doc.getElementsByClass("ipl-zebra-list__label");
    for (Element element : elements) {
        // only parse tds
        if (!"td".equals(element.tag().getName())) {
            continue;
        }

        String elementText = element.ownText();

        if (elementText.equals("Taglines")) {
            if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) {
                Element taglineElement = element.nextElementSibling();
                if (taglineElement != null) {
                    // NOTE(review): replaceAll("", "") does not change the text as written here; possibly a
                    // character to strip got lost from the pattern - confirm against the upstream source
                    String tagline = cleanString(taglineElement.ownText().replaceAll("", ""));
                    md.setTagline(tagline);
                }
            }
        }

        if (elementText.equals("Genres")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/");

                for (Element genreElement : genreElements) {
                    String genreText = genreElement.ownText();
                    md.addGenre(getTmmGenre(genreText));
                }
            }
        }

        /*
         * Old HTML, but maybe the same content format <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition)
         * | 178 min (extended cut)</div></div>
         */
        if (elementText.equals("Runtime")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first();
                if (runtimeElement != null) {
                    String first = runtimeElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.setRuntime(runtime);
                }
            }
        }

        if (elementText.equals("Country")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/");
                Pattern pattern = Pattern.compile("/country/(.*)");

                for (Element countryElement : countryElements) {
                    Matcher matcher = pattern.matcher(countryElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addCountry(LanguageUtils.getLocalizedCountryForLanguage(
                                    options.getLanguage().getLanguage(), countryElement.text(),
                                    matcher.group(1)));
                        } else {
                            md.addCountry(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Language")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements languageElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/language/");
                Pattern pattern = Pattern.compile("/language/(.*)");

                for (Element languageElement : languageElements) {
                    Matcher matcher = pattern.matcher(languageElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString(
                                    options.getLanguage(), languageElement.text(), matcher.group(1)));
                        } else {
                            md.addSpokenLanguage(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Certification")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                String countryCode = options.getCountry().getAlpha2();
                Certification certification = findCertification(nextElement
                        .getElementsByAttributeValueStarting("href", "/search/title?certificates=" + countryCode),
                        options);

                // nothing usable under "DE": retry with the links filed under the code "XWG"
                if (certification == null && countryCode.equals("DE")) {
                    certification = findCertification(nextElement
                            .getElementsByAttributeValueStarting("href", "/search/title?certificates=XWG"),
                            options);
                }

                if (certification != null) {
                    md.addCertification(certification);
                }
            }
        }
    }

    // director
    Element directorsElement = findSectionAfterHeader(doc.getElementById("directors"));
    if (directorsElement != null) {
        for (Element directorElement : directorsElement.getElementsByClass("name")) {
            String director = directorElement.text().trim();

            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR);
            cm.setName(director);
            md.addCastMember(cm);
        }
    }

    // actors
    Element castTableElement = doc.getElementsByClass("cast_list").first();
    if (castTableElement != null) {
        Elements tr = castTableElement.getElementsByTag("tr");
        for (Element row : tr) {
            MediaCastMember cm = parseCastMember(row);
            if (cm != null && StringUtils.isNotEmpty(cm.getName())
                    && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    // writers
    Element writersElement = findSectionAfterHeader(doc.getElementById("writers"));
    if (writersElement != null) {
        Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element writerElement : writersElements) {
            String writer = cleanString(writerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER);
            cm.setName(writer);
            md.addCastMember(cm);
        }
    }

    // producers
    Element producersElement = findSectionAfterHeader(doc.getElementById("producers"));
    if (producersElement != null) {
        Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element producerElement : producersElements) {
            String producer = cleanString(producerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER);
            cm.setName(producer);
            md.addCastMember(cm);
        }
    }

    // production companies
    Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title");
    Element prodCompHeaderElement = null;

    for (Element possibleProdCompHeaderEl : prodCompHeaderElements) {
        if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) {
            prodCompHeaderElement = possibleProdCompHeaderEl;
            break;
        }
    }

    Element prodCompSection = findSectionAfterHeader(prodCompHeaderElement);
    if (prodCompSection != null) {
        Elements prodCompElements = prodCompSection.getElementsByAttributeValueStarting("href",
                "/company/");

        for (Element prodCompElement : prodCompElements) {
            String prodComp = prodCompElement.ownText();
            md.addProductionCompany(prodComp);
        }
    }

    return md;
}

/**
 * Walks up from {@code start} to the closest element (including {@code start} itself) whose tag is
 * "header" and returns the element sibling that follows this header.
 *
 * @param start
 *          the element to start from; may be {@code null}
 * @return the element following the header, or {@code null} if {@code start} is {@code null}, no header
 *         is found, or the header has no following sibling
 */
private static Element findSectionAfterHeader(Element start) {
    Element current = start;
    // compare tag names by value; reference comparison only works for interned strings
    while (current != null && !"header".equals(current.tag().getName())) {
        current = current.parent();
    }
    return current == null ? null : current.nextElementSibling();
}

/**
 * Returns the first certification that can be derived from the given certification links.
 * <p>
 * For every link, the text after the first ":" (if there is one with text behind it) is passed to
 * {@code Certification.getCertification} together with the country from {@code options}.
 *
 * @param certificationElements
 *          the candidate links
 * @param options
 *          the scrape options providing the country
 * @return the first non-null certification, or {@code null} if none of the links yields one
 */
private static Certification findCertification(Elements certificationElements, MediaScrapeOptions options) {
    for (Element certificationElement : certificationElements) {
        String certText = certificationElement.ownText();
        int startOfCert = certText.indexOf(":");
        if (startOfCert > 0 && certText.length() > startOfCert + 1) {
            certText = certText.substring(startOfCert + 1);
        }

        Certification certification = Certification.getCertification(options.getCountry(), certText);
        if (certification != null) {
            return certification;
        }
    }
    return null;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Reads the plot from an IMDb /plotsummary page and stores it in {@code md}.
 * <p>
 * For {@code ImdbSiteDefinition.IMDB_COM} the first entry of the "plot-summaries-content" list is used
 * (the synopsis is deliberately not used because it contains spoilers); its author block is removed from
 * the document before the text is read. For every other site the own text of the element with the id
 * "swiki.2.1" is used.
 *
 * @param doc
 *          the parsed /plotsummary page
 * @param options
 *          the scrape options (not read by this method)
 * @param md
 *          the metadata object receiving the plot
 * @return the {@code md} instance that was passed in
 */
protected MediaMetadata parsePlotsummaryPage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    // every site except imdb.com shares one (different) page structure
    if (getImdbSite() != ImdbSiteDefinition.IMDB_COM) {
        Element wikiElement = doc.getElementById("swiki.2.1");
        if (wikiElement != null) {
            md.setPlot(cleanString(wikiElement.ownText()));
        }
        return md;
    }

    // imdb.com: just take the first summary
    // <li class="ipl-zebra-list__item" id="summary-ps21700000">
    // <p>text text text text </p>
    // <div class="author-container">
    // <em>&mdash;<a href="/search/title?plot_author=author">Author Name</a></em>
    // </div>
    // </li>
    Element summaryList = doc.getElementById("plot-summaries-content");
    if (summaryList == null) {
        return md;
    }

    Elements summaryItems = summaryList.getElementsByClass("ipl-zebra-list__item");
    if (summaryItems.isEmpty()) {
        return md;
    }

    Element firstSummary = summaryItems.get(0);

    // drop the author block so it does not end up in the plot text
    Elements authorBlocks = firstSummary.getElementsByClass("author-container");
    if (!authorBlocks.isEmpty()) {
        authorBlocks.get(0).remove();
    }

    // the placeholder entry for "no summary" must not be taken as plot
    boolean isPlaceholder = "no-summary-content".equals(firstSummary.id());
    if (!isPlaceholder) {
        md.setPlot(cleanString(firstSummary.text()));
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Builds a cast member from one row of the cast table.
 * <p>
 * The name comes from the first element whose "itemprop" attribute starts with "name", the character from
 * the first element with the class "character" (a trailing parenthesised remark is stripped), and the image
 * from the "loadlate" attribute of the first img element, with its scaling parameters replaced by "._UY632".
 *
 * @param row
 *          the table row to parse
 * @return the cast member, or {@code null} if the row contains no name element
 */
protected MediaCastMember parseCastMember(Element row) {
    Element nameNode = row.getElementsByAttributeValueStarting("itemprop", "name").first();
    if (nameNode == null) {
        return null;
    }
    String actorName = cleanString(nameNode.ownText());

    String role = "";
    Element roleNode = row.getElementsByClass("character").first();
    if (roleNode != null) {
        // strip off trailing commentaries like - (120 episodes, 2006-2014)
        role = cleanString(roleNode.text()).replaceAll("\\(.*?\\)$", "").trim();
    }

    String imageUrl = "";
    Element imgNode = row.getElementsByTag("img").first();
    if (imgNode != null) {
        String src = imgNode.attr("loadlate");
        if (!StringUtils.isEmpty(src)) {
            int lastSlash = src.lastIndexOf("/");
            // position of the rescale/crop params inside the file name (only searched after a slash)
            int paramsAt = lastSlash > 0 ? src.indexOf("._", lastSlash) : -1;
            if (paramsAt > 0) {
                int extensionAt = src.lastIndexOf(".");
                if (extensionAt > paramsAt) {
                    // rebuild the path - scaled to 632 px height as in tmdb scraper
                    src = src.substring(0, paramsAt) + "._UY632" + src.substring(extensionAt);
                }
            }
            imageUrl = src;
        }
    }

    MediaCastMember member = new MediaCastMember();
    member.setCharacter(role);
    member.setName(actorName);
    member.setImageUrl(imageUrl);
    return member;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java

/**
 * Parses the episode rows of one season page and appends the episodes found to {@code episodes}.
 * <p>
 * For {@code season == 0} every row containing "Unknown" is taken and numbered consecutively; otherwise the
 * season and episode number are read from the "S&lt;n&gt;, Ep&lt;n&gt;" marker of the row. A row that cannot
 * be parsed is logged and skipped.
 *
 * @param season
 *          the season that was requested (0 for episodes without a known season)
 * @param episodes
 *          the list the parsed episodes are added to
 * @param doc
 *          the parsed episode list page
 * @return {@code false} as soon as a row belongs to another season than the requested one (for
 *         {@code season > 0}); {@code true} otherwise
 */
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
    Pattern unknownPattern = Pattern.compile("Unknown");
    Pattern seasonEpisodePattern = Pattern.compile("S([0-9]*), Ep([0-9]*)");
    int episodeCounter = 0;

    // parse episodes
    Elements tables = doc.getElementsByClass("eplist");
    for (Element table : tables) {
        Elements rows = table.getElementsByClass("list_item");
        for (Element row : rows) {
            Matcher matcher = season == 0 ? unknownPattern.matcher(row.text())
                    : seasonEpisodePattern.matcher(row.text());
            if (matcher.find() && (season == 0 || matcher.groupCount() >= 2)) {
                try {
                    // we found a row containing episode data
                    MediaEpisode ep = new MediaEpisode(providerInfo.getId());

                    // parse season and ep number
                    if (season == 0) {
                        ep.season = season;
                        ep.episode = ++episodeCounter;
                    } else {
                        ep.season = Integer.parseInt(matcher.group(1));
                        ep.episode = Integer.parseInt(matcher.group(2));
                    }

                    // check if we have still valid data
                    if (season > 0 && season != ep.season) {
                        return false;
                    }

                    // get ep title and id
                    Elements anchors = row.getElementsByAttributeValueStarting("href", "/title/tt");
                    for (Element anchor : anchors) {
                        if ("name".equals(anchor.attr("itemprop"))) {
                            ep.title = anchor.text();
                            break;
                        }
                    }

                    // a row without any title link makes get(0) throw; the catch below then skips the row
                    String id = "";
                    Matcher idMatcher = IMDB_ID_PATTERN.matcher(anchors.get(0).attr("href"));
                    while (idMatcher.find()) {
                        if (idMatcher.group(1) != null) {
                            id = idMatcher.group(1);
                        }
                    }

                    if (StringUtils.isNotBlank(id)) {
                        ep.ids.put(providerInfo.getId(), id);
                    }

                    // plot
                    Element plot = row.getElementsByClass("item_description").first();
                    if (plot != null) {
                        ep.plot = plot.ownText();
                    }

                    // rating and rating count
                    Element ratingElement = row.getElementsByClass("ipl-rating-star__rating").first();
                    if (ratingElement != null) {
                        String ratingAsString = ratingElement.ownText().replace(",", ".");
                        try {
                            ep.rating = Float.valueOf(ratingAsString);
                        } catch (Exception ignored) {
                            // best effort: leave the rating untouched
                        }

                        Element votesElement = row.getElementsByClass("ipl-rating-star__total-votes").first();
                        if (votesElement != null) {
                            // also strip non-breaking spaces (trim() leaves them in place), as the movie parser does
                            String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim();
                            try {
                                ep.voteCount = Integer.parseInt(countAsString);
                            } catch (Exception ignored) {
                                // best effort: leave the vote count untouched
                            }
                        }
                    }

                    // release date
                    Element releaseDate = row.getElementsByClass("airdate").first();
                    if (releaseDate != null) {
                        ep.firstAired = releaseDate.ownText();
                    }

                    // poster
                    Element image = row.getElementsByTag("img").first();
                    if (image != null) {
                        // remove the scale/crop parameters from the image url
                        String posterUrl = image.attr("src");
                        posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "");
                        posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "");
                        posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");

                        if (StringUtils.isNotBlank(posterUrl)) {
                            MediaArtwork ma = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(),
                                    MediaArtwork.MediaArtworkType.THUMB);
                            ma.setPreviewUrl(posterUrl);
                            ma.setDefaultUrl(posterUrl);
                            ep.artwork.add(ma);
                        }
                    }

                    episodes.add(ep);
                } catch (Exception e) {
                    LOGGER.warn("failed parsing: " + row.text() + " for ep data; " + e.getMessage());
                }
            }
        }
    }
    return true;
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Searches the Zelluloid site for movies and returns the hits, best score first.
 * <p>
 * The search term is taken from the QUERY search parameter, falling back to TITLE; when both are empty an
 * empty list is returned without any request being made. When the result page contains no {@code hit.php}
 * links and its title does not contain "Suche nach", the page is treated as a movie detail page and a single
 * result is built from it. Otherwise every {@code hit.php} link is turned into a result; links sharing the
 * same movie id are merged into one result.
 *
 * @param options
 *          the search options; the QUERY and TITLE parameters are read from it
 * @return the search results (possibly empty, never {@code null}), sorted and then reversed
 * @throws Exception
 *           if building the search url fails; failures while fetching or parsing the page are logged and
 *           lead to an empty result instead
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception {
    LOGGER.debug("search() " + options.toString());
    List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>();
    String searchUrl = "";
    String searchTerm = "";
    // NOTE(review): never filled in this method, so the imdb branches below are currently inactive
    String imdb = "";

    // only title search
    if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search for everything: " + searchTerm);
    } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search with title: " + searchTerm);
    } else {
        LOGGER.debug("empty searchString");
        return resultList;
    }

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    Document doc = null;
    try {
        Url url = new CachedUrl(searchUrl);
        InputStream in = url.getInputStream();
        try {
            doc = Jsoup.parse(in, PAGE_ENCODING, "");
        } finally {
            // release the stream even when parsing fails
            in.close();
        }
    } catch (Exception e) {
        LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage());

        // clear cache
        CachedUrl.removeCachedFileForUrl(searchUrl);
    }

    if (doc == null) {
        return resultList;
    }

    // only look for movie links
    Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php");
    LOGGER.debug("found " + filme.size() + " search results");
    if (filme.isEmpty()) {
        if (!doc.getElementsByTag("title").text().contains("Suche nach")) {
            // redirected to detail page
            MediaSearchResult msr = new MediaSearchResult(providerInfo.getId());
            Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id=");
            if (el.size() > 0) {
                msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)"));
            }
            msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim());
            el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            if (el.size() == 1) {
                msr.setYear(el.get(0).text());
            }
            resultList.add(msr);
        }
        return resultList;
    }

    // <a
    // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1"
    // class="normLight">Avatar - Aufbruch nach Pandora</B>
    // <nobr>(2009)</nobr><br /><span class="smallLight"
    // style="color:#ccc;">Avatar</span></a>

    // map to merge 2 results :/
    Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>();

    for (Element a : filme) {
        try {
            String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-");
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            if (res.containsKey(id)) {
                LOGGER.debug("dupe found; merging with previous searchresult");
                sr = res.get(id);
            }

            if (StringUtils.isNotEmpty(imdb)) {
                sr.setIMDBId(imdb);
            }
            if (StringUtils.isEmpty(sr.getId())) {
                sr.setId(id);
            }
            if (StringUtils.isEmpty(sr.getTitle())) {
                // with a <nobr> year inside the link, only the link's own text is the title
                if (a.html().contains("nobr")) {
                    sr.setTitle(a.ownText());
                } else {
                    sr.setTitle(a.text());
                }
            }
            LOGGER.debug("found movie " + sr.getTitle());
            if (StringUtils.isEmpty(sr.getOriginalTitle())) {
                sr.setOriginalTitle(a.getElementsByTag("span").text());
            }
            if (StringUtils.isEmpty(sr.getYear())) {
                // any 4 digit
                sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*"));
            }
            sr.setMediaType(MediaType.MOVIE);
            sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id);
            // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(),
            // "images(.*?)\\&quot"));

            // an empty imdb id must never count as a match, otherwise two "unknown" ids look identical
            if (StringUtils.isNotEmpty(imdb) && imdb.equals(sr.getIMDBId())) {
                // perfect match
                sr.setScore(1);
            } else {
                // compare score based on names
                sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle()));
            }

            // populate extra args
            MetadataUtil.copySearchQueryToSearchResult(options, sr);
            res.put(id, sr);
        } catch (Exception e) {
            LOGGER.warn("error parsing movie result: " + e.getMessage());
        }
    }
    resultList.addAll(res.values());
    Collections.sort(resultList);
    Collections.reverse(resultList);
    return resultList;
}

From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java

/**
 * Turns one bookmark category element into topics.
 * <p>
 * Each direct {@code h3} child yields a category topic whose subject locator is the parent's subject locator
 * plus "/" and the url-encoded html of the heading; that topic is made a subclass of {@code parent}. The
 * topic of the last {@code h3} found (or {@code parent} itself when there is none) is then used for the
 * content of every direct {@code dl} child: links inside a {@code dt} are passed to {@code parseItem}, and a
 * {@code dt} containing a nested {@code dl} is parsed recursively as a category of its own.
 *
 * @param c
 *          the element whose children describe the category
 * @param parent
 *          the topic the category belongs to
 * @param t
 *          the topic map that receives the topics
 */
private void parseCategory(Element c, Topic parent, TopicMap t) throws TopicMapException, ParseException {
    Elements kids = c.children();

    // first pass: resolve the category topic from the heading(s)
    Topic categoryTopic = parent;
    for (Element heading : kids) {
        if (!heading.tagName().equals("h3")) {
            continue;
        }

        String locator = parent.getSubjectLocator().toString() + "/" + urlEncode(heading.html());
        String name = heading.ownText();

        categoryTopic = getOrCreateTopic(t, locator);
        categoryTopic.setSubjectLocator(new Locator(locator));
        categoryTopic.setBaseName(name + " (Bookmark)");
        categoryTopic.setDisplayName(LANG, name);
        makeSubclassOf(t, categoryTopic, parent);
    }

    // second pass: walk dl > dt and dispatch on what each dt contains
    for (Element list : kids) {
        if (list.tagName().equals("dl")) {
            for (Element entry : list.children()) {
                if (entry.tagName().equals("dt")) {
                    for (Element part : entry.children()) {
                        String tag = part.tagName();
                        if (tag.equals("a")) {
                            parseItem(part, categoryTopic, t);
                        } else if (tag.equals("dl")) {
                            // the dt (not the nested dl) is the category element: it holds both h3 and dl
                            parseCategory(entry, categoryTopic, t);
                        }
                    }
                }
            }
        }
    }
}

From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java

/**
 * Creates (or reuses) the topic for a single bookmark link and attaches it to its category.
 * <p>
 * The link's {@code href} is used as subject locator, its own text as base name (suffixed with
 * " (Bookmark)") and display name, and {@code parent} is added as type. Every non-empty attribute listed in
 * {@code itemProps} is stored as occurrence data as is; every non-empty attribute listed in
 * {@code itemTimeProps} is parsed as a number, multiplied by 1000 and stored as a formatted date.
 *
 * @param i
 *          the link element of the bookmark
 * @param parent
 *          the category topic added as type of the bookmark topic
 * @param t
 *          the topic map that receives the topic
 * @throws NumberFormatException
 *           if a time attribute is not a valid integer number
 */
private void parseItem(Element i, Topic parent, TopicMap t) throws TopicMapException, ParseException {
    String cLocator = i.attr("href");
    Topic iTopic = getOrCreateTopic(t, cLocator);
    iTopic.setSubjectLocator(new Locator(cLocator));

    String name = i.ownText();
    iTopic.setBaseName(name + " (Bookmark)");
    iTopic.setDisplayName(LANG, name);
    iTopic.addType(parent);

    // plain attributes are copied verbatim
    for (int j = 0; j < itemProps.length; j++) {
        Topic type = getOrCreateTopic(t, itemProps[j][0], itemProps[j][2]);
        String attr = i.attr(itemProps[j][1]);
        if (attr.length() > 0) {
            iTopic.setData(type, langTopic, attr);
        }
    }

    // time attributes are converted to a formatted date
    for (int j = 0; j < itemTimeProps.length; j++) {
        Topic type = getOrCreateTopic(t, itemTimeProps[j][0], itemTimeProps[j][2]);
        String attr = i.attr(itemTimeProps[j][1]);

        if (attr.length() > 0) {
            // parse as long: the value is scaled to milliseconds anyway, and int parsing would reject
            // every timestamp above Integer.MAX_VALUE
            long timeStamp = Long.parseLong(attr) * 1000L;
            iTopic.setData(type, langTopic, df.format(new Date(timeStamp)));
        }
    }
}

From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java

/**
 * Returns the text to output for a cell: the own text of the last direct child of the given element, or
 * the element's own text when it has no child elements.
 * 
 * @param element
 *            the cell element to read the text from
 * 
 * @return The text to be output for this Cell.
 */
public String getElementText(Element element) {
    // start with the cell itself; every direct child replaces the candidate, so the last one wins
    Element textSource = element;

    for (Element child : element.children()) {
        textSource = child;
    }

    return textSource.ownText();
}

From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java

/**
 * Reads the own text of the given element as a number using the default number format.
 * 
 * @param element
 *            the cell element to read the value from
 * 
 * @return the parsed value, or {@code null} when the element carries the {@code DATA_TEXT_CELL} attribute
 *         or its own text cannot be parsed as a number
 */
public Double getNumericValue(Element element) {
    // cells explicitly flagged as text are never treated as numbers
    if (element.hasAttr(DATA_TEXT_CELL)) {
        return null;
    }

    try {
        return NumberFormat.getInstance().parse(element.ownText()).doubleValue();
    } catch (ParseException notNumeric) {
        // not a number: report "no value" instead of failing
        return null;
    }
}