List of usage examples for the org.jsoup.nodes.Element method getElementsByClass
public Elements getElementsByClass(String className)
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
MediaMetadata getMovieMetadata(MediaScrapeOptions options) throws Exception { MediaMetadata md = new MediaMetadata(providerInfo.getId()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMediaMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMediaMetadata(); }//ww w. j a v a 2 s . c o m String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(providerInfo.getId(), imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<>(executor); // worker for imdb request (/reference) (everytime from www.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/reference"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); Future<Document> futureReference = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || 
ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureReference.get(); parseReferencePage(doc, options, md); /* * plot from /plotsummary */ // build the url doc = futurePlotsummary.get(); parsePlotsummaryPage(doc, options, md); // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { Element title = doc.getElementById("tn15title"); if (title != null) { Element element; // title Elements elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.setTitle(movieTitle); } } } // get the release info page Future<Document> futureReleaseinfo; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/releaseinfo"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futureReleaseinfo = compSvcImdb.submit(worker); doc = futureReleaseinfo.get(); // parse original title here!! parseReleaseinfoPageAKAs(doc, options, md); // did we get a release date? if (md.getReleaseDate() == null || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("localReleaseDate")) { parseReleaseinfoPage(doc, options, md); } // get data from tmdb? 
if (futureTmdb != null && (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo"))) { try { MediaMetadata tmdbMd = futureTmdb.get(); if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") && tmdbMd != null) { // tmdbid md.setId(MediaMetadata.TMDB, tmdbMd.getId(MediaMetadata.TMDB)); // title if (StringUtils.isNotBlank(tmdbMd.getTitle())) { md.setTitle(tmdbMd.getTitle()); } // original title if (StringUtils.isNotBlank(tmdbMd.getOriginalTitle())) { md.setOriginalTitle(tmdbMd.getOriginalTitle()); } // tagline if (StringUtils.isNotBlank(tmdbMd.getTagline())) { md.setTagline(tmdbMd.getTagline()); } // plot if (StringUtils.isNotBlank(tmdbMd.getPlot())) { md.setPlot(tmdbMd.getPlot()); } // collection info if (StringUtils.isNotBlank(tmdbMd.getCollectionName())) { md.setCollectionName(tmdbMd.getCollectionName()); md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); } } if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo") && tmdbMd != null) { md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); md.setCollectionName(tmdbMd.getCollectionName()); } md.setId(tmdbMd.getProviderId(), tmdbMd.getId(tmdbMd.getProviderId())); } catch (Exception ignored) { } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getOriginalTitle())) { md.setOriginalTitle(md.getTitle()); } // populate id md.setId(ImdbMetadataProvider.providerInfo.getId(), imdbId); return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
private MediaMetadata parseReleaseinfoPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { Date releaseDate = null;/* w w w. j a v a 2 s . c om*/ Pattern pattern = Pattern.compile("/calendar/\\?region=(.{2})"); // old way Element tableReleaseDates = doc.getElementById("release_dates"); if (tableReleaseDates != null) { Elements rows = tableReleaseDates.getElementsByTag("tr"); // first round: check the release date for the first one with the requested country for (Element row : rows) { // get the anchor Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release_date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException ignored) { } } } } } } } // new way; iterating over class name items if (releaseDate == null) { Elements rows = doc.getElementsByClass("release-date-item"); for (Element row : rows) { Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release-date-item__date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); 
break; } catch (ParseException ignored) { } } } } else { LOGGER.trace("country {} does not match ours {}", matcher.group(1), options.getCountry().getAlpha2()); } } } } // no matching local release date found; take the first one if (releaseDate == null) { Element column = doc.getElementsByClass("release_date").first(); if (column == null) { column = doc.getElementsByClass("release-date-item__date").first(); } if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException ignored) { } } } } if (releaseDate != null) { md.setReleaseDate(releaseDate); } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
private MediaMetadata parseReleaseinfoPageAKAs(Document doc, MediaScrapeOptions options, MediaMetadata md) { // <table id="akas" class="subpage_data spEven2Col"> // <tr class="even"> // <td>(original title)</td> // <td>Intouchables</td> // </tr> // need to search all tables for correct ID, since the UNIQUE id is used multiple times - thanks for nothing :p for (Element table : doc.getElementsByTag("table")) { if (table.id().equalsIgnoreCase("akas")) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { Element c1 = row.getElementsByTag("td").get(0); Element c2 = row.getElementsByTag("td").get(1); if (c1 != null && c1.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(c2.text()); break; }//from w w w . ja va 2 s .c o m } } } // alternative; new way with table classes // <tr class="ipl-zebra-list__item aka-item"> // <td class="aka-item__name">Germany</td> // <td class="aka-item__title">Avatar - Aufbruch nach Pandora</td> // </tr> if (md.getOriginalTitle().isEmpty()) { Elements rows = doc.getElementsByClass("aka-item"); for (Element row : rows) { Element country = row.getElementsByClass("aka-item__name").first(); Element title = row.getElementsByClass("aka-item__title").first(); if (country != null && country.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(title.text()); break; } } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
/** * do the search according to the type//from w ww. j av a 2 s . c o m * * @param query * the search params * @return the found results */ protected List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { List<MediaSearchResult> result = new ArrayList<>(); /* * IMDb matches seem to come in several "flavours". * * Firstly, if there is one exact match it returns the matching IMDb page. * * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results) * * We should check the Exact match section first, then the poplar titles and finally the partial matches. * * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek" */ String searchTerm = ""; if (StringUtils.isNotEmpty(query.getImdbId())) { searchTerm = query.getImdbId(); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.getQuery(); } if (StringUtils.isEmpty(searchTerm)) { return result; } // parse out language and coutry from the scraper query String language = query.getLanguage().getLanguage(); int myear = query.getYear(); String country = query.getCountry().getAlpha2(); // for passing the country to the scrape searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); StringBuilder sb = new StringBuilder(getImdbSite().getSite()); sb.append("find?q="); try { // search site was everytime in UTF-8 sb.append(URLEncoder.encode(searchTerm, "UTF-8")); } catch (UnsupportedEncodingException ex) { // Failed to encode the movie name for some reason! 
getLogger().debug("Failed to encode search term: " + searchTerm); sb.append(searchTerm); } // we need to search for all - otherwise we do not find TV movies sb.append(getSearchCategory()); getLogger().debug("========= BEGIN IMDB Scraper Search for: " + sb.toString()); Document doc; try { Url url = new Url(sb.toString()); url.addHeader("Accept-Language", getAcceptLanguage(language, country)); doc = Jsoup.parse(url.getInputStream(), "UTF-8", ""); } catch (Exception e) { getLogger().debug("tried to fetch search response", e); return result; } // check if it was directly redirected to the site Elements elements = doc.getElementsByAttributeValue("rel", "canonical"); for (Element element : elements) { MediaMetadata md = null; // we have been redirected to the movie site String movieName = null; String movieId = null; String href = element.attr("href"); Matcher matcher = IMDB_ID_PATTERN.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // get full information if (!StringUtils.isEmpty(movieId)) { MediaScrapeOptions options = new MediaScrapeOptions(type); options.setImdbId(movieId); options.setLanguage(query.getLanguage()); options.setCountry(CountryCode.valueOf(country)); md = getMetadata(options); if (!StringUtils.isEmpty(md.getTitle())) { movieName = md.getTitle(); } } // if a movie name/id was found - return it if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) { MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(), query.getMediaType()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(md.getYear()); sr.setMetadata(md); sr.setScore(1); // and parse out the poster String posterUrl = ""; Elements posters = doc.getElementsByClass("poster"); if (posters != null && !posters.isEmpty()) { Elements imgs = posters.get(0).getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_"); 
posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } if (StringUtils.isNotBlank(posterUrl)) { sr.setPosterUrl(posterUrl); } result.add(sr); return result; } } // parse results // elements = doc.getElementsByClass("result_text"); elements = doc.getElementsByClass("findResult"); for (Element tr : elements) { // we only want the tr's if (!"tr".equalsIgnoreCase(tr.tagName())) { continue; } // find the id / name String movieName = ""; String movieId = ""; int year = 0; Elements tds = tr.getElementsByClass("result_text"); for (Element element : tds) { // we only want the td's if (!"td".equalsIgnoreCase(element.tagName())) { continue; } // filter out unwanted results Pattern unwantedSearchResultPattern = getUnwantedSearchResultPattern(); if (unwantedSearchResultPattern != null) { Matcher matcher = unwantedSearchResultPattern.matcher(element.text()); if (matcher.find()) { continue; } } // is there a localized name? 
(aka) String localizedName = ""; Elements italics = element.getElementsByTag("i"); if (italics.size() > 0) { localizedName = italics.text().replace("\"", ""); } // get the name inside the link Elements anchors = element.getElementsByTag("a"); for (Element a : anchors) { if (StringUtils.isNotEmpty(a.text())) { // movie name if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) { // take AKA as title, but only if not EN movieName = localizedName; } else { movieName = a.text(); } // parse id String href = a.attr("href"); Matcher matcher = IMDB_ID_PATTERN.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // try to parse out the year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); matcher = yearPattern.matcher(element.text()); while (matcher.find()) { if (matcher.group(1) != null) { try { year = Integer.parseInt(matcher.group(1)); break; } catch (Exception ignored) { } } } break; } } } // if an id/name was found - parse the poster image String posterUrl = ""; tds = tr.getElementsByClass("primary_photo"); for (Element element : tds) { Elements imgs = element.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_"); posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } // if no movie name/id was found - continue if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) { continue; } MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(), query.getMediaType()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(year); sr.setPosterUrl(posterUrl); if (movieId.equals(query.getImdbId())) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, movieName); if (posterUrl.isEmpty() || 
posterUrl.contains("nopicture")) { getLogger().debug("no poster - downgrading score by 0.01"); score = score - 0.01f; } if (yearDiffers(myear, year)) { float diff = (float) Math.abs(year - myear) / 100; getLogger() .debug("parsed year does not match search result year - downgrading score by " + diff); score -= diff; } sr.setScore(score); } result.add(sr); // only get 40 results if (result.size() >= 40) { break; } } Collections.sort(result); Collections.reverse(result); return result; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) { /*//from w ww .j a v a 2 s . c o m * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // title Element title = doc.getElementsByAttributeValue("name", "title").first(); if (title != null) { String movieTitle = cleanString(title.attr("content")); int yearStart = movieTitle.lastIndexOf("("); if (yearStart > 0) { movieTitle = movieTitle.substring(0, yearStart - 1).trim(); md.setTitle(movieTitle); } } // original title and year Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first(); if (originalTitleYear != null) { String content = originalTitleYear.attr("content"); int startOfYear = content.lastIndexOf("("); if (startOfYear > 0) { // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page... 
// String originalTitle = content.substring(0, startOfYear - 1).trim(); // md.setOriginalTitle(originalTitle); String yearText = content.substring(startOfYear); // search year Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}"); Matcher matcher = yearPattern.matcher(yearText); while (matcher.find()) { if (matcher.group(0) != null) { String movieYear = matcher.group(0); try { md.setYear(Integer.parseInt(movieYear)); break; } catch (Exception ignored) { } } } } } // poster Element poster = doc.getElementsByAttributeValue("property", "og:image").first(); if (poster != null) { String posterUrl = poster.attr("content"); int fileStart = posterUrl.lastIndexOf("/"); if (fileStart > 0) { int parameterStart = posterUrl.indexOf("_", fileStart); if (parameterStart > 0) { int startOfExtension = posterUrl.lastIndexOf("."); if (startOfExtension > parameterStart) { posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension); } } } processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating count Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first(); if (ratingElement != null) { String ratingAsString = ratingElement.ownText().replace(",", "."); try { md.setRating(Float.valueOf(ratingAsString)); } catch (Exception ignored) { } Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first(); if (votesElement != null) { String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim(); try { md.setVoteCount(Integer.parseInt(countAsString)); } catch (Exception ignored) { } } } // top250 Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first(); if (topRatedElement != null) { Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(topRatedElement.ownText()); 
while (matcher.find()) { if (matcher.group(1) != null) { try { String top250Text = matcher.group(1); md.setTop250(Integer.parseInt(top250Text)); } catch (Exception ignored) { } } } } // releasedate Element releaseDateElement = doc .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo") .first(); if (releaseDateElement != null) { String releaseDateText = releaseDateElement.ownText(); int startOfCountry = releaseDateText.indexOf("("); if (startOfCountry > 0) { releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim(); } try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException ignored) { } } } Elements elements = doc.getElementsByClass("ipl-zebra-list__label"); for (Element element : elements) { // only parse tds if (!"td".equals(element.tag().getName())) { continue; } String elementText = element.ownText(); if (elementText.equals("Taglines")) { if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) { Element taglineElement = element.nextElementSibling(); if (taglineElement != null) { String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.setTagline(tagline); } } } if (elementText.equals("Genres")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/"); for (Element genreElement : genreElements) { String genreText = genreElement.ownText(); md.addGenre(getTmmGenre(genreText)); } } } /* * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) * | 178 min 
(extended cut)</div></div> */ if (elementText.equals("Runtime")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first(); if (runtimeElement != null) { String first = runtimeElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.setRuntime(runtime); } } } if (elementText.equals("Country")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/"); Pattern pattern = Pattern.compile("/country/(.*)"); for (Element countryElement : countryElements) { Matcher matcher = pattern.matcher(countryElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { md.addCountry(LanguageUtils.getLocalizedCountryForLanguage( options.getLanguage().getLanguage(), countryElement.text(), matcher.group(1))); } else { md.addCountry(matcher.group(1)); } } } } } if (elementText.equals("Language")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements languageElements = nextElement.getElementsByAttributeValueStarting("href", "/language/"); Pattern pattern = Pattern.compile("/language/(.*)"); for (Element languageElement : languageElements) { Matcher matcher = pattern.matcher(languageElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { 
md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString( options.getLanguage(), languageElement.text(), matcher.group(1))); } else { md.addSpokenLanguage(matcher.group(1)); } } } } } if (elementText.equals("Certification")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { String languageCode = options.getCountry().getAlpha2(); Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=" + languageCode); boolean done = false; for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); done = true; break; } } if (!done && languageCode.equals("DE")) { certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=XWG"); for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); break; } } } } } } // director Element directorsElement = doc.getElementById("directors"); while (directorsElement != null && directorsElement.tag().getName() != "header") { directorsElement = directorsElement.parent(); } if (directorsElement != null) { directorsElement = directorsElement.nextElementSibling(); } if (directorsElement != null) { for (Element directorElement : directorsElement.getElementsByClass("name")) { String 
director = directorElement.text().trim(); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR); cm.setName(director); md.addCastMember(cm); } } // actors Element castTableElement = doc.getElementsByClass("cast_list").first(); if (castTableElement != null) { Elements tr = castTableElement.getElementsByTag("tr"); for (Element row : tr) { MediaCastMember cm = parseCastMember(row); if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(cm); } } } // writers Element writersElement = doc.getElementById("writers"); while (writersElement != null && writersElement.tag().getName() != "header") { writersElement = writersElement.parent(); } if (writersElement != null) { writersElement = writersElement.nextElementSibling(); } if (writersElement != null) { Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element writerElement : writersElements) { String writer = cleanString(writerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER); cm.setName(writer); md.addCastMember(cm); } } // producers Element producersElement = doc.getElementById("producers"); while (producersElement != null && producersElement.tag().getName() != "header") { producersElement = producersElement.parent(); } if (producersElement != null) { producersElement = producersElement.nextElementSibling(); } if (producersElement != null) { Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element producerElement : producersElements) { String producer = cleanString(producerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER); cm.setName(producer); md.addCastMember(cm); } } // producers Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title"); Element prodCompHeaderElement = null; for 
(Element possibleProdCompHeaderEl : prodCompHeaderElements) { if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) { prodCompHeaderElement = possibleProdCompHeaderEl; break; } } while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") { prodCompHeaderElement = prodCompHeaderElement.parent(); } if (prodCompHeaderElement != null) { prodCompHeaderElement = prodCompHeaderElement.nextElementSibling(); } if (prodCompHeaderElement != null) { Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href", "/company/"); for (Element prodCompElement : prodCompElements) { String prodComp = prodCompElement.ownText(); md.addProductionCompany(prodComp); } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parsePlotsummaryPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { // imdb.com has another site structure if (getImdbSite() == ImdbSiteDefinition.IMDB_COM) { // first check synopsis content // Element zebraList = doc.getElementById("plot-synopsis-content"); // if (zebraList != null) { // Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); // if (!p.isEmpty()) { // Element em = p.get(0); // if (!"no-synopsis-content".equals(em.id())) { // String plot = cleanString(em.text()); // md.setPlot(plot); // }/*from ww w.j a va2 s. c o m*/ // } // } // NOPE: synopsis contains spoilers // just take first summary // <li class="ipl-zebra-list__item" id="summary-ps21700000"> // <p>text text text text </p> // <div class="author-container"> // <em>—<a href="/search/title?plot_author=author">Author Name</a></em> // </div> // </li> Element zebraList = doc.getElementById("plot-summaries-content"); if (zebraList != null) { Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); if (!p.isEmpty()) { Element em = p.get(0); // remove author Elements authors = em.getElementsByClass("author-container"); if (!authors.isEmpty()) { authors.get(0).remove(); } if (!"no-summary-content".equals(em.id())) { String plot = cleanString(em.text()); md.setPlot(plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); if (wiki != null) { String plot = cleanString(wiki.ownText()); md.setPlot(plot); } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaCastMember parseCastMember(Element row) { Element nameElement = row.getElementsByAttributeValueStarting("itemprop", "name").first(); if (nameElement == null) { return null; }/*from ww w . j a v a2s . c om*/ String name = cleanString(nameElement.ownText()); String characterName = ""; Element characterElement = row.getElementsByClass("character").first(); if (characterElement != null) { characterName = cleanString(characterElement.text()); // and now strip off trailing commentaries like - (120 episodes, // 2006-2014) characterName = characterName.replaceAll("\\(.*?\\)$", "").trim(); } String image = ""; Element imageElement = row.getElementsByTag("img").first(); if (imageElement != null) { String imageSrc = imageElement.attr("loadlate"); if (!StringUtils.isEmpty(imageSrc)) { int fileStart = imageSrc.lastIndexOf("/"); if (fileStart > 0) { // parse out the rescale/crop params int parameterStart = imageSrc.indexOf("._", fileStart); if (parameterStart > 0) { int startOfExtension = imageSrc.lastIndexOf("."); if (startOfExtension > parameterStart) { // rebuild the path - scaled to 632 px height as in tmdb scraper imageSrc = imageSrc.substring(0, parameterStart) + "._UY632" + imageSrc.substring(startOfExtension); } } } image = imageSrc; } } MediaCastMember cm = new MediaCastMember(); cm.setCharacter(characterName); cm.setName(name); cm.setImageUrl(image); return cm; }
From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java
/**
 * Parses the episode rows of an episode list page and appends one entry per episode to
 * {@code episodes}.
 * <p>
 * Rows are the elements with class {@code list_item} inside elements with class {@code eplist}.
 * For {@code season == 0} a row counts as an episode if its text contains "Unknown", and the
 * episodes are numbered with a running counter. Otherwise season and episode number are read from
 * the "S.., Ep.." marker in the row text.
 * <p>
 * A row which cannot be parsed is logged and skipped; the remaining rows are still processed.
 *
 * @param season
 *          the requested season number
 * @param episodes
 *          the list which receives the parsed episodes
 * @param doc
 *          the parsed episode list page
 * @return {@code false} as soon as a row belongs to another season than the requested one (only
 *         checked for {@code season > 0}), {@code true} otherwise
 */
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
  Pattern unknownPattern = Pattern.compile("Unknown");
  Pattern seasonEpisodePattern = Pattern.compile("S([0-9]*), Ep([0-9]*)");
  int episodeCounter = 0;

  // parse episodes
  Elements tables = doc.getElementsByClass("eplist");
  for (Element table : tables) {
    Elements rows = table.getElementsByClass("list_item");
    for (Element row : rows) {
      Matcher matcher = season == 0 ? unknownPattern.matcher(row.text()) : seasonEpisodePattern.matcher(row.text());
      if (!matcher.find() || (season != 0 && matcher.groupCount() < 2)) {
        // not a row containing episode data
        continue;
      }

      try {
        MediaEpisode ep = new MediaEpisode(providerInfo.getId());

        // parse season and ep number
        if (season == 0) {
          ep.season = season;
          ep.episode = ++episodeCounter;
        }
        else {
          ep.season = Integer.parseInt(matcher.group(1));
          ep.episode = Integer.parseInt(matcher.group(2));
        }

        // check if we have still valid data
        if (season > 0 && season != ep.season) {
          return false;
        }

        // get ep title and id
        Elements anchors = row.getElementsByAttributeValueStarting("href", "/title/tt");
        if (anchors.isEmpty()) {
          // skip the row explicitly instead of running into an IndexOutOfBoundsException below
          LOGGER.warn("failed parsing: " + row.text() + " for ep data; no title link found");
          continue;
        }

        for (Element anchor : anchors) {
          if ("name".equals(anchor.attr("itemprop"))) {
            ep.title = anchor.text();
            break;
          }
        }

        // the id is taken from the first title link; the last non-null match wins
        String id = "";
        Matcher idMatcher = IMDB_ID_PATTERN.matcher(anchors.get(0).attr("href"));
        while (idMatcher.find()) {
          if (idMatcher.group(1) != null) {
            id = idMatcher.group(1);
          }
        }

        if (StringUtils.isNotBlank(id)) {
          ep.ids.put(providerInfo.getId(), id);
        }

        // plot
        Element plot = row.getElementsByClass("item_description").first();
        if (plot != null) {
          ep.plot = plot.ownText();
        }

        // rating and rating count
        Element ratingElement = row.getElementsByClass("ipl-rating-star__rating").first();
        if (ratingElement != null) {
          String ratingAsString = ratingElement.ownText().replace(",", ".");
          try {
            ep.rating = Float.valueOf(ratingAsString);
          }
          catch (NumberFormatException ignored) {
            // best effort: an unparseable rating is simply left unset
          }

          Element votesElement = row.getElementsByClass("ipl-rating-star__total-votes").first();
          if (votesElement != null) {
            String countAsString = votesElement.ownText().replaceAll("[.,()]", "").trim();
            try {
              ep.voteCount = Integer.parseInt(countAsString);
            }
            catch (NumberFormatException ignored) {
              // best effort: an unparseable vote count is simply left unset
            }
          }
        }

        // release date
        Element releaseDate = row.getElementsByClass("airdate").first();
        if (releaseDate != null) {
          ep.firstAired = releaseDate.ownText();
        }

        // poster; the scale/crop parameters are stripped from the thumbnail url
        Element image = row.getElementsByTag("img").first();
        if (image != null) {
          String posterUrl = image.attr("src");
          posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "");
          posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "");
          posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");

          if (StringUtils.isNotBlank(posterUrl)) {
            MediaArtwork ma = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(), MediaArtwork.MediaArtworkType.THUMB);
            ma.setPreviewUrl(posterUrl);
            ma.setDefaultUrl(posterUrl);
            ep.artwork.add(ma);
          }
        }

        episodes.add(ep);
      }
      catch (Exception e) {
        // pass the exception along so that the cause and stack trace are not lost
        LOGGER.warn("failed parsing: " + row.text() + " for ep data; " + e.getMessage(), e);
      }
    }
  }

  return true;
}
From source file:org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java
@NotNull private Optional<MonolingualEntry> processTableRow(@NotNull Element tableRow, @NotNull Language language) { MonolingualEntryBuilder entryBuilder = ImmutableMonolingualEntry.builder(); DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder().setLanguage(language); // Extract general form Element oppslagsord = tableRow.getElementsByClass("oppslagsord").first(); if (oppslagsord != null) { extractGeneralForm(objectBuilder, oppslagsord); } else {//from www. j av a 2 s. com LOGGER.warn("Unable to find main element - skipping entry."); return Optional.empty(); } // Extract wordclass and determine entrytype String wordClass = tableRow.getElementsByClass("oppsgramordklasse").first().text(); entryBuilder.setEntryType(resolveEntryTypeWithWordClass(wordClass)); // Get meanings Elements meaningCandidates = tableRow.select(".artikkelinnhold > .utvidet > .tyding"); if (meaningCandidates.size() == 0) meaningCandidates = tableRow.select(".artikkelinnhold > .utvidet"); meaningCandidates.forEach(e -> { String meaning = e.childNodes().stream() .filter(node -> (node instanceof TextNode) || (!((Element) node).hasClass("doemeliste") && !node.hasAttr("style") && !((Element) node).hasClass("utvidet") && !((Element) node).hasClass("artikkelinnhold") && !((Element) node).hasClass("kompakt"))) .map((Node n) -> { if (n instanceof Element) return ((Element) n).text(); else return n.toString(); }).collect(Collectors.joining()); meaning = StringEscapeUtils.unescapeHtml4(meaning); meaning = StringUtils.strip(meaning); if (StringUtils.isNotBlank(meaning)) objectBuilder.addMeaning(meaning); }); entryBuilder.setContent(objectBuilder.build()); return Optional.of(entryBuilder.build()); }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void extractBilingualSynonyms(@NotNull String queryString, @NotNull Element synonymsTable, @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage) { List<Element> synonymNodes = synonymsTable.select("tr").stream() .filter(e -> e.getElementsByTag("th").size() == 0).collect(Collectors.toList()); if (synonymNodes.size() == 0) { LOGGER.debug("No synonym entries found"); return;/*from w w w . j a va 2 s . c o m*/ } String synonymEntryTitle = synonymsTable.select("span.hl").first().text(); Map<String, SynonymGroupBuilder> synonymGroupMap = new HashMap<>(); for (Element synonymNode : synonymNodes) { // Extract only information from the "from"-node (i.e. source language) DictionaryObject newSynonym = processSingleNode( synonymNode.getElementsByClass(CLASS_TRANSLATION).get(0), sourceLanguage, queryString); String groupName = newSynonym.getDescription() != null ? newSynonym.getDescription() : queryString; if (groupName != null) { SynonymGroupBuilder groupBuilder = synonymGroupMap.computeIfAbsent(groupName, (s) -> ImmutableSynonymGroup.builder() .setBaseMeaning(ImmutableDictionaryObject.createSimpleObject(sourceLanguage, s))); groupBuilder.addSynonym(newSynonym); } else { LOGGER.warn("Synonym group is null"); } } SynonymEntryBuilder synonymEntryBuilder = ImmutableSynonymEntry.builder() .setBaseObject(ImmutableDictionaryObject.createSimpleObject(sourceLanguage, synonymEntryTitle)); for (SynonymGroupBuilder synonymGroupBuilder : synonymGroupMap.values()) { synonymEntryBuilder.addSynonymGroup(synonymGroupBuilder.build()); } resultBuilder.addSynonymEntry(synonymEntryBuilder.build()); }