List of usage examples for org.jsoup.select Elements first
public Element first()
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); }//from w ww . j av a 2 s . com // we have 3 entry points here // a) getMetadata has been called with an ofdbId // b) getMetadata has been called with an imdbId // c) getMetadata has been called from a previous search String detailUrl = ""; // case a) and c) if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) { if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) { detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId()); } else { detailUrl = options.getResult().getUrl(); } } // case b) if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) { MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE); searchOptions.setImdbId(options.getId(MediaMetadata.IMDB)); try { List<MediaSearchResult> results = search(searchOptions); if (results != null && !results.isEmpty()) { options.setResult(results.get(0)); detailUrl = options.getResult().getUrl(); } } catch (Exception e) { LOGGER.warn("failed IMDB search: " + e.getMessage()); } } // we can only work further if we got a search result on ofdb.de if (StringUtils.isBlank(detailUrl)) { throw new Exception("We did not get any useful movie url"); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),"); if (StringUtils.isBlank(ofdbId)) { ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)"); } Url url; try { LOGGER.trace("get details page"); url = new Url(detailUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); if (doc.getAllElements().size() < 10) { throw 
new Exception("meh - we did not receive a valid web page"); } // parse details // IMDB ID "http://www.imdb.com/Title?1194173" el = doc.getElementsByAttributeValueContaining("href", "imdb.com"); if (!el.isEmpty()) { md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)")); } // title / year // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" /> el = doc.getElementsByAttributeValue("property", "og:title"); if (!el.isEmpty()) { String[] ty = parseTitle(el.first().attr("content")); md.setTitle(StrgUtils.removeCommonSortableName(ty[0])); try { md.setYear(Integer.parseInt(ty[1])); } catch (Exception ignored) { } } // another year position if (md.getYear() == 0) { // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a> el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr"); try { md.setYear(Integer.parseInt(el.first().text())); } catch (Exception ignored) { } } // original title (has to be searched with a regexp) // <tr valign="top"> // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif" // size="2">Originaltitel:</font></td> // <td> </td> // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif" // size="2"><b>Brave</b></font></td> // </tr> String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>"); if (!originalTitle.isEmpty()) { md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle)); } // Genre: <a href="view.php?page=genre&Genre=Action">Action</a> el = doc.getElementsByAttributeValueContaining("href", "page=genre"); for (Element g : el) { md.addGenre(getTmmGenre(g.text())); } // rating // <div itemtype="http://schema.org/AggregateRating" itemscope // itemprop="aggregateRating">Note: <span // itemprop="ratingValue">6.73</span><meta // itemprop="worstRating" content="1" /> el = doc.getElementsByAttributeValue("itemprop", "ratingValue"); if (!el.isEmpty()) { String r = el.text(); if (!r.isEmpty()) { try { 
md.setRating(Float.parseFloat(r)); } catch (Exception e) { LOGGER.debug("could not parse rating"); } } } // get PlotLink; open url and parse // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a> LOGGER.trace("parse plot"); el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,"); if (!el.isEmpty()) { String plotUrl = BASE_URL + "/" + el.first().attr("href"); try { url = new Url(plotUrl); in = url.getInputStream(); Document plot = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements block = plot.getElementsByClass("Blocksatz"); // first // Blocksatz // is plot String p = block.first().text(); // remove all html stuff p = p.substring(p.indexOf("Mal gelesen") + 12); // remove "header" md.setPlot(p); } catch (Exception e) { LOGGER.error("failed to get plot page: " + e.getMessage()); } } // http://www.ofdb.de/view.php?page=film_detail&fid=226745 LOGGER.debug("parse actor detail"); String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId; doc = null; try { url = new Url(movieDetail); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get detail page: " + e.getMessage()); } if (doc != null) { parseCast(doc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md); parseCast(doc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Synchronstimme (deutsch)"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER, md); parseCast(doc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md); } } catch (Exception e) { LOGGER.error("Error parsing " + detailUrl); throw e; } return md; }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public List<MediaTrailer> getTrailers(MediaScrapeOptions options) throws Exception { LOGGER.debug("getTrailers() " + options.toString()); List<MediaTrailer> trailers = new ArrayList<>(); if (!MetadataUtil.isValidImdbId(options.getImdbId())) { LOGGER.debug("IMDB id not found"); return trailers; }/* w w w.ja v a2s . c o m*/ /* * function getTrailerData(ci) { switch (ci) { case 'http://de.clip-1.filmtrailer.com/9507_31566_a_1.flv?log_var=72|491100001 -1|-' : return * '<b>Trailer 1</b><br><i>(small)</i><br><br>» 160px<br><br>Download:<br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_1.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_2.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(medium)</i><br><br>» * 240px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_2.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_3.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(large)</i><br><br>» * 320px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_3.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_3.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_3.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_4.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 1</b><br><i>(xlarge)</i><br><br>» * 400px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_4.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_4.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_4.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_31566_a_5.flv?log_var=72|491100001 -1|-' : return 
'<b>Trailer 1</b><br><i>(xxlarge)</i><br><br>» * 640px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_31566_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_31566_a_5.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_1.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(small)</i><br><br>» * 160px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_1.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_2.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(medium)</i><br><br>» * 240px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_2.wmv?log_var=72|491100001-1|-" >wmv</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_3.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(large)</i><br><br>» * 320px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_3.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_3.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_3.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_4.flv?log_var=72|491100001 -1|-' : return '<b>Trailer 2</b><br><i>(xlarge)</i><br><br>» * 400px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_4.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_4.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_4.webm?log_var=72|491100001-1|-" >webm</a><br>'; case * 'http://de.clip-1.filmtrailer.com/9507_39003_a_5.flv?log_var=72|491100001 -1|-' : 
return '<b>Trailer 2</b><br><i>(xxlarge)</i><br><br>» * 640px<br><br>Download:<br>» <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» * <a href= "http://de.clip-1.filmtrailer.com/9507_39003_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= * "http://de.clip-1.filmtrailer.com/9507_39003_a_5.webm?log_var=72|491100001-1|-" >webm</a><br>'; } } */ Url url = null; String searchString = BASE_URL + "/view.php?page=suchergebnis&Kat=IMDb&SText=" + options.getImdbId(); try { // search with IMDB url = new Url(searchString); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements filme = doc.getElementsByAttributeValueMatching("href", "film\\/\\d+,"); if (filme == null || filme.isEmpty()) { LOGGER.debug("found no search results"); return trailers; } LOGGER.debug("found " + filme.size() + " search results"); // hopefully // only one LOGGER.debug("get (trailer) details page"); url = new Url(BASE_URL + "/" + StrgUtils.substr(filme.first().toString(), "href=\\\"(.*?)\\\"")); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); // OLD STYLE // <b>Trailer 1</b><br><i>(xxlarge)</i><br><br>» 640px<br><br>Download:<br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>» <a href= // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.webm?log_var=72|491100001-1|-" >webm</a><br> Pattern regex = Pattern.compile("return '(.*?)';"); Matcher m = regex.matcher(doc.toString()); while (m.find()) { String s = m.group(1); String tname = StrgUtils.substr(s, "<b>(.*?)</b>"); String tpix = StrgUtils.substr(s, "raquo; (.*?)x<br>"); // String tqual = StrgUtils.substr(s, "<i>\\((.*?)\\)</i>"); // url + format Pattern lr = Pattern.compile("<a href=\"(.*?)\">(.*?)</a>"); Matcher lm = lr.matcher(s); while (lm.find()) { String 
turl = lm.group(1); // String tformat = lm.group(2); MediaTrailer trailer = new MediaTrailer(); trailer.setName(tname); // trailer.setQuality(tpix + " (" + tformat + ")"); trailer.setQuality(tpix); trailer.setProvider("filmtrailer"); trailer.setUrl(turl); LOGGER.debug(trailer.toString()); trailers.add(trailer); } } // NEW STYLE (additional!) // <div class="clips" id="clips2" style="display: none;"> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 1:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 2:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_8244_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12"> // <img src="images/trailer_6.gif" align="top" 
vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren"> // <i>Trailer 3:</i> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_1.flv?log_var=67|491100001-1|-"> small </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_2.flv?log_var=67|491100001-1|-"> medium </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_3.flv?log_var=67|491100001-1|-"> large </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_4.flv?log_var=67|491100001-1|-"> xlarge </a> // <a href="http://de.clip-1.filmtrailer.com/2845_14749_a_5.flv?log_var=67|491100001-1|-"> xxlarge </a> // <br> // <br> // </div> // new style size // 1 = 160 x 90 = small // 2 = 240 x 136 = medium // 3 = 320 x 180 = large // 4 = 400 x 226 = xlarge // 5 = 640 x 360 = xxlarge ; regex = Pattern.compile("<i>(.*?)</i>(.*?)<br>", Pattern.DOTALL); // get them as single trailer line m = regex.matcher(doc.getElementsByClass("clips").html()); while (m.find()) { // LOGGER.info(doc.getElementsByClass("clips").html()); // parse each line with 5 qualities String tname = m.group(1).trim(); tname = tname.replaceFirst(":$", ""); // replace ending colon String urls = m.group(2); // url + format Pattern lr = Pattern.compile("<a href=\"(.*?)\">(.*?)</a>"); Matcher lm = lr.matcher(urls); while (lm.find()) { String turl = lm.group(1); String tpix = ""; String tformat = lm.group(2).replaceAll(" ", "").trim(); switch (tformat) { case "small": tpix = "90p"; break; case "medium": tpix = "136p"; break; case "large": tpix = "180p"; break; case "xlarge": tpix = "226p"; break; case "xxlarge": tpix = "360p"; break; default: break; } MediaTrailer trailer = new MediaTrailer(); trailer.setName(tname); // trailer.setQuality(tpix + " (" + tformat + ")"); trailer.setQuality(tpix); trailer.setProvider("filmtrailer"); trailer.setUrl(turl); LOGGER.debug(trailer.toString()); trailers.add(trailer); } } } catch (Exception e) { if (url != null) { LOGGER.error("Error parsing {}", url.toString()); } else { 
LOGGER.error("Error parsing {}", searchString); } throw e; } return trailers; }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
/**
 * Determines the entry type of an element from the text of its first word-type node.
 * <p>
 * The text of the first descendant carrying the {@code CLASS_WORDTYPE} class is looked up in
 * {@code ENTRY_TYPE_MAP}.
 *
 * @param element
 *         the element to inspect
 * @return the mapped entry type, or {@link EntryType#UNKNOWN} if there is no word-type node or its text has no
 *         mapping
 */
private EntryType detectEntryType(@NotNull Element element) {
    Elements wordTypeNodes = element.getElementsByClass(CLASS_WORDTYPE);

    if (wordTypeNodes.isEmpty()) {
        LOGGER.debug("No wordType node found - defaulting to {}", EntryType.UNKNOWN);
        return EntryType.UNKNOWN;
    }

    String wordTypeText = wordTypeNodes.first().text();
    EntryType entryType = ENTRY_TYPE_MAP.getOrDefault(wordTypeText, EntryType.UNKNOWN);

    if (entryType == EntryType.UNKNOWN) {
        // log the text that could not be resolved, not the (always UNKNOWN) lookup result
        LOGGER.debug("Unable to resolve entry type \"{}\"", wordTypeText);
    }
    return entryType;
}
From source file:Search.DataManipulation.DataParser.java
public String getIcon(Document dom) throws IOException { Elements iconClass = dom.getElementsByClass("cover-container"); Elements iconClass1 = iconClass.select("img.cover-image[alt=Cover art]"); String iconUrl = iconClass1.first().attr("src"); byte[] iconByte = dataHandler.imageDownloader(iconUrl); if (iconByte.length == 0) { log.warn("Invalid Icon url found by Search.DataManipulation.DataValidator, not adding to appData"); return null; } else {//from w ww . j av a 2 s .c o m String icon = Base64.getEncoder().encodeToString(iconByte); return icon; } }
From source file:Search.DataManipulation.DataParser.java
/**
 * Reads the name from the document title.
 *
 * @param dom
 *            the parsed page
 * @return the own text of the first child of the first element with class {@code document-title}
 */
public String getName(Document dom) {
    Element titleElement = dom.getElementsByClass("document-title").first();
    Element nameElement = titleElement.child(0);
    return nameElement.ownText();
}
From source file:Search.DataManipulation.DataParser.java
/**
 * Reads the bundle id from the buy button container.
 *
 * @param dom
 *            the parsed page
 * @return the {@code data-docid} attribute of the first element with class {@code buy-button-container}
 */
public String getBundleId(Document dom) {
    Element buyButtonContainer = dom.getElementsByClass("buy-button-container").first();
    return buyButtonContainer.attr("data-docid");
}
From source file:Search.DataManipulation.DataParser.java
/**
 * Reads the description text.
 *
 * @param dom
 *            the parsed page
 * @return the own text of the first element with class {@code id-app-orig-desc}
 */
public String getDescription(Document dom) {
    Element description = dom.getElementsByClass("id-app-orig-desc").first();
    return description.ownText();
}
From source file:Search.DataManipulation.DataParser.java
public String getPrice(Document dom) { Elements priceClass = dom.select("button.price"); Element priceClass1 = priceClass.first(); Elements priceClass2 = priceClass1.getElementsByTag("span"); String price = priceClass2.last().ownText(); if (price.equalsIgnoreCase("install")) { price = "Free"; } else {// w w w . ja va2 s .c o m String[] split = StringUtils.split(price); price = split[0]; } return price; }
From source file:Search.DataManipulation.DataParser.java
/**
 * Reads the category (genre).
 *
 * @param dom
 *            the parsed page
 * @return the own text of the first element matching {@code a.document-subtitle.category span[itemprop=genre]}
 */
public String getCategory(Document dom) {
    Element genre = dom.select("a.document-subtitle.category span[itemprop=genre]").first();
    return genre.ownText();
}
From source file:Search.DataManipulation.DataParser.java
public String getThumbnails(Document dom) throws IOException { Elements thumbnailsClass = dom.getElementsByClass("thumbnails"); Elements thumbnails = thumbnailsClass.first().children(); List<String> imageArray = new ArrayList<String>(); for (Element images : thumbnails) { String imageTagUrl = images.getElementsByTag("img").first().attr("src"); byte[] imageByte = dataHandler.imageDownloader(imageTagUrl); if (imageByte.length == 0) { continue; }//from w w w.ja va 2s . c o m String imageTag = Base64.getEncoder().encodeToString(imageByte); imageArray.add(imageTag); } return JSONValue.toJSONString(imageArray); }