List of usage examples for org.jsoup.nodes Element ownText
public String ownText()
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
MediaMetadata getMovieMetadata(MediaScrapeOptions options) throws Exception { MediaMetadata md = new MediaMetadata(providerInfo.getId()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMediaMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMediaMetadata(); }/*w ww . j a v a 2s . c o m*/ String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(providerInfo.getId(), imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<>(executor); // worker for imdb request (/reference) (everytime from www.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/reference"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); Future<Document> futureReference = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureReference.get(); parseReferencePage(doc, options, md); /* * plot from /plotsummary */ // build the url doc = futurePlotsummary.get(); parsePlotsummaryPage(doc, options, md); // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { Element title = doc.getElementById("tn15title"); if (title != null) { Element element; // title Elements elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.setTitle(movieTitle); } } } // get the release info page Future<Document> futureReleaseinfo; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/releaseinfo"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futureReleaseinfo = compSvcImdb.submit(worker); doc = futureReleaseinfo.get(); // parse original title here!! parseReleaseinfoPageAKAs(doc, options, md); // did we get a release date? if (md.getReleaseDate() == null || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("localReleaseDate")) { parseReleaseinfoPage(doc, options, md); } // get data from tmdb? if (futureTmdb != null && (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo"))) { try { MediaMetadata tmdbMd = futureTmdb.get(); if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") && tmdbMd != null) { // tmdbid md.setId(MediaMetadata.TMDB, tmdbMd.getId(MediaMetadata.TMDB)); // title if (StringUtils.isNotBlank(tmdbMd.getTitle())) { md.setTitle(tmdbMd.getTitle()); } // original title if (StringUtils.isNotBlank(tmdbMd.getOriginalTitle())) { md.setOriginalTitle(tmdbMd.getOriginalTitle()); } // tagline if (StringUtils.isNotBlank(tmdbMd.getTagline())) { md.setTagline(tmdbMd.getTagline()); } // plot if (StringUtils.isNotBlank(tmdbMd.getPlot())) { md.setPlot(tmdbMd.getPlot()); } // collection info if (StringUtils.isNotBlank(tmdbMd.getCollectionName())) { md.setCollectionName(tmdbMd.getCollectionName()); md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); } } if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo") && tmdbMd != null) { md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); md.setCollectionName(tmdbMd.getCollectionName()); } md.setId(tmdbMd.getProviderId(), tmdbMd.getId(tmdbMd.getProviderId())); } catch (Exception ignored) { } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getOriginalTitle())) { md.setOriginalTitle(md.getTitle()); } // populate id md.setId(ImdbMetadataProvider.providerInfo.getId(), imdbId); return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) { /*//w w w.j a v a2 s . c o m * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // title Element title = doc.getElementsByAttributeValue("name", "title").first(); if (title != null) { String movieTitle = cleanString(title.attr("content")); int yearStart = movieTitle.lastIndexOf("("); if (yearStart > 0) { movieTitle = movieTitle.substring(0, yearStart - 1).trim(); md.setTitle(movieTitle); } } // original title and year Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first(); if (originalTitleYear != null) { String content = originalTitleYear.attr("content"); int startOfYear = content.lastIndexOf("("); if (startOfYear > 0) { // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page... // String originalTitle = content.substring(0, startOfYear - 1).trim(); // md.setOriginalTitle(originalTitle); String yearText = content.substring(startOfYear); // search year Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}"); Matcher matcher = yearPattern.matcher(yearText); while (matcher.find()) { if (matcher.group(0) != null) { String movieYear = matcher.group(0); try { md.setYear(Integer.parseInt(movieYear)); break; } catch (Exception ignored) { } } } } } // poster Element poster = doc.getElementsByAttributeValue("property", "og:image").first(); if (poster != null) { String posterUrl = poster.attr("content"); int fileStart = posterUrl.lastIndexOf("/"); if (fileStart > 0) { int parameterStart = posterUrl.indexOf("_", fileStart); if (parameterStart > 0) { int startOfExtension = posterUrl.lastIndexOf("."); if (startOfExtension > parameterStart) { posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension); } } } processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating count Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first(); if (ratingElement != null) { String ratingAsString = ratingElement.ownText().replace(",", "."); try { md.setRating(Float.valueOf(ratingAsString)); } catch (Exception ignored) { } Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first(); if (votesElement != null) { String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim(); try { md.setVoteCount(Integer.parseInt(countAsString)); } catch (Exception ignored) { } } } // top250 Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first(); if (topRatedElement != null) { Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(topRatedElement.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { try { String top250Text = matcher.group(1); md.setTop250(Integer.parseInt(top250Text)); } catch (Exception ignored) { } } } } // releasedate Element releaseDateElement = doc .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo") .first(); if (releaseDateElement != null) { String releaseDateText = releaseDateElement.ownText(); int startOfCountry = releaseDateText.indexOf("("); if (startOfCountry > 0) { releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim(); } try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException ignored) { } } } Elements elements = doc.getElementsByClass("ipl-zebra-list__label"); for (Element element : elements) { // only parse tds if (!"td".equals(element.tag().getName())) { continue; } String elementText = element.ownText(); if (elementText.equals("Taglines")) { if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) { Element taglineElement = element.nextElementSibling(); if (taglineElement != null) { String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.setTagline(tagline); } } } if (elementText.equals("Genres")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/"); for (Element genreElement : genreElements) { String genreText = genreElement.ownText(); md.addGenre(getTmmGenre(genreText)); } } } /* * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) * | 178 min (extended cut)</div></div> */ if (elementText.equals("Runtime")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first(); if (runtimeElement != null) { String first = runtimeElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.setRuntime(runtime); } } } if (elementText.equals("Country")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/"); Pattern pattern = Pattern.compile("/country/(.*)"); for (Element countryElement : countryElements) { Matcher matcher = pattern.matcher(countryElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { md.addCountry(LanguageUtils.getLocalizedCountryForLanguage( options.getLanguage().getLanguage(), countryElement.text(), matcher.group(1))); } else { md.addCountry(matcher.group(1)); } } } } } if (elementText.equals("Language")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements languageElements = nextElement.getElementsByAttributeValueStarting("href", "/language/"); Pattern pattern = Pattern.compile("/language/(.*)"); for (Element languageElement : languageElements) { Matcher matcher = pattern.matcher(languageElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString( options.getLanguage(), languageElement.text(), matcher.group(1))); } else { md.addSpokenLanguage(matcher.group(1)); } } } } } if (elementText.equals("Certification")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { String languageCode = options.getCountry().getAlpha2(); Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=" + languageCode); boolean done = false; for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); done = true; break; } } if (!done && languageCode.equals("DE")) { certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=XWG"); for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); break; } } } } } } // director Element directorsElement = doc.getElementById("directors"); while (directorsElement != null && directorsElement.tag().getName() != "header") { directorsElement = directorsElement.parent(); } if (directorsElement != null) { directorsElement = directorsElement.nextElementSibling(); } if (directorsElement != null) { for (Element directorElement : directorsElement.getElementsByClass("name")) { String director = directorElement.text().trim(); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR); cm.setName(director); md.addCastMember(cm); } } // actors Element castTableElement = doc.getElementsByClass("cast_list").first(); if (castTableElement != null) { Elements tr = castTableElement.getElementsByTag("tr"); for (Element row : tr) { MediaCastMember cm = parseCastMember(row); if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(cm); } } } // writers Element writersElement = doc.getElementById("writers"); while (writersElement != null && writersElement.tag().getName() != "header") { writersElement = writersElement.parent(); } if (writersElement != null) { writersElement = writersElement.nextElementSibling(); } if (writersElement != null) { Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element writerElement : writersElements) { String writer = cleanString(writerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER); cm.setName(writer); md.addCastMember(cm); } } // producers Element producersElement = doc.getElementById("producers"); while (producersElement != null && producersElement.tag().getName() != "header") { producersElement = producersElement.parent(); } if (producersElement != null) { producersElement = producersElement.nextElementSibling(); } if (producersElement != null) { Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element producerElement : producersElements) { String producer = cleanString(producerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER); cm.setName(producer); md.addCastMember(cm); } } // producers Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title"); Element prodCompHeaderElement = null; for (Element possibleProdCompHeaderEl : prodCompHeaderElements) { if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) { prodCompHeaderElement = possibleProdCompHeaderEl; break; } } while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") { prodCompHeaderElement = prodCompHeaderElement.parent(); } if (prodCompHeaderElement != null) { prodCompHeaderElement = prodCompHeaderElement.nextElementSibling(); } if (prodCompHeaderElement != null) { Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href", "/company/"); for (Element prodCompElement : prodCompElements) { String prodComp = prodCompElement.ownText(); md.addProductionCompany(prodComp); } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parsePlotsummaryPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { // imdb.com has another site structure if (getImdbSite() == ImdbSiteDefinition.IMDB_COM) { // first check synopsis content // Element zebraList = doc.getElementById("plot-synopsis-content"); // if (zebraList != null) { // Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); // if (!p.isEmpty()) { // Element em = p.get(0); // if (!"no-synopsis-content".equals(em.id())) { // String plot = cleanString(em.text()); // md.setPlot(plot); // }/* w w w . j a v a 2 s. c o m*/ // } // } // NOPE: synopsis contains spoilers // just take first summary // <li class="ipl-zebra-list__item" id="summary-ps21700000"> // <p>text text text text </p> // <div class="author-container"> // <em>—<a href="/search/title?plot_author=author">Author Name</a></em> // </div> // </li> Element zebraList = doc.getElementById("plot-summaries-content"); if (zebraList != null) { Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); if (!p.isEmpty()) { Element em = p.get(0); // remove author Elements authors = em.getElementsByClass("author-container"); if (!authors.isEmpty()) { authors.get(0).remove(); } if (!"no-summary-content".equals(em.id())) { String plot = cleanString(em.text()); md.setPlot(plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); if (wiki != null) { String plot = cleanString(wiki.ownText()); md.setPlot(plot); } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaCastMember parseCastMember(Element row) { Element nameElement = row.getElementsByAttributeValueStarting("itemprop", "name").first(); if (nameElement == null) { return null; }/*from w w w. j a va 2 s. c o m*/ String name = cleanString(nameElement.ownText()); String characterName = ""; Element characterElement = row.getElementsByClass("character").first(); if (characterElement != null) { characterName = cleanString(characterElement.text()); // and now strip off trailing commentaries like - (120 episodes, // 2006-2014) characterName = characterName.replaceAll("\\(.*?\\)$", "").trim(); } String image = ""; Element imageElement = row.getElementsByTag("img").first(); if (imageElement != null) { String imageSrc = imageElement.attr("loadlate"); if (!StringUtils.isEmpty(imageSrc)) { int fileStart = imageSrc.lastIndexOf("/"); if (fileStart > 0) { // parse out the rescale/crop params int parameterStart = imageSrc.indexOf("._", fileStart); if (parameterStart > 0) { int startOfExtension = imageSrc.lastIndexOf("."); if (startOfExtension > parameterStart) { // rebuild the path - scaled to 632 px height as in tmdb scraper imageSrc = imageSrc.substring(0, parameterStart) + "._UY632" + imageSrc.substring(startOfExtension); } } } image = imageSrc; } } MediaCastMember cm = new MediaCastMember(); cm.setCharacter(characterName); cm.setName(name); cm.setImageUrl(image); return cm; }
From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) { Pattern unknownPattern = Pattern.compile("Unknown"); Pattern seasonEpisodePattern = Pattern.compile("S([0-9]*), Ep([0-9]*)"); int episodeCounter = 0; // parse episodes Elements tables = doc.getElementsByClass("eplist"); for (Element table : tables) { Elements rows = table.getElementsByClass("list_item"); for (Element row : rows) { Matcher matcher = season == 0 ? unknownPattern.matcher(row.text()) : seasonEpisodePattern.matcher(row.text()); if (matcher.find() && (season == 0 || matcher.groupCount() >= 2)) { try { // we found a row containing episode data MediaEpisode ep = new MediaEpisode(providerInfo.getId()); // parse season and ep number if (season == 0) { ep.season = season; ep.episode = ++episodeCounter; } else { ep.season = Integer.parseInt(matcher.group(1)); ep.episode = Integer.parseInt(matcher.group(2)); }//from w w w .ja v a 2 s . co m // check if we have still valid data if (season > 0 && season != ep.season) { return false; } // get ep title and id Elements anchors = row.getElementsByAttributeValueStarting("href", "/title/tt"); for (Element anchor : anchors) { if ("name".equals(anchor.attr("itemprop"))) { ep.title = anchor.text(); break; } } String id = ""; Matcher idMatcher = IMDB_ID_PATTERN.matcher(anchors.get(0).attr("href")); while (idMatcher.find()) { if (idMatcher.group(1) != null) { id = idMatcher.group(1); } } if (StringUtils.isNotBlank(id)) { ep.ids.put(providerInfo.getId(), id); } // plot Element plot = row.getElementsByClass("item_description").first(); if (plot != null) { ep.plot = plot.ownText(); } // rating and rating count Element ratingElement = row.getElementsByClass("ipl-rating-star__rating").first(); if (ratingElement != null) { String ratingAsString = ratingElement.ownText().replace(",", "."); try { ep.rating = Float.valueOf(ratingAsString); } catch (Exception ignored) { } Element votesElement = row.getElementsByClass("ipl-rating-star__total-votes").first(); if (votesElement != null) { String countAsString = votesElement.ownText().replaceAll("[.,()]", "").trim(); try { ep.voteCount = Integer.parseInt(countAsString); } catch (Exception ignored) { } } } // release date Element releaseDate = row.getElementsByClass("airdate").first(); if (releaseDate != null) { ep.firstAired = releaseDate.ownText(); } // poster Element image = row.getElementsByTag("img").first(); if (image != null) { String posterUrl = image.attr("src"); posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", ""); posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", ""); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); if (StringUtils.isNotBlank(posterUrl)) { MediaArtwork ma = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(), MediaArtwork.MediaArtworkType.THUMB); ma.setPreviewUrl(posterUrl); ma.setDefaultUrl(posterUrl); ep.artwork.add(ma); } } episodes.add(ep); } catch (Exception e) { LOGGER.warn("failed parsing: " + row.text() + " for ep data; " + e.getMessage()); } } } } return true; }
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); String searchUrl = ""; String searchTerm = ""; String imdb = ""; // only title search if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search for everything: " + searchTerm); } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search with title: " + searchTerm); } else {/*from w w w .j a v a2 s.c o m*/ LOGGER.debug("empty searchString"); return resultList; } searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); Document doc = null; try { Url url = new CachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(searchUrl); } if (doc == null) { return resultList; } // only look for movie links Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php"); LOGGER.debug("found " + filme.size() + " search results"); if (filme.isEmpty()) { if (!doc.getElementsByTag("title").text().contains("Suche nach")) { // redirected to detail page MediaSearchResult msr = new MediaSearchResult(providerInfo.getId()); Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id="); if (el.size() > 0) { msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)")); } msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim()); el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { msr.setYear(el.get(0).text()); } resultList.add(msr); } return resultList; } // <a // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1" // class="normLight">Avatar - Aufbruch nach Pandora</B> // <nobr>(2009)</nobr><br /><span class="smallLight" // style="color:#ccc;">Avatar</span></a> // map to merge 2 results :/ Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>(); for (Element a : filme) { try { String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-"); MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); if (res.containsKey(id)) { LOGGER.debug("dupe found; merging with previous searchresult"); sr = res.get(id); } if (StringUtils.isNotEmpty(imdb)) { sr.setIMDBId(imdb); } if (StringUtils.isEmpty(sr.getId())) { sr.setId(id); } if (StringUtils.isEmpty(sr.getTitle())) { if (a.html().contains("nobr")) { sr.setTitle(a.ownText()); } else { sr.setTitle(a.text()); } } LOGGER.debug("found movie " + sr.getTitle()); if (StringUtils.isEmpty(sr.getOriginalTitle())) { sr.setOriginalTitle(a.getElementsByTag("span").text()); } if (StringUtils.isEmpty(sr.getYear())) { sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*")); // any // 4 // digit } sr.setMediaType(MediaType.MOVIE); sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id); // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(), // "images(.*?)\\"")); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle())); } // populate extra args MetadataUtil.copySearchQueryToSearchResult(options, sr); res.put(id, sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } for (String r : res.keySet()) { resultList.add(res.get(r)); } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }
From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java
private void parseCategory(Element c, Topic parent, TopicMap t) throws TopicMapException, ParseException { Topic cTopic = parent;//from ww w.j av a 2 s . c o m Elements children = c.children(); for (Element child : children) { if (child.tagName().equals("h3")) { String cLocator = parent.getSubjectLocator().toString(); cLocator += "/" + urlEncode(child.html()); String cName = child.ownText(); cTopic = getOrCreateTopic(t, cLocator); cTopic.setSubjectLocator(new Locator(cLocator)); cTopic.setBaseName(cName + " (Bookmark)"); cTopic.setDisplayName(LANG, cName); makeSubclassOf(t, cTopic, parent); } } for (Element child : children) { if (!child.tagName().equals("dl")) continue; for (Element grandChild : child.children()) { if (!grandChild.tagName().equals("dt")) continue; for (Element ggChild : grandChild.children()) { if (ggChild.tagName().equals("a")) parseItem(ggChild, cTopic, t); else if (ggChild.tagName().equals("dl")) parseCategory(grandChild, cTopic, t); } } } }
From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java
private void parseItem(Element i, Topic parent, TopicMap t) throws TopicMapException, ParseException { String cLocator = i.attr("href"); Topic iTopic = getOrCreateTopic(t, cLocator); iTopic.setSubjectLocator(new Locator(cLocator)); iTopic.setBaseName(i.ownText() + " (Bookmark)"); iTopic.setDisplayName(LANG, i.ownText()); iTopic.addType(parent);// w ww . j av a2 s . co m String attr; Topic type; for (int j = 0; j < itemProps.length; j++) { type = getOrCreateTopic(t, itemProps[j][0], itemProps[j][2]); attr = i.attr(itemProps[j][1]); if (attr.length() > 0) iTopic.setData(type, langTopic, attr); } long timeStamp; for (int j = 0; j < itemTimeProps.length; j++) { type = getOrCreateTopic(t, itemTimeProps[j][0], itemTimeProps[j][2]); attr = i.attr(itemTimeProps[j][1]); if (attr.length() > 0) { timeStamp = Integer.parseInt(attr); timeStamp *= 1000; iTopic.setData(type, langTopic, df.format(new Date(timeStamp))); } } }
From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java
/** * Returns the actual text of the innermost child element for this cell. * //from w w w. j a va 2 s . com * @param element * * @return The text to be output for this Cell. */ public String getElementText(Element element) { String text = element.ownText(); for (Element child : element.children()) { text = child.ownText(); } return text; }
From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java
/** * // w w w. j a v a 2 s . c om * @param element * * @return */ public Double getNumericValue(Element element) { Double numericValue = null; if (!element.hasAttr(DATA_TEXT_CELL)) try { numericValue = NumberFormat.getInstance().parse(element.ownText()).doubleValue(); } catch (ParseException e) { } return numericValue; }