List of usage examples for org.jsoup.nodes.Element#text()
public String text()
From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java
/**
 * Walks every {@code list_item} row inside the {@code eplist} containers of the given page and
 * appends one {@link MediaEpisode} per recognised row to {@code episodes}.
 *
 * <p>For {@code season == 0} a row is recognised when its text contains "Unknown"; such rows get
 * the season 0 and a running episode number. For any other season the row text must contain an
 * "S&lt;n&gt;, Ep&lt;n&gt;" marker from which season and episode are read.
 *
 * @param season   the season being parsed
 * @param episodes the list receiving the parsed episodes
 * @param doc      the parsed page
 * @return {@code false} as soon as a row reports a season different from a positive
 *         {@code season}; {@code true} once all rows have been processed
 */
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
  final boolean unknownSeason = season == 0;
  final Pattern rowPattern = unknownSeason ? Pattern.compile("Unknown") : Pattern.compile("S([0-9]*), Ep([0-9]*)");
  int runningEpisodeNumber = 0;

  for (Element table : doc.getElementsByClass("eplist")) {
    for (Element row : table.getElementsByClass("list_item")) {
      Matcher rowMatcher = rowPattern.matcher(row.text());
      if (!rowMatcher.find() || (!unknownSeason && rowMatcher.groupCount() < 2)) {
        // not a row carrying episode data
        continue;
      }

      try {
        MediaEpisode episode = new MediaEpisode(providerInfo.getId());

        // season and episode number
        if (unknownSeason) {
          episode.season = season;
          episode.episode = ++runningEpisodeNumber;
        } else {
          episode.season = Integer.parseInt(rowMatcher.group(1));
          episode.episode = Integer.parseInt(rowMatcher.group(2));
        }

        // the page switched to another season: stop here
        if (season > 0 && season != episode.season) {
          return false;
        }

        // title: first title link flagged with itemprop="name"
        Elements titleLinks = row.getElementsByAttributeValueStarting("href", "/title/tt");
        for (Element link : titleLinks) {
          if ("name".equals(link.attr("itemprop"))) {
            episode.title = link.text();
            break;
          }
        }

        // id: last non-null group(1) found in the href of the first title link
        String imdbId = "";
        Matcher idMatcher = IMDB_ID_PATTERN.matcher(titleLinks.get(0).attr("href"));
        while (idMatcher.find()) {
          String candidate = idMatcher.group(1);
          if (candidate != null) {
            imdbId = candidate;
          }
        }
        if (StringUtils.isNotBlank(imdbId)) {
          episode.ids.put(providerInfo.getId(), imdbId);
        }

        // plot
        Element plotElement = row.getElementsByClass("item_description").first();
        if (plotElement != null) {
          episode.plot = plotElement.ownText();
        }

        // rating; the vote count is only looked up when a rating element exists
        Element ratingElement = row.getElementsByClass("ipl-rating-star__rating").first();
        if (ratingElement != null) {
          String ratingText = ratingElement.ownText().replace(",", ".");
          try {
            episode.rating = Float.valueOf(ratingText);
          } catch (Exception ignored) {
          }

          Element votesElement = row.getElementsByClass("ipl-rating-star__total-votes").first();
          if (votesElement != null) {
            String votesText = votesElement.ownText().replaceAll("[.,()]", "").trim();
            try {
              episode.voteCount = Integer.parseInt(votesText);
            } catch (Exception ignored) {
            }
          }
        }

        // release date
        Element airdateElement = row.getElementsByClass("airdate").first();
        if (airdateElement != null) {
          episode.firstAired = airdateElement.ownText();
        }

        // poster: strip the UX/UY/CR size and crop tokens from the image url
        Element imageElement = row.getElementsByTag("img").first();
        if (imageElement != null) {
          String posterUrl = imageElement.attr("src")
              .replaceAll("UX[0-9]{2,4}_", "")
              .replaceAll("UY[0-9]{2,4}_", "")
              .replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
          if (StringUtils.isNotBlank(posterUrl)) {
            MediaArtwork thumb = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(),
                MediaArtwork.MediaArtworkType.THUMB);
            thumb.setPreviewUrl(posterUrl);
            thumb.setDefaultUrl(posterUrl);
            episode.artwork.add(thumb);
          }
        }

        episodes.add(episode);
      } catch (Exception e) {
        LOGGER.warn("failed parsing: " + row.text() + " for ep data; " + e.getMessage());
      }
    }
  }

  return true;
}
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); }/*from w w w . ja va 2s . com*/ // we have 3 entry points here // a) getMetadata has been called with an ofdbId // b) getMetadata has been called with an imdbId // c) getMetadata has been called from a previous search String detailUrl = ""; // case a) and c) if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) { if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) { detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId()); } else { detailUrl = options.getResult().getUrl(); } } // case b) if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) { MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE); searchOptions.setImdbId(options.getId(MediaMetadata.IMDB)); try { List<MediaSearchResult> results = search(searchOptions); if (results != null && !results.isEmpty()) { options.setResult(results.get(0)); detailUrl = options.getResult().getUrl(); } } catch (Exception e) { LOGGER.warn("failed IMDB search: " + e.getMessage()); } } // we can only work further if we got a search result on ofdb.de if (StringUtils.isBlank(detailUrl)) { throw new Exception("We did not get any useful movie url"); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),"); if (StringUtils.isBlank(ofdbId)) { ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)"); } Url url; try { LOGGER.trace("get details page"); url = new Url(detailUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); if (doc.getAllElements().size() < 10) { throw 
new Exception("meh - we did not receive a valid web page"); } // parse details // IMDB ID "http://www.imdb.com/Title?1194173" el = doc.getElementsByAttributeValueContaining("href", "imdb.com"); if (!el.isEmpty()) { md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)")); } // title / year // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" /> el = doc.getElementsByAttributeValue("property", "og:title"); if (!el.isEmpty()) { String[] ty = parseTitle(el.first().attr("content")); md.setTitle(StrgUtils.removeCommonSortableName(ty[0])); try { md.setYear(Integer.parseInt(ty[1])); } catch (Exception ignored) { } } // another year position if (md.getYear() == 0) { // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a> el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr"); try { md.setYear(Integer.parseInt(el.first().text())); } catch (Exception ignored) { } } // original title (has to be searched with a regexp) // <tr valign="top"> // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif" // size="2">Originaltitel:</font></td> // <td> </td> // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif" // size="2"><b>Brave</b></font></td> // </tr> String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>"); if (!originalTitle.isEmpty()) { md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle)); } // Genre: <a href="view.php?page=genre&Genre=Action">Action</a> el = doc.getElementsByAttributeValueContaining("href", "page=genre"); for (Element g : el) { md.addGenre(getTmmGenre(g.text())); } // rating // <div itemtype="http://schema.org/AggregateRating" itemscope // itemprop="aggregateRating">Note: <span // itemprop="ratingValue">6.73</span><meta // itemprop="worstRating" content="1" /> el = doc.getElementsByAttributeValue("itemprop", "ratingValue"); if (!el.isEmpty()) { String r = el.text(); if (!r.isEmpty()) { try { 
md.setRating(Float.parseFloat(r)); } catch (Exception e) { LOGGER.debug("could not parse rating"); } } } // get PlotLink; open url and parse // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a> LOGGER.trace("parse plot"); el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,"); if (!el.isEmpty()) { String plotUrl = BASE_URL + "/" + el.first().attr("href"); try { url = new Url(plotUrl); in = url.getInputStream(); Document plot = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements block = plot.getElementsByClass("Blocksatz"); // first // Blocksatz // is plot String p = block.first().text(); // remove all html stuff p = p.substring(p.indexOf("Mal gelesen") + 12); // remove "header" md.setPlot(p); } catch (Exception e) { LOGGER.error("failed to get plot page: " + e.getMessage()); } } // http://www.ofdb.de/view.php?page=film_detail&fid=226745 LOGGER.debug("parse actor detail"); String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId; doc = null; try { url = new Url(movieDetail); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get detail page: " + e.getMessage()); } if (doc != null) { parseCast(doc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md); parseCast(doc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Synchronstimme (deutsch)"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER, md); parseCast(doc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md); } } catch (Exception e) { LOGGER.error("Error parsing " + detailUrl); throw e; } return md; }
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); String searchUrl = ""; String searchTerm = ""; String imdb = ""; // only title search if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search for everything: " + searchTerm); } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search with title: " + searchTerm); } else {/*w w w . ja v a 2 s . com*/ LOGGER.debug("empty searchString"); return resultList; } searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); Document doc = null; try { Url url = new CachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(searchUrl); } if (doc == null) { return resultList; } // only look for movie links Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php"); LOGGER.debug("found " + filme.size() + " search results"); if (filme.isEmpty()) { if (!doc.getElementsByTag("title").text().contains("Suche nach")) { // redirected to detail page MediaSearchResult msr = new MediaSearchResult(providerInfo.getId()); Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id="); if (el.size() > 0) { msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)")); } 
msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim()); el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { msr.setYear(el.get(0).text()); } resultList.add(msr); } return resultList; } // <a // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1" // class="normLight">Avatar - Aufbruch nach Pandora</B> // <nobr>(2009)</nobr><br /><span class="smallLight" // style="color:#ccc;">Avatar</span></a> // map to merge 2 results :/ Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>(); for (Element a : filme) { try { String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-"); MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); if (res.containsKey(id)) { LOGGER.debug("dupe found; merging with previous searchresult"); sr = res.get(id); } if (StringUtils.isNotEmpty(imdb)) { sr.setIMDBId(imdb); } if (StringUtils.isEmpty(sr.getId())) { sr.setId(id); } if (StringUtils.isEmpty(sr.getTitle())) { if (a.html().contains("nobr")) { sr.setTitle(a.ownText()); } else { sr.setTitle(a.text()); } } LOGGER.debug("found movie " + sr.getTitle()); if (StringUtils.isEmpty(sr.getOriginalTitle())) { sr.setOriginalTitle(a.getElementsByTag("span").text()); } if (StringUtils.isEmpty(sr.getYear())) { sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*")); // any // 4 // digit } sr.setMediaType(MediaType.MOVIE); sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id); // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(), // "images(.*?)\\"")); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle())); } // populate extra args MetadataUtil.copySearchQueryToSearchResult(options, sr); res.put(id, sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } for (String r : 
res.keySet()) { resultList.add(res.get(r)); } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }
From source file:org.tomstools.web.service.SolrService.java
private int doGenerateHot(String flag, SolrTools solrTool, Date beginTime, Date endTime) { if (StringUtils.isEmpty(flag)) { return 0; }/*w w w .j av a2 s. c o m*/ String configSelector = userService.getConfig(-1, "SELECTOR_4_HOT_FLAG_" + flag); String url = userService.getConfig(-1, "URL_4_HOT_FLAG_" + flag); String defaultCharsetName = userService.getConfig(-1, "CHARSET_4_HOT_FLAG_" + flag); PageFetcher fetcher = new PageFetcher(defaultCharsetName); String content = fetcher.fetchPageContent(url); Document document = Jsoup.parse(content); Elements nodes = document.select(configSelector); if (null == nodes) { return 0; } List<Word> words = new ArrayList<Word>(); for (int i = 0; i < nodes.size(); i++) { Element e = nodes.get(i); words.add(new Word(e.text(), 100 / (i + 1))); } // ???? for (Word word : words) { List<String> terms = analyzer.getWords(word.word); if (terms.isEmpty()) { continue; } StringBuilder msg = new StringBuilder(); for (int i = 0; i < terms.size(); i++) { if (i != 0) { msg.append(" AND "); } msg.append(terms.get(i)); } try { long cnt = solrTool.count("text:" + msg.toString(), beginTime, endTime); word.heat += cnt; } catch (Exception e) { LOG.error(e.getMessage(), e); } } // ? if (words.size() > 0) { // //siteMapper.saveHotHis(flag); siteMapper.deleteHot(flag); for (Word word : words) { siteMapper.saveHot(flag, word.word, word.heat); } return words.size(); } else { return 0; } }
From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java
private void processSimilarities(@Nullable Element similarityNode, @NotNull EngineQueryResultBuilder engineQueryResultBuilder) { if (similarityNode == null) { LOGGER.warn("Couldn't find similarity node"); return;// www. j av a 2 s. com } Elements sides = similarityNode.getElementsByTag("side"); for (Element side : sides) { Language sideLanguage = Language.getExistingLanguageById(side.attr("lang")); for (Element word : side.getElementsByTag("word")) { String wordText = word.text(); engineQueryResultBuilder.addSimilarRecommendation( new DictionaryObjectBuilder().setLanguage(sideLanguage).setGeneralForm(wordText).build()); } } }
From source file:org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java
/** * Extract both the general form and all syllables from the oppslagsord-node. *///from w w w .jav a2 s. c o m private void extractGeneralForm(DictionaryObjectBuilder objectBuilder, Element oppslagsord) { String rawForm = oppslagsord.text(); String[] syllabification = StringUtils.split(rawForm, SYLLABLE_SEPARATOR_CHAR); String generalForm = StringUtils.remove(rawForm, SYLLABLE_SEPARATOR_CHAR); objectBuilder.setGeneralForm(generalForm); objectBuilder.setSyllabification(syllabification); }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void findRecommendations(@NotNull Document doc, @NotNull BilingualQueryResultBuilder resultBuilder) { // Determine all candidate nodes: Elements alternativeNodes = doc.select("div.cc > p > *"); Language currentLanguage = null;/*from w w w.j a v a 2 s. c o m*/ for (Element node : alternativeNodes) { // If the next node is a flagicon, try to determine the language for the next entries from the class name if (node.tagName().equals("span") && node.hasClass("flagicon")) { Set<String> classNames = node.classNames(); classNames.remove("flagicon"); for (String className : classNames) { Language candidate = Language.getExistingLanguageById(className); if (candidate != null) { currentLanguage = candidate; break; } } } else if (node.tagName().equals("a")) { String recommendationText = node.text(); DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder(); objectBuilder.setLanguage(currentLanguage).setGeneralForm(recommendationText); resultBuilder.addSimilarRecommendation(objectBuilder.build()); } } }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
@NotNull private DictionaryObject processSingleNode(@NotNull Element element, @NotNull Language language, String queryString) {//from w ww . j a va 2 s .c o m DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder(); objectBuilder.setLanguage(language); // Extract entry text: String context = StringUtils.substringBefore(element.text(), element.getElementsByTag("a").first().text()); String generalForm = context + element.getElementsByTag("a").first().text(); objectBuilder.setGeneralForm(StringUtils.strip(generalForm)); // Extract description: extractDescription(element, queryString, objectBuilder); // Extract gender: extractGender(element, objectBuilder); return objectBuilder.build(); }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void extractDescription(@NotNull Element element, String queryString, DictionaryObjectBuilder objectBuilder) { Element descriptionNode = element.getElementsByClass(CLASS_DESCRIPTION).first(); if (descriptionNode == null) { // Try to detect the description node with an alternative class (necessary for synonyms) descriptionNode = element.getElementsByClass(CLASS_EXTRA_INFO).first(); }/*from w w w .j a v a2 s.c o m*/ if (descriptionNode != null) { String description = descriptionNode.text(); description = StringUtils.removeStart(description, DESCRIPTION_BEGIN); description = StringUtils.removeEnd(description, DESCRIPTION_END); if (!StringUtils.equalsIgnoreCase(description, queryString)) // Set description only if it is different from request string objectBuilder.setDescription(StringUtils.strip(description)); } }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
/**
 * Sets the grammatical gender of the entry when the element has a gender node whose text is a
 * key of the gender map.
 *
 * @param element       the node to inspect
 * @param objectBuilder the builder receiving the gender
 */
private void extractGender(@NotNull Element element, DictionaryObjectBuilder objectBuilder) {
    Element genderNode = element.getElementsByClass(CLASS_GENDER).first();
    if (genderNode == null) {
        return;
    }

    final String genderKey = genderNode.text();
    if (!GENDER_MAP.containsKey(genderKey)) {
        return;
    }
    objectBuilder.setGrammaticalGender(GENDER_MAP.get(genderKey));
}