List of usage examples for org.jsoup.nodes.Element#tagName()
public String tagName()
From source file:org.opens.tanaguru.rules.rgaa30.Rgaa30Rule110102.java
/**
 * Links each label that contains at least one element matching
 * {@code FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY} to its nearest ancestor whose
 * tag name equals {@code FORM_TAG}, and records the pair in
 * {@code labelFormMap}.
 *
 * <p>Labels that have no such ancestor, or that contain no matching element,
 * are left out of the map.
 */
private void putLabelElementHandlerIntoTheMap() {
    for (Element el : labelElementHandler.get()) {
        Element tmpElement = el.parent();
        // parent() yields null once the top of the tree has been passed, so
        // the walk must stop there instead of dereferencing it.
        while (tmpElement != null && StringUtils.isNotBlank(tmpElement.tagName())) {
            if (tmpElement.tagName().equals(FORM_TAG)) {
                // The query only depends on the label, so run it once for
                // both the "form already known" and "new form" cases.
                Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY);
                if (!els.isEmpty()) {
                    if (labelFormMap.containsKey(tmpElement)) {
                        labelFormMap.get(tmpElement).add(el);
                    } else {
                        ElementHandler<Element> labelElement = new ElementHandlerImpl();
                        labelElement.add(el);
                        labelFormMap.put(tmpElement, labelElement);
                    }
                }
                // Only the closest enclosing form is considered.
                break;
            }
            tmpElement = tmpElement.parent();
        }
    }
}
From source file:org.opens.tanaguru.rules.rgaa30.Rgaa30Rule110103.java
/** * This method linked each input on a page to its form in a map. *///from w ww .j a v a 2s .c o m private void putInputElementHandlerIntoTheMap() { for (Element el : inputElementHandler.get()) { Element tmpElement = el.parent(); while (StringUtils.isNotBlank(tmpElement.tagName())) { if (tmpElement.tagName().equals(FORM_TAG)) { if (inputFormMap.containsKey(tmpElement)) { inputFormMap.get(tmpElement).add(el); } else { ElementHandler<Element> inputElement = new ElementHandlerImpl(); inputElement.add(el); inputFormMap.put(tmpElement, inputElement); } break; } tmpElement = tmpElement.parent(); } } }
From source file:org.sbs.goodcrawler.extractor.selector.action.string.ActionFactory.java
public static SelectorAction create(Element element, String c) { if ("string".equals(c)) { StringActionType $type = EnumUtils.getEnum(StringActionType.class, element.attr("operation")); if (null == $type) { try { throw new Exception( "?" + element.tagName() + "operation"); } catch (Exception e) { e.printStackTrace();// w w w . j av a2 s .c om } } switch ($type) { case after: return new StringAfterAction(element.attr("split")); case afterLast: return new StringAfterLastAction(element.attr("split")); case before: return new StringBeforeAction(element.attr("split")); case beforeLast: return new StringBeforeLastAction(element.attr("split")); case between: return new StringBetweenAction(element.attr("exp")); case filter: return new StringFilterAction(element.attr("filter"), element.attr("charType")); case replace: return new StringReplaceAction(element.attr("search"), element.attr("replacement")); case split: return new StringSplitAction(element.attr("split"), element.attr("index")); case sub: return new StringSubAction(element.attr("exp")); case suffix: return new StringSuffixAction(element.attr("suffix")); case perfix: return new StringPerfixAction(element.attr("perfix")); default: break; } } else if ("integer".equals(c) || "int".equals(c)) { IntegerActionType $type = EnumUtils.getEnum(IntegerActionType.class, element.attr("operation")); switch ($type) { case abs: return new IntegerAbsAction(); case between: try { return new IntegerBetweenAction(element.attr("exp"), element.attr("default")); } catch (IntegerBetweenExpressionException e) { e.printStackTrace(); } default: break; } } else if ("date".equals(c)) { } else if ("numerica".equals(c)) { IntegerActionType $type = EnumUtils.getEnum(IntegerActionType.class, element.attr("operation")); switch ($type) { case abs: return new IntegerAbsAction(); case between: try { return new IntegerBetweenAction(element.attr("exp"), element.attr("default")); } catch (IntegerBetweenExpressionException e) { e.printStackTrace(); } 
default: break; } } else if ("file".equals(c)) { FileActionType $type = EnumUtils.getEnum(FileActionType.class, element.attr("operation")); switch ($type) { case download: String dir = element.attr("dir"); String temp = element.attr("fileName"); boolean md5File = false, asyn; if (StringUtils.isNotBlank(temp)) { if ("{md5}".equals(temp)) { md5File = true; } } else md5File = true; temp = element.attr("asyn"); if (StringUtils.isNotBlank(temp)) { asyn = Boolean.parseBoolean(temp); } else { asyn = true; } return new DownLoadFileAction(dir, md5File, asyn); case download_resize: String dir2 = element.attr("dir"); String temp2 = element.attr("fileName"); boolean md5File2 = false, asyn2; if (StringUtils.isNotBlank(temp2)) { if ("{md5}".equals(temp2)) { md5File2 = true; } } else md5File2 = true; temp2 = element.attr("asyn"); if (StringUtils.isNotBlank(temp2)) { asyn2 = Boolean.parseBoolean(temp2); } else { asyn2 = true; } DownLoadImageResizeAction resizeAction = new DownLoadImageResizeAction(dir2, md5File2, asyn2); temp2 = element.attr("width"); if (StringUtils.isNotBlank(temp2)) { resizeAction.setW(Integer.parseInt(temp2)); } temp2 = element.attr("height"); if (StringUtils.isNotBlank(temp2)) { resizeAction.setH(Integer.parseInt(temp2)); } temp2 = element.attr("quality"); if (StringUtils.isNotBlank(temp2)) { resizeAction.setQuality(Float.parseFloat(temp2)); } temp2 = element.attr("del"); if (StringUtils.isNotBlank(temp2)) { resizeAction.setDeleteOldFile(Boolean.parseBoolean(temp2)); } return resizeAction; default: break; } } else { StringActionType $type = EnumUtils.getEnum(StringActionType.class, element.attr("operation")); if (null == $type) { try { throw new Exception( "?" 
+ element.tagName() + "operation"); } catch (Exception e) { e.printStackTrace(); } } switch ($type) { case after: return new StringAfterAction(element.attr("split")); case afterLast: return new StringAfterLastAction(element.attr("split")); case before: return new StringBeforeAction(element.attr("split")); case beforeLast: return new StringBeforeLastAction(element.attr("split")); case between: return new StringBetweenAction(element.attr("exp")); case filter: return new StringFilterAction(element.attr("filter"), element.attr("charType")); case replace: return new StringReplaceAction(element.attr("search"), element.attr("replacement")); case split: return new StringSplitAction(element.attr("split"), element.attr("index")); case sub: return new StringSubAction(element.attr("exp")); case suffix: return new StringSuffixAction(element.attr("suffix")); case perfix: return new StringPerfixAction(element.attr("perfix")); default: break; } } return null; }
From source file:org.sbs.goodcrawler.extractor.selector.factory.ElementCssSelectorFactory.java
/** * <b>Element??Element??select/*from ww w .j a va 2 s . co m*/ * @param element * @return */ @SuppressWarnings("rawtypes") public static AbstractElementCssSelector create(Element element) { String name = element.attr("name"); String value = element.attr("value"); String type = element.attr("type"); String attr = element.attr("attr"); String pattern = element.attr("pattern"); String regex = element.attr("regex"); String required = element.attr("required"); String sIndex = element.attr("index"); boolean isRequired = false; if (StringUtils.isNotBlank(required)) { isRequired = Boolean.parseBoolean(required); } int index = 0; if (StringUtils.isNotBlank(sIndex)) { index = Integer.parseInt(sIndex); } AbstractElementCssSelector selector = ElementCssSelectorFactory.create(name, type, value, attr, isRequired, index, regex, pattern); // ? Elements children = element.children(); for (Element e : children) { if ("action".equals(e.tagName())) { SelectorAction action = ActionFactory.create(e, element.attr("type")); if (action != null) selector.addAction(action); } // ?Url else if ("element".equals(e.tagName())) { ((PageElementSelector) selector).addSelector(create(e)); } } return selector; }
From source file:org.sbs.goodcrawler.jobconf.ExtractConfig.java
/** * ????/* ww w. ja v a 2 s. co m*/ * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) throws ConfigurationException { Elements extractElement = doc.select("extract"); super.jobName = doc.select("job").attr("name"); super.indexName = doc.select("job").attr("indexName"); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } this.templates.add(extractTemplate); } return this; }
From source file:org.tinymediamanager.scraper.anidb.AniDBMetadataProvider.java
/**
 * Adds one actor cast member to the metadata for every child of the given
 * element.
 *
 * <p>Within each child, a {@code name} node supplies the character name and a
 * {@code seiyuu} node supplies the actor name plus, when its {@code picture}
 * attribute is not blank, the image URL.
 *
 * @param md the metadata receiving the cast members
 * @param e  the element whose children describe the characters
 */
private void getActors(MediaMetadata md, Element e) {
    for (Element character : e.children()) {
        MediaCastMember castMember = new MediaCastMember(CastType.ACTOR);

        for (Element info : character.children()) {
            String tag = info.tagName();

            if ("name".equalsIgnoreCase(tag)) {
                castMember.setCharacter(info.text());
            }

            if ("seiyuu".equalsIgnoreCase(tag)) {
                castMember.setName(info.text());
                String picture = info.attr("picture");
                if (StringUtils.isNotBlank(picture)) {
                    castMember.setImageUrl("http://img7.anidb.net/pics/anime/" + picture);
                }
            }
        }

        md.addCastMember(castMember);
    }
}
From source file:org.tinymediamanager.scraper.anidb.AniDBMetadataProvider.java
/**
 * Stores the rating and vote count taken from the first {@code temporary}
 * child that parses successfully.
 *
 * <p>The rating is stored before the {@code count} attribute is parsed, so a
 * child with a valid rating but an invalid count still contributes its rating
 * while the search continues with the next child.
 *
 * @param md the metadata receiving the rating values
 * @param e  the element whose children hold the ratings
 */
private void getRating(MediaMetadata md, Element e) {
    for (Element candidate : e.children()) {
        if (!"temporary".equalsIgnoreCase(candidate.tagName())) {
            continue;
        }
        try {
            md.storeMetadata(MediaMetadata.RATING, Float.parseFloat(candidate.text()));
            md.storeMetadata(MediaMetadata.VOTE_COUNT, Integer.parseInt(candidate.attr("count")));
            break;
        } catch (NumberFormatException ignored) {
            // Unparseable values are skipped; the next child is tried.
        }
    }
}
From source file:org.tinymediamanager.scraper.anidb.AniDBMetadataProvider.java
private List<Episode> parseEpisodes(Document doc) { List<Episode> episodes = new ArrayList<Episode>(); Element anime = doc.child(0); Element eps = null;/*from w w w. j a v a 2 s .c o m*/ // find the "episodes" child for (Element e : anime.children()) { if ("episodes".equalsIgnoreCase(e.tagName())) { eps = e; break; } } if (eps == null) { return episodes; } for (Element e : eps.children()) { // filter out the desired episode if ("episode".equals(e.tagName())) { Episode episode = new Episode(); try { episode.id = Integer.parseInt(e.attr("id")); } catch (NumberFormatException ex) { } for (Element episodeInfo : e.children()) { if ("epno".equalsIgnoreCase(episodeInfo.tagName())) { try { episode.episode = Integer.parseInt(episodeInfo.text()); // looks like anidb is storing anything in a single season, so put 1 to season, if type = 1 if ("1".equals(episodeInfo.attr("type"))) { episode.season = 1; } else { // else - we see them as "specials" episode.season = 0; } } catch (NumberFormatException ex) { } continue; } if ("length".equalsIgnoreCase(episodeInfo.tagName())) { try { episode.runtime = Integer.parseInt(episodeInfo.text()); } catch (NumberFormatException ex) { } continue; } if ("airdate".equalsIgnoreCase(episodeInfo.tagName())) { episode.airdate = episodeInfo.text(); continue; } if ("rating".equalsIgnoreCase(episodeInfo.tagName())) { try { episode.rating = Float.parseFloat(episodeInfo.text()); } catch (NumberFormatException ex) { } continue; } if ("title".equalsIgnoreCase(episodeInfo.tagName())) { try { episode.titles.put(episodeInfo.attr("xml:lang").toLowerCase(), episodeInfo.text()); } catch (Exception ex) { } continue; } if ("summary".equalsIgnoreCase(episodeInfo.tagName())) { episode.summary = episodeInfo.text(); continue; } } episodes.add(episode); } } return episodes; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { LOGGER.debug("search() " + query.toString()); /*//from ww w .j av a 2 s .c o m * IMDb matches seem to come in several "flavours". * * Firstly, if there is one exact match it returns the matching IMDb page. * * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results) * * We should check the Exact match section first, then the poplar titles and finally the partial matches. * * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek" */ Pattern imdbIdPattern = Pattern.compile("/title/(tt[0-9]{7})/"); List<MediaSearchResult> result = new ArrayList<MediaSearchResult>(); String searchTerm = ""; if (StringUtils.isNotEmpty(query.get(SearchParam.IMDBID))) { searchTerm = query.get(SearchParam.IMDBID); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.QUERY); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.get(SearchParam.TITLE); } if (StringUtils.isEmpty(searchTerm)) { return result; } // parse out language and coutry from the scraper options String language = query.get(SearchParam.LANGUAGE); String myear = query.get(SearchParam.YEAR); String country = query.get(SearchParam.COUNTRY); // for passing the country to the scrape searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); StringBuilder sb = new StringBuilder(imdbSite.getSite()); sb.append("find?q="); try { // search site was everytime in UTF-8 sb.append(URLEncoder.encode(searchTerm, "UTF-8")); } catch (UnsupportedEncodingException ex) { // Failed to encode the movie name for some reason! 
LOGGER.debug("Failed to encode search term: " + searchTerm); sb.append(searchTerm); } // we need to search for all - otherwise we do not find TV movies sb.append(CAT_TITLE); LOGGER.debug("========= BEGIN IMDB Scraper Search for: " + sb.toString()); Document doc; try { CachedUrl url = new CachedUrl(sb.toString()); url.addHeader("Accept-Language", getAcceptLanguage(language, country)); doc = Jsoup.parse(url.getInputStream(), "UTF-8", ""); } catch (Exception e) { LOGGER.debug("tried to fetch search response", e); // clear Cache CachedUrl.removeCachedFileForUrl(sb.toString()); return result; } // check if it was directly redirected to the site Elements elements = doc.getElementsByAttributeValue("rel", "canonical"); for (Element element : elements) { MediaMetadata md = null; // we have been redirected to the movie site String movieName = null; String movieId = null; String href = element.attr("href"); Matcher matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // get full information if (!StringUtils.isEmpty(movieId)) { MediaScrapeOptions options = new MediaScrapeOptions(); options.setImdbId(movieId); options.setLanguage(MediaLanguages.valueOf(language)); options.setCountry(CountryCode.valueOf(country)); options.setScrapeCollectionInfo(Boolean.parseBoolean(query.get(SearchParam.COLLECTION_INFO))); options.setScrapeImdbForeignLanguage( Boolean.parseBoolean(query.get(SearchParam.IMDB_FOREIGN_LANGUAGE))); md = getMetadata(options); if (!StringUtils.isEmpty(md.getStringValue(MediaMetadata.TITLE))) { movieName = md.getStringValue(MediaMetadata.TITLE); } } // if a movie name/id was found - return it if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) { MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(md.getStringValue(MediaMetadata.YEAR)); sr.setMetadata(md); sr.setScore(1); // and parse out the poster 
String posterUrl = ""; Element td = doc.getElementById("img_primary"); if (td != null) { Elements imgs = td.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } if (StringUtils.isNotBlank(posterUrl)) { sr.setPosterUrl(posterUrl); } result.add(sr); return result; } } // parse results // elements = doc.getElementsByClass("result_text"); elements = doc.getElementsByClass("findResult"); for (Element tr : elements) { // we only want the tr's if (!"tr".equalsIgnoreCase(tr.tagName())) { continue; } // find the id / name String movieName = ""; String movieId = ""; String year = ""; Elements tds = tr.getElementsByClass("result_text"); for (Element element : tds) { // we only want the td's if (!"td".equalsIgnoreCase(element.tagName())) { continue; } // filter out unwanted results Pattern unwanted = Pattern.compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*"); // stripped out .*\\(Video\\).*| Matcher matcher = unwanted.matcher(element.text()); if (matcher.find()) { continue; } // is there a localized name? 
(aka) String localizedName = ""; Elements italics = element.getElementsByTag("i"); if (italics.size() > 0) { localizedName = italics.text().replace("\"", ""); } // get the name inside the link Elements anchors = element.getElementsByTag("a"); for (Element a : anchors) { if (StringUtils.isNotEmpty(a.text())) { // movie name if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) { // take AKA as title, but only if not EN movieName = localizedName; } else { movieName = a.text(); } // parse id String href = a.attr("href"); matcher = imdbIdPattern.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // try to parse out the year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); matcher = yearPattern.matcher(element.text()); while (matcher.find()) { if (matcher.group(1) != null) { year = matcher.group(1); break; } } break; } } } // if an id/name was found - parse the poster image String posterUrl = ""; tds = tr.getElementsByClass("primary_photo"); for (Element element : tds) { Elements imgs = element.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } // if no movie name/id was found - continue if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) { continue; } MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(year); sr.setPosterUrl(posterUrl); // populate extra args MetadataUtil.copySearchQueryToSearchResult(query, sr); if (movieId.equals(query.get(SearchParam.IMDBID))) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, movieName); if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) { 
LOGGER.debug("no poster - downgrading score by 0.01"); score = score - 0.01f; } if (myear != null && !myear.isEmpty() && !myear.equals("0") && !myear.equals(year)) { LOGGER.debug("parsed year does not match search result year - downgrading score by 0.01"); score = score - 0.01f; } sr.setScore(score); } result.add(sr); // only get 40 results if (result.size() >= 40) { break; } } Collections.sort(result); Collections.reverse(result); return result; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
/** * do the search according to the type/*from w w w . j a va2 s. co m*/ * * @param query * the search params * @return the found results */ protected List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { List<MediaSearchResult> result = new ArrayList<>(); /* * IMDb matches seem to come in several "flavours". * * Firstly, if there is one exact match it returns the matching IMDb page. * * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results) * * We should check the Exact match section first, then the poplar titles and finally the partial matches. * * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek" */ String searchTerm = ""; if (StringUtils.isNotEmpty(query.getImdbId())) { searchTerm = query.getImdbId(); } if (StringUtils.isEmpty(searchTerm)) { searchTerm = query.getQuery(); } if (StringUtils.isEmpty(searchTerm)) { return result; } // parse out language and coutry from the scraper query String language = query.getLanguage().getLanguage(); int myear = query.getYear(); String country = query.getCountry().getAlpha2(); // for passing the country to the scrape searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); StringBuilder sb = new StringBuilder(getImdbSite().getSite()); sb.append("find?q="); try { // search site was everytime in UTF-8 sb.append(URLEncoder.encode(searchTerm, "UTF-8")); } catch (UnsupportedEncodingException ex) { // Failed to encode the movie name for some reason! 
getLogger().debug("Failed to encode search term: " + searchTerm); sb.append(searchTerm); } // we need to search for all - otherwise we do not find TV movies sb.append(getSearchCategory()); getLogger().debug("========= BEGIN IMDB Scraper Search for: " + sb.toString()); Document doc; try { Url url = new Url(sb.toString()); url.addHeader("Accept-Language", getAcceptLanguage(language, country)); doc = Jsoup.parse(url.getInputStream(), "UTF-8", ""); } catch (Exception e) { getLogger().debug("tried to fetch search response", e); return result; } // check if it was directly redirected to the site Elements elements = doc.getElementsByAttributeValue("rel", "canonical"); for (Element element : elements) { MediaMetadata md = null; // we have been redirected to the movie site String movieName = null; String movieId = null; String href = element.attr("href"); Matcher matcher = IMDB_ID_PATTERN.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // get full information if (!StringUtils.isEmpty(movieId)) { MediaScrapeOptions options = new MediaScrapeOptions(type); options.setImdbId(movieId); options.setLanguage(query.getLanguage()); options.setCountry(CountryCode.valueOf(country)); md = getMetadata(options); if (!StringUtils.isEmpty(md.getTitle())) { movieName = md.getTitle(); } } // if a movie name/id was found - return it if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) { MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(), query.getMediaType()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(md.getYear()); sr.setMetadata(md); sr.setScore(1); // and parse out the poster String posterUrl = ""; Elements posters = doc.getElementsByClass("poster"); if (posters != null && !posters.isEmpty()) { Elements imgs = posters.get(0).getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_"); 
posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } if (StringUtils.isNotBlank(posterUrl)) { sr.setPosterUrl(posterUrl); } result.add(sr); return result; } } // parse results // elements = doc.getElementsByClass("result_text"); elements = doc.getElementsByClass("findResult"); for (Element tr : elements) { // we only want the tr's if (!"tr".equalsIgnoreCase(tr.tagName())) { continue; } // find the id / name String movieName = ""; String movieId = ""; int year = 0; Elements tds = tr.getElementsByClass("result_text"); for (Element element : tds) { // we only want the td's if (!"td".equalsIgnoreCase(element.tagName())) { continue; } // filter out unwanted results Pattern unwantedSearchResultPattern = getUnwantedSearchResultPattern(); if (unwantedSearchResultPattern != null) { Matcher matcher = unwantedSearchResultPattern.matcher(element.text()); if (matcher.find()) { continue; } } // is there a localized name? 
(aka) String localizedName = ""; Elements italics = element.getElementsByTag("i"); if (italics.size() > 0) { localizedName = italics.text().replace("\"", ""); } // get the name inside the link Elements anchors = element.getElementsByTag("a"); for (Element a : anchors) { if (StringUtils.isNotEmpty(a.text())) { // movie name if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) { // take AKA as title, but only if not EN movieName = localizedName; } else { movieName = a.text(); } // parse id String href = a.attr("href"); Matcher matcher = IMDB_ID_PATTERN.matcher(href); while (matcher.find()) { if (matcher.group(1) != null) { movieId = matcher.group(1); } } // try to parse out the year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); matcher = yearPattern.matcher(element.text()); while (matcher.find()) { if (matcher.group(1) != null) { try { year = Integer.parseInt(matcher.group(1)); break; } catch (Exception ignored) { } } } break; } } } // if an id/name was found - parse the poster image String posterUrl = ""; tds = tr.getElementsByClass("primary_photo"); for (Element element : tds) { Elements imgs = element.getElementsByTag("img"); for (Element img : imgs) { posterUrl = img.attr("src"); posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_"); posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_"); posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", ""); } } // if no movie name/id was found - continue if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) { continue; } MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(), query.getMediaType()); sr.setTitle(movieName); sr.setIMDBId(movieId); sr.setYear(year); sr.setPosterUrl(posterUrl); if (movieId.equals(query.getImdbId())) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, movieName); if (posterUrl.isEmpty() || 
posterUrl.contains("nopicture")) { getLogger().debug("no poster - downgrading score by 0.01"); score = score - 0.01f; } if (yearDiffers(myear, year)) { float diff = (float) Math.abs(year - myear) / 100; getLogger() .debug("parsed year does not match search result year - downgrading score by " + diff); score -= diff; } sr.setScore(score); } result.add(sr); // only get 40 results if (result.size() >= 40) { break; } } Collections.sort(result); Collections.reverse(result); return result; }