List of usage examples for the text() method of org.jsoup.nodes.Element
public String text()
From source file:org.seo.rank.impl.BaiduCopyChecker.java
/**
 * Fetches the search-result page at {@code url} and collects the resolved URLs of
 * results whose host belongs to a different domain than the given article.
 *
 * <p>Two hosts are treated as the same domain when their last two dot-separated
 * labels match, compared label by label and ignoring case.
 *
 * @param url     address of the search-result page to download
 * @param article article whose title and URL the results are compared against
 * @return the resolved URLs of matching results hosted on another domain; empty when
 *         nothing matched or when any exception occurred (the exception is logged)
 */
private Set<String> doCheck(String url, Article article) {
    Set<String> data = new HashSet<>();
    try {
        Document document = Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Host", HOST)
                .header("Referer", REFERER)
                .header("User-Agent", USER_AGENT)
                .get();
        String titleCssQuery = "html body div div div div div h3.t a";
        Elements elements = document.select(titleCssQuery);
        int i = 0;
        for (Element element : elements) {
            String _title = element.text();
            if (StringUtils.isBlank(_title)) {
                continue;
            }
            i++;
            LOGGER.debug(i + ":" + _title);
            // NOTE(review): contains("") is true for every string, so as written every
            // result is skipped here. The literal looks like it lost its original
            // (probably non-ASCII) text - confirm against the upstream source.
            if (_title.contains("") || !contains(_title, article.getTitle())) {
                LOGGER.debug("?");
                continue;
            }
            String href = element.attr("href");
            href = UrlTools.normalizeUrl(url, href);
            String realUrl = urlConvert(href);
            LOGGER.debug("url:" + url);
            LOGGER.debug("realUrl:" + realUrl);
            String[] target = new URL(realUrl).getHost().split("\\.");
            String[] source = new URL(article.getUrl()).getHost().split("\\.");
            if (target.length > 1 && source.length > 1) {
                // Compare the two trailing labels separately: concatenating them made
                // e.g. "xc.om" and "x.com" look identical ("xcom"). Host names are
                // case-insensitive, so ignore case as well.
                boolean sameDomain =
                        target[target.length - 2].equalsIgnoreCase(source[source.length - 2])
                        && target[target.length - 1].equalsIgnoreCase(source[source.length - 1]);
                if (!sameDomain) {
                    data.add(realUrl);
                }
            }
        }
    } catch (Exception ex) {
        LOGGER.error("?", ex);
    }
    return data;
}
From source file:org.seo.rank.impl.BaiduRanker.java
/** * ?/*from ww w. java 2 s .co m*/ * @param rank */ private void searchBaiduIndex(Rank rank) { String url = "url:" + rank.getUrl(); url = "http://www.baidu.com/s?wd=" + url; LOGGER.debug(url); try { Document document = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION) .header("User-Agent", USER_AGENT).header("Host", HOST).get(); String notFoundCssQuery = "html body div div div div div p"; Elements elements = document.select(notFoundCssQuery); for (Element element : elements) { String text = element.text(); if (text.contains("") && text.contains("")) { // LOGGER.debug(""); rank.setIndex(false); return; } } String numberCssQuery = "html body div div div div.nums"; elements = document.select(numberCssQuery); for (Element element : elements) { String text = element.text(); if (text.equals("1")) { // LOGGER.debug(""); rank.setIndex(true); return; } } } catch (IOException ex) { LOGGER.error("?", ex); } LOGGER.debug(""); }
From source file:org.seo.rank.impl.BaiduRanker.java
/**
 * Downloads one search-result page and returns the position of the rank's URL in it.
 *
 * <p>Only results with a non-blank title are counted. A result is considered a hit
 * when its title contains the rank's keyword and its resolved URL equals the rank's URL.
 *
 * @param url  address of the search-result page to download
 * @param rank supplies the target URL and the keyword to look for
 * @return the 1-based position of the first hit among the counted results, or
 *         {@code -1} if there is no hit or an exception occurred (the exception is logged)
 */
private int searchBaiduRank(String url, Rank rank) {
    String targetUrl = rank.getUrl();
    try {
        Document page = Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Host", HOST)
                .header("Referer", REFERER)
                .header("User-Agent", USER_AGENT)
                .get();
        Elements links = page.select("html body div div div div div h3.t a");
        int position = 0;
        for (Element link : links) {
            String title = link.text();
            if (StringUtils.isBlank(title)) {
                continue;
            }
            position++;
            LOGGER.debug(position + ":" + title);
            if (title.contains(rank.getKeyword())) {
                String normalized = UrlTools.normalizeUrl(url, link.attr("href"));
                String realUrl = urlConvert(normalized);
                LOGGER.debug("url:" + url);
                LOGGER.debug("realUrl:" + realUrl);
                LOGGER.debug("targetUrl:" + targetUrl);
                if (targetUrl.equals(realUrl)) {
                    return position;
                }
            } else {
                LOGGER.debug("???");
            }
        }
    } catch (Exception ex) {
        LOGGER.error("?", ex);
    }
    return -1;
}
From source file:org.seo.rank.list.impl.DefaultParser.java
@Override public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) { List<Article> articles = new ArrayList<>(); try {/*from w w w . j a v a 2 s . c o m*/ Document document = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION) .header("User-Agent", USER_AGENT).get(); Elements elements = document.select(titleCssQuery); for (Element element : elements) { String title = element.text(); String href = element.attr("href"); if (!StringUtils.isBlank(title) && !StringUtils.isBlank(href)) { href = UrlTools.normalizeUrl(url, href); Article article = new Article(); article.setTitle(title); article.setUrl(href); articles.add(article); } else { LOGGER.info("?" + url + " title:" + title + ", href:" + href); } } //?? String nextPageUrl = getNextPageUrl(document, nextPageCssQuery, nextPageText); LOGGER.debug("" + nextPageUrl); if (nextPageUrl != null) { nextPageUrl = UrlTools.normalizeUrl(url, nextPageUrl); LOGGER.debug("?" + nextPageUrl); //? List<Article> result = parse(nextPageUrl, nextPageCssQuery, nextPageText, titleCssQuery); articles.addAll(result); } else { LOGGER.info("??" + url); } } catch (Exception e) { LOGGER.error("?" + url, e); } return articles; }
From source file:org.seo.rank.list.impl.DefaultParser.java
/** * ??/* ww w . j a va2 s . co m*/ * @param document * @param nextPageCssQuery ?CSS * @param nextPageText CSS * @return ? */ private String getNextPageUrl(Document document, String nextPageCssQuery, String nextPageText) { Elements elements = document.select(nextPageCssQuery); for (Element element : elements) { String text = element.text(); LOGGER.debug(text); if (text != null && nextPageText.trim().equals(text.trim())) { String href = element.attr("href"); return href; } } return null; }
From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java
/** * Search for movies at aebn.net./*from ww w .j av a 2s.co m*/ * */ @Override public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { LOGGER.debug("AEBN: search() {}", query); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); Elements movies = null; String searchString = ""; // Search for query if (StringUtils.isNotEmpty(query.get(MediaSearchOptions.SearchParam.QUERY))) { searchString = query.get(MediaSearchOptions.SearchParam.QUERY); } // Search String searchUrl = BASE_DATAURL + "/dispatcher/fts?userQuery=" + URLEncoder.encode(cleanSearchQuery(searchString), "UTF-8") + "&targetSearchMode=basic&isAdvancedSearch=true&isFlushAdvancedSearchCriteria=false" + "&count=" + SEARCH_COUNT.toString() + "&imageType=Large&sortType=Relevance"; try { LOGGER.info("========= BEGIN AEBN Scraper Search for: {}", searchString); Url url = new Url(searchUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); // only look for movie links like // <a id="FTSMovieSearch_link_title_detail_30" ... 
</a> movies = doc.getElementsByAttributeValueMatching("id", "FTSMovieSearch_link_title_detail_\\d+"); LOGGER.debug("AEBN: found {} search results", movies.size()); } catch (Exception e) { LOGGER.error("AEBN: failed to search for {}: ", searchString, e); } if (movies == null || movies.isEmpty()) { LOGGER.debug("AEBN: no movie found"); return resultList; } // there are search results, so fill media data structure HashSet<String> foundResultUrls = new HashSet<String>(); for (Element anchor : movies) { try { String movieUrl = BASE_DATAURL + StrgUtils.substr(anchor.toString(), "href=\\\"(.*?)\\\""); String movieId = StrgUtils.substr(anchor.toString(), "movieId=(\\d+)"); String movieName = StringEscapeUtils.unescapeHtml4(anchor.text()); String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + movieId + "_160w.jpg"; LOGGER.debug("AEBN: found movie {} (id{})", movieName, movieId); // check if it is a valid AEBN id if (!isValidAebnId(Integer.parseInt(movieId))) { LOGGER.error("AEBN: id({}) is not a valid aebn id", movieId); } MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setId(movieId); sr.setIMDBId(""); sr.setTitle(movieName); sr.setOriginalTitle(movieName); // sr.setYear not possible, no data at this point sr.setYear(null); sr.setMediaType(MediaType.MOVIE); sr.setUrl(movieUrl); sr.setPosterUrl(posterUrl); // compare score based on names float score = MetadataUtil.calculateScore(searchString, movieName); if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) { LOGGER.debug("AEBN: no poster - downgrading score by 0.01"); score = score - 0.01f; } sr.setScore(score); // check if result has at least a title and id if (StringUtils.isBlank(sr.getTitle()) || StringUtils.isBlank(sr.getId())) { LOGGER.warn("AEBN: no title nor id, skipping"); continue; } // check if the movie has been already added to the search results if (foundResultUrls.contains(sr.getUrl())) { continue; } foundResultUrls.add(sr.getUrl()); // populate extra arguments 
(deprecated) // MetadataUtil.copySearchQueryToSearchResult(query, sr); resultList.add(sr); } catch (Exception e) { LOGGER.warn("AEBN: error parsing search result: {}", e); } } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }
From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java
/** * Get movie meta data from aebn.net.//from w ww .j a va2 s .co m * */ @Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("AEBN: getMetadata() {}", options); // check if there is already meta data present in the result if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) { LOGGER.debug("AEBN: return metadata from cache"); return options.getResult().getMediaMetadata(); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); Elements elements = null; Element element = null; Integer aebnId = 0; // get AebnId from previous search result if ((options.getResult() != null) && (options.getResult().getId() != null)) { aebnId = Integer.parseInt(options.getResult().getId()); LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId); // preset some values from search result (if there is one) // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy". md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, StrgUtils.removeCommonSortableName(options.getResult().getTitle())); } // or get AebnId from options if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) { LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID)); aebnId = Integer.parseInt(options.getId(AEBNID)); } if (!isValidAebnId(aebnId)) { LOGGER.warn("AEBN: no or incorrect aebnId, aborting"); return md; } // ID md.setId(providerInfo.getId(), aebnId); LOGGER.debug("AEBN: aebnId({})", aebnId); // Base download url for data scraping String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId; String locale = options.getLanguage().name(); if (!StringUtils.isBlank(locale)) { downloadUrl = downloadUrl + "&locale=" + locale; LOGGER.debug("AEBN: used locale({})", locale); } // begin download and scrape try { LOGGER.debug("AEBN: 
download movie detail page"); Url url = new Url(downloadUrl); InputStream in = url.getInputStream(); Document document = Jsoup.parse(in, "UTF-8", ""); in.close(); // Title // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1> LOGGER.debug("AEBN: parse title"); elements = document.getElementsByAttributeValue("class", "md-movieTitle"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieTitle = cleanString(element.text()); LOGGER.debug("AEBN: title({})", movieTitle); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // Poster // front cover: // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg"; md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl); // Fanart/Background // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg // <img class="sceneThumbnail" alt="Scene Thumbnail" title="Scene Thumbnail" onError="..." // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." 
/> LOGGER.debug("AEBN: parse fanart / scene thumbs"); elements = document.getElementsByAttributeValue("class", "SceneThumbnail"); LOGGER.debug("AEBN: {} elements found", elements.size()); int i = 1; for (Element anchor : elements) { String backgroundUrl = anchor.attr("src"); LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl); md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl); i++; } // Runtime LOGGER.debug("AEBN: parse runtime"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieRuntime = cleanString(element.attr("content")); movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M"); LOGGER.debug("AEBN: runtime({})", movieRuntime); md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime); } // Year LOGGER.debug("AEBN: parse year"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieYear = cleanString(element.attr("content")); movieYear = StrgUtils.substr(movieYear, "(\\d+)-"); LOGGER.debug("AEBN: year({})", movieYear); md.storeMetadata(MediaMetadata.YEAR, movieYear); } // Series (Collection) LOGGER.debug("AEBN: parse collection"); elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieCollection = cleanString(element.text()); // Fake a TMDB_SET based on the hash value of the collection name int movieCollectionHash = movieCollection.hashCode(); md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection); md.storeMetadata(MediaMetadata.TMDB_SET, 
movieCollectionHash); LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash); } // Studio LOGGER.debug("AEBN: parse studio"); elements = document.getElementsByAttributeValue("id", "md-details") .select("[itemprop=productionCompany]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String movieStudio = cleanString(elements.first().text()); LOGGER.debug("AEBN: studio({})", movieStudio); md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio); } // Genre LOGGER.debug("AEBN: parse genre"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]"); for (Element g : elements) { md.addGenre(getTmmGenre(g.text())); } // add basic genre, since all genres at AEBN could be summarised // into this one md.addGenre(MediaGenres.EROTIC); // Certification // no data scrapeable---but obviously it's adult only, so simply // generate it String movieCertification = null; Certification certification = null; String country = options.getCountry().getAlpha2(); LOGGER.debug("AEBN: generate certification for {}", country); // @formatter:off if (country.equals("DE")) { movieCertification = "FSK 18"; } if (country.equals("US")) { movieCertification = "NC-17"; } if (country.equals("GB")) { movieCertification = "R18"; } if (country.equals("FR")) { movieCertification = "18"; } if (country.equals("ES")) { movieCertification = "PX"; } if (country.equals("JP")) { movieCertification = "R18+"; } if (country.equals("IT")) { movieCertification = "V.M.18"; } if (country.equals("NL")) { movieCertification = "16"; } // @formatter:on certification = Certification.getCertification(options.getCountry(), movieCertification); if (certification != null) { LOGGER.debug("AEBN: certification({})", certification); md.addCertification(certification); } // Plot and Tagline LOGGER.debug("AEBN: parse plot"); elements = document.getElementsByAttributeValue("id", 
"md-details").select("[itemprop=about]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String moviePlot = cleanString(elements.first().text()); md.storeMetadata(MediaMetadata.PLOT, moviePlot); // no separate tagline available, so extract the first sentence // from the movie plot String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])"); LOGGER.debug("AEBN: tagline(" + movieTagline + ")"); md.storeMetadata(MediaMetadata.TAGLINE, movieTagline); } // Actors LOGGER.debug("AEBN: parse actors"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]"); LOGGER.debug("AEBN: {} actors found", elements.size()); for (Element anchor : elements) { String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)"); String actorname = cleanString(anchor.select("[itemprop=name]").first().text()); String actordetailsurl = BASE_DATAURL + anchor.attr("href"); if (!actorname.isEmpty()) { LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname, actordetailsurl); MediaCastMember cm = new MediaCastMember(); cm.setType(MediaCastMember.CastType.ACTOR); cm.setName(actorname); if (!actorid.isEmpty()) { cm.setId(actorid); } // Actor detail page try { Url starurl = new Url(actordetailsurl); InputStream starurlstream = starurl.getInputStream(); Document stardocument = Jsoup.parse(starurlstream, "UTF-8", ""); starurlstream.close(); Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo"); if (elements2.size() == 0) { LOGGER.debug("AEBN: no additional actor details found"); } else { // Actor image String actorimage = elements2.select("[itemprop=image]").first().attr("src"); LOGGER.debug("AEBN: actor image({})", actorimage); if (!actorimage.isEmpty()) { cm.setImageUrl(actorimage); } // Actor 'fanart' images // unsure if this is ever shown in tmm elements2 = stardocument.getElementsByAttributeValue("class", "StarDetailGallery") .select("a"); 
LOGGER.debug("AEBN: {} gallery images found", elements2.size()); for (Element thumbnail : elements2) { LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href")); cm.addFanart(thumbnail.attr("href")); } } } catch (Exception e) { LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e); } md.addCastMember(cm); } } // Director LOGGER.debug("AEBN: parse director"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)"); String directorname = cleanString(elements.select("[itemprop=name]").first().text()); if (!directorname.isEmpty()) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(directorname); if (!directorid.isEmpty()) { cm.setId(directorid); } cm.setImageUrl(""); md.addCastMember(cm); LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname); } } // Original Title // if we have no original title, just copy the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } } catch (Exception e) { LOGGER.error("AEBN: Error parsing {}: {}", options.getResult().getUrl(), e); } return md; }
From source file:org.tinymediamanager.scraper.anidb.AniDBMetadataProvider.java
/**
 * Adds one actor cast member to the meta data for every child of the given element.
 *
 * <p>For each child, its own children are inspected by tag name: a {@code name} tag
 * supplies the character, a {@code seiyuu} tag supplies the actor name and, when its
 * {@code picture} attribute is not blank, the image URL.
 *
 * @param md the meta data receiving the cast members
 * @param e  the element whose children describe the characters
 */
private void getActors(MediaMetadata md, Element e) {
    for (Element character : e.children()) {
        MediaCastMember castMember = new MediaCastMember(CastType.ACTOR);
        for (Element detail : character.children()) {
            String tag = detail.tagName();
            if ("name".equalsIgnoreCase(tag)) {
                castMember.setCharacter(detail.text());
            }
            if ("seiyuu".equalsIgnoreCase(tag)) {
                castMember.setName(detail.text());
                String picture = detail.attr("picture");
                if (StringUtils.isNotBlank(picture)) {
                    castMember.setImageUrl("http://img7.anidb.net/pics/anime/" + picture);
                }
            }
        }
        md.addCastMember(castMember);
    }
}
From source file:org.tinymediamanager.scraper.anidb.AniDBMetadataProvider.java
/**
 * Stores rating and vote count from the first usable {@code temporary} child of the
 * given element.
 *
 * <p>The rating is parsed from the child's text and the vote count from its
 * {@code count} attribute. A child whose values cannot be parsed is passed over
 * silently and the next child is tried; a rating stored before the count failed to
 * parse stays stored.
 *
 * @param md the meta data receiving rating and vote count
 * @param e  the element whose children are searched
 */
private void getRating(MediaMetadata md, Element e) {
    for (Element candidate : e.children()) {
        if (!"temporary".equalsIgnoreCase(candidate.tagName())) {
            continue;
        }
        try {
            float ratingValue = Float.parseFloat(candidate.text());
            md.storeMetadata(MediaMetadata.RATING, ratingValue);
            int voteCount = Integer.parseInt(candidate.attr("count"));
            md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount);
            break;
        } catch (NumberFormatException ignored) {
            // unparsable values: fall through and try the next child
        }
    }
}
From source file:org.tinymediamanager.scraper.anidb.AniDBMetadataProvider.java
private void parseTitle(MediaMetadata md, String langu, Element e) { String titleEN = ""; String titleScraperLangu = ""; String titleFirst = ""; for (Element title : e.children()) { // store first title if neither the requested one nor the english one available if (StringUtils.isBlank(titleFirst)) { titleFirst = title.text(); }/*from ww w . j a v a 2 s .co m*/ // store the english one for fallback if ("en".equalsIgnoreCase(title.attr("xml:lang"))) { titleEN = title.text(); } // search for the requested one if (langu.equalsIgnoreCase(title.attr("xml:lang"))) { titleScraperLangu = title.text(); } } if (StringUtils.isNotBlank(titleScraperLangu)) { md.storeMetadata(MediaMetadata.TITLE, titleScraperLangu); } else if (StringUtils.isNotBlank(titleEN)) { md.storeMetadata(MediaMetadata.TITLE, titleEN); } else { md.storeMetadata(MediaMetadata.TITLE, titleFirst); } }