List of usage examples for org.jsoup.nodes.Element#getElementsByAttributeValue
public Elements getElementsByAttributeValue(String key, String value)
From source file:com.salsaberries.narchiver.Trawler.java
/** * Logs into the site.//from w w w .j a v a 2s .co m * * @return * @throws TrawlException */ private boolean login() throws TrawlException { --loginAttempts; if (loginAttempts < 0) { logger.error("Warning! Exceeded maximum number of login attempts! Program is now exiting."); throw new TrawlException("Maximum login attempts exceeded."); } logger.info("Attempting to log in at " + baseURL + site.getString("LOGIN_URL")); try { // follow redirects until you get it right HttpRequest httpRequest; HttpMessage httpGet; String url = baseURL + site.getString("LOGIN_URL"); while (true) { httpGet = new HttpMessage(HttpType.GET); httpGet.setUrl(url); httpGet.initializeDefaultHeaders(site); httpGet.addCookieHeaders(cookies); httpRequest = new HttpRequest(httpGet); if (httpRequest.getStatusCode() != 200) { getTempCookies(httpRequest.getHeaders()); // Find the header I want boolean found = false; for (Header h : httpRequest.getHeaders()) { if (h.getName().equals("Location")) { url = h.getValue(); found = true; } } if (!found) { throw new TrawlException("Redirect loop."); } } else { break; } } // Get headers ArrayList<Header> headers = httpRequest.getHeaders(); // Parse the cookies getTempCookies(headers); String body = httpRequest.getHtml(); Document doc = Jsoup.parse(body); Elements logins = doc.getElementsByAttributeValue("action", site.getString("LOGIN_SUBMIT")); if (logins.isEmpty()) { logins = doc.getElementsByAttributeValue("action", site.getString("BASE_URL") + site.getString("LOGIN_SUBMIT")); } if (logins.isEmpty()) { logins = doc.getElementsByAttributeValue("method", "POST"); } if (logins.isEmpty()) { throw new TrawlException("Failed to find login form!"); } if (logins.size() > 1) { logger.warn("Found multiple login forms. 
Picking the first one..."); } Element login = logins.get(0); // Extract the captcha image if appropriate String captchaResult = ""; if (!site.getString("CAPTCHA").equals("")) { // Download the captcha image HttpMessage getCaptcha = new HttpMessage(HttpType.GET); getCaptcha.setImage(true); if (!site.isNull("CAPTCHA_IMAGE")) { getCaptcha.setUrl(baseURL + site.getString("CAPTCHA_IMAGE")); getCaptcha.initializeDefaultImageHeaders(site); getCaptcha.addHeader(new Header("Referrer", baseURL + site.getString("LOGIN_URL"))); getCaptcha.addCookieHeaders(cookies); // Send it to deathbycaptcha SocketClient client = new SocketClient("njanetos", "2point7182"); HttpRequest image = new HttpRequest(getCaptcha); ByteArrayOutputStream os = new ByteArrayOutputStream(); ImageIO.write(image.getImage(), "png", os); Captcha result = client.decode(os.toByteArray()); captchaResult = result.toString(); } else { // Just try to get the image Elements captchas = login.getElementsByTag("img"); if (captchas.size() != 1) { throw new TrawlException( "Failed to find captcha, but the initialization file says there should be one."); } Element captchaImage = captchas.get(0); // Does it contain base64? 
if (captchaImage.attr("src").contains("base64")) { String src = captchaImage.attr("src").split(",")[1]; byte image[] = Base64.decodeBase64(src); ByteArrayOutputStream os = new ByteArrayOutputStream(); os.write(image); SocketClient client = new SocketClient("njanetos", "2point7182"); Captcha result = client.decode(os.toByteArray()); captchaResult = result.toString(); } else { if (captchaImage.attr("src").contains(baseURL)) { getCaptcha.setUrl(captchaImage.attr("src")); } else { getCaptcha.setUrl(baseURL + captchaImage.attr("src")); } getCaptcha.initializeDefaultImageHeaders(site); getCaptcha.addHeader(new Header("Referrer", baseURL + site.getString("LOGIN_URL"))); getCaptcha.addCookieHeaders(cookies); // Send it to deathbycaptcha SocketClient client = new SocketClient("njanetos", "2point7182"); HttpRequest image = new HttpRequest(getCaptcha); ByteArrayOutputStream os = new ByteArrayOutputStream(); ImageIO.write(image.getImage(), "png", os); Captcha result = client.decode(os.toByteArray()); captchaResult = result.toString(); } } logger.info("Decoded captcha: " + captchaResult); } // Grab any hidden fields Elements hidden = login.getElementsByAttributeValue("type", "hidden"); // Build the post response HttpMessage httpPost = new HttpMessage(HttpType.POST); httpPost.initializeDefaultHeaders(site); httpPost.addCookieHeaders(cookies); // TODO: Read this from the html! 
httpPost.setUrl(baseURL + site.getString("LOGIN_SUBMIT")); httpPost.appendContent(site.getString("USERNAME_FIELD"), site.getString("USERNAME")); httpPost.appendContent(site.getString("PASSWORD_FIELD"), site.getString("PASSWORD")); if (!captchaResult.equals("")) { httpPost.appendContent(site.getString("CAPTCHA_FIELD"), captchaResult); } for (int i = 0; i < hidden.size(); ++i) { httpPost.appendContent(hidden.get(i).attr("name"), hidden.get(i).attr("value")); } // Add the submit info Element submit = login.getElementsByAttributeValue("type", "submit").get(0); httpPost.appendContent(submit.attr("name"), submit.attr("value")); // Add the referrer httpPost.addHeader(new Header("Referer", baseURL + site.getString("LOGIN_URL"))); // Log in HttpRequest response = new HttpRequest(httpPost); headers = response.getHeaders(); // Add any relevant cookies getTempCookies(headers); logger.info("Successfully logged in, response code: " + response.getStatusCode()); // Were we redirected? If so, visit the redirection URL before continuing. if (response.getStatusCode() == 302) { // Send a GET request to the redirection URL before continuing. httpGet = new HttpMessage(HttpType.GET); httpGet.initializeDefaultHeaders(site); httpGet.addHeader(new Header("Referer", baseURL + site.getString("LOGIN_URL"))); String redirectionURL = getRedirectionURL(headers); httpGet.setUrl(redirectionURL); httpGet.addCookieHeaders(cookies); httpRequest = new HttpRequest(httpGet); logger.debug("Visited redirected page. Status code " + httpRequest.getStatusCode()); } } catch (ConnectionException | MalformedURLException | ProtocolException ex) { // Did not successfully log in logger.error(ex.getMessage()); return false; } catch (IOException ex) { // Did not successfully log in logger.error(ex.getMessage()); return false; } catch (Exception | InterruptedException ex) { // Did not successfully log in logger.error(ex.getMessage()); return false; } // Did we successfully log in? Then return true. return true; }
From source file:org.loklak.api.search.WeiboUserInfo.java
@Override protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { Query post = RemoteAccess.evaluate(request); // manage DoS if (post.isDoS_blackout()) { response.sendError(503, "your request frequency is too high"); return;// w w w .j a v a 2 s . com } String url = post.get("url", ""); JSONObject obj = new JSONObject(); Document doc = Jsoup.connect(url).get(); Elements infos; infos = doc.getElementsByAttributeValue("class", "li_1 clearfix"); if (infos != null) { Element info; String profile; for (int i = 0; i < infos.size(); i++) { info = infos.get(i); if (info.getElementsByAttributeValueContaining("href", "loc=infblog").size() == 0) { profile = info.getElementsByAttributeValue("class", "pt_detail").first().text().trim(); obj.put("pro", profile); switch (info.getElementsByAttributeValue("class", "pt_title S_txt2").first().text()) { case "Nickname": obj.put("username", profile); break; case "Location": obj.put("Address", profile); break; case "Gender": obj.put("Gender", profile); break; case "??": obj.put("Sexuality", profile.replace("t", "").replace("rn", "")); break; case "": obj.put("Relationship", profile.replace("t", "").replace("rn", "")); break; case "Birthday": obj.put("Birthday", profile); break; case "": obj.put("Blood", profile); break; case "Domain Name": if (info.getElementsByAttributeValueContaining("href", "loc=infdomain").size() != 0) profile = info.select("a").text(); obj.put("Personaldomain", profile); break; case "": obj.put("Profile", profile); break; case "Registration": obj.put("Registertime", profile.replace("t", "").replace("rn", "")); break; case "Email": obj.put("Email", profile); break; case "QQ": obj.put("Qq", profile); break; case "": obj.put("College", profile.replace("t", "").replace("rn", "")); break; case "Tags": obj.put("Tag", profile.replace("t", "").replace("rn", "")); break; } } else { String blogurl = info.select("a").text(); obj.put("Blog", blogurl); } } } //print 
JSON response.setCharacterEncoding("UTF-8"); PrintWriter sos = response.getWriter(); sos.print(obj.toString(2)); sos.println(); }
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMetadata(); }//w ww. j a v a 2 s .c om MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(MediaMetadata.IMDBID, imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>( executor); // worker for imdb request (/combined) (everytime from akas.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/combined"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); Future<Document> futureCombined = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary = null; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (options.isScrapeImdbForeignLanguage() || 
options.isScrapeCollectionInfo()) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureCombined.get(); /* * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // parse title and year Element title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title Elements elements = title.getElementsByTag("h1"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // year elements = title.getElementsByTag("span"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); // search year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); Matcher matcher = yearPattern.matcher(content); while (matcher.find()) { if (matcher.group(1) != null) { String movieYear = matcher.group(1); md.storeMetadata(MediaMetadata.YEAR, movieYear); break; } } } // original title elements = title.getElementsByAttributeValue("class", "title-extra"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); content = content.replaceAll("\\(original title\\)", "").trim(); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content); } } // poster Element poster = doc.getElementById("primary-poster"); if (poster != null) { String posterUrl = poster.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating 
count Element ratingElement = doc.getElementById("tn15rating"); if (ratingElement != null) { Elements elements = ratingElement.getElementsByClass("starbar-meta"); if (elements.size() > 0) { Element div = elements.get(0); // rating comes in <b> tag Elements b = div.getElementsByTag("b"); if (b.size() == 1) { String ratingAsString = b.text(); Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10"); Matcher matcher = ratingPattern.matcher(ratingAsString); while (matcher.find()) { if (matcher.group(1) != null) { float rating = 0; try { rating = Float.valueOf(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.RATING, rating); break; } } } // count Elements a = div.getElementsByAttributeValue("href", "ratings"); if (a.size() == 1) { String countAsString = a.text().replaceAll("[.,]|votes", "").trim(); int voteCount = 0; try { voteCount = Integer.parseInt(countAsString); } catch (Exception e) { } md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount); } } // top250 elements = ratingElement.getElementsByClass("starbar-special"); if (elements.size() > 0) { Elements a = elements.get(0).getElementsByTag("a"); if (a.size() > 0) { Element anchor = a.get(0); Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(anchor.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { int top250 = 0; try { top250 = Integer.parseInt(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.TOP_250, top250); } } } } } // parse all items coming by <div class="info"> Elements elements = doc.getElementsByClass("info"); for (Element element : elements) { // only parse divs if (!"div".equals(element.tag().getName())) { continue; } // elements with h5 are the titles of the values Elements h5 = element.getElementsByTag("h5"); if (h5.size() > 0) { Element firstH5 = h5.first(); String h5Title = firstH5.text(); // release date /* * <div class="info"><h5>Release Date:</h5><div 
class="info-content">5 January 1996 (USA)<a class="tn15more inline" * href="/title/tt0114746/releaseinfo" * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a> </div></div> */ if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element releaseDateElement = div.first(); String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", "")); Pattern pattern = Pattern.compile("(.*)\\(.*\\)"); Matcher matcher = pattern.matcher(releaseDate); if (matcher.find()) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy"); Date parsedDate = sdf.parse(matcher.group(1)); sdf = new SimpleDateFormat("dd-MM-yyyy"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate)); } catch (Exception e) { } } } } /* * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline" * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See * more</a> » </div></div> */ // tagline if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*") && !options.isScrapeImdbForeignLanguage()) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.storeMetadata(MediaMetadata.TAGLINE, tagline); } } /* * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" 
href="/title/tt0472033/keywords" onClick= * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a> » </div> */ // genres are only scraped from akas.imdb.com if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Elements a = div.first().getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/Sections/Genres/.*")) { md.addGenre(getTmmGenre(anchor.ownText())); } } } } // } /* * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div> */ // runtime // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String first = taglineElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.storeMetadata(MediaMetadata.RUNTIME, runtime); } } /* * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div> */ // country if (h5Title.matches("(?i)Country.*")) { Elements a = element.getElementsByTag("a"); String countries = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/country/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String country = matcher.group(1); 
if (StringUtils.isNotEmpty(countries)) { countries += ", "; } countries += country.toUpperCase(); } } md.storeMetadata(MediaMetadata.COUNTRY, countries); } /* * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div> */ // Spoken languages if (h5Title.matches("(?i)Language.*")) { Elements a = element.getElementsByTag("a"); String spokenLanguages = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/language/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String langu = matcher.group(1); if (StringUtils.isNotEmpty(spokenLanguages)) { spokenLanguages += ", "; } spokenLanguages += langu; } } md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages); } /* * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div> */ // certification // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { // certification for the right country if (anchor.attr("href").matches( "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) { Pattern certificationPattern = Pattern.compile(".*:(.*)"); Matcher matcher = certificationPattern.matcher(anchor.ownText()); Certification certification = null; while (matcher.find()) { if (matcher.group(1) != null) { certification = Certification.getCertification(options.getCountry(), 
matcher.group(1)); } } if (certification != null) { md.addCertification(certification); break; } } } } } /* * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick= * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div> */ // director if ("director-info".equals(element.id())) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/name/nm.*")) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } } /* * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick= * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src= * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick= * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... 
</td><td * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick= * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick= * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table> */ // cast elements = doc.getElementsByClass("cast"); if (elements.size() > 0) { Elements tr = elements.get(0).getElementsByTag("tr"); for (Element row : tr) { Elements td = row.getElementsByTag("td"); MediaCastMember cm = new MediaCastMember(); for (Element column : td) { // actor thumb if (column.hasClass("hs")) { Elements img = column.getElementsByTag("img"); if (img.size() > 0) { String thumbUrl = img.get(0).attr("src"); if (thumbUrl.contains("no_photo.png")) { cm.setImageUrl(""); } else { thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", ""); cm.setImageUrl(thumbUrl); } } } // actor name if (column.hasClass("nm")) { cm.setName(cleanString(column.text())); } // character if (column.hasClass("char")) { cm.setCharacter(cleanString(column.text())); } } if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(CastType.ACTOR); md.addCastMember(cm); } } } Element content = doc.getElementById("tn15content"); if (content != null) { elements = content.getElementsByTag("table"); for (Element table : elements) { // writers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) { Elements anchors = table.getElementsByTag("a"); for (Element anchor : anchors) { if (anchor.attr("href").matches("/name/nm.*")) { 
MediaCastMember cm = new MediaCastMember(CastType.WRITER); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } // producers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { continue; } Elements columns = row.children(); if (columns.size() == 0) { continue; } MediaCastMember cm = new MediaCastMember(CastType.PRODUCER); String name = cleanString(columns.get(0).text()); if (StringUtils.isBlank(name)) { continue; } cm.setName(name); if (columns.size() >= 3) { cm.setPart(cleanString(columns.get(2).text())); } md.addCastMember(cm); } } } } // Production companies elements = doc.getElementsByClass("blackcatheader"); for (Element blackcatheader : elements) { if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) { Elements a = blackcatheader.nextElementSibling().getElementsByTag("a"); StringBuilder productionCompanies = new StringBuilder(); for (Element anchor : a) { if (StringUtils.isNotEmpty(productionCompanies)) { productionCompanies.append(", "); } productionCompanies.append(anchor.ownText()); } md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString()); break; } } /* * plot from /plotsummary */ // build the url doc = null; doc = futurePlotsummary.get(); // imdb.com has another site structure if (imdbSite == ImdbSiteDefinition.IMDB_COM) { Elements zebraList = doc.getElementsByClass("zebraList"); if (zebraList != null && !zebraList.isEmpty()) { Elements odd = zebraList.get(0).getElementsByClass("odd"); if (odd.isEmpty()) { odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even } if (odd.size() > 0) { Elements p = odd.get(0).getElementsByTag("p"); if (p.size() > 0) { String plot = cleanString(p.get(0).ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); 
if (wiki != null) { String plot = cleanString(wiki.ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } } } // } // get data from tmdb? if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) { MediaMetadata tmdbMd = futureTmdb.get(); if (options.isScrapeImdbForeignLanguage() && tmdbMd != null && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) { // tmdbid md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID)); // title md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE)); // original title md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE)); // tagline md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE)); // plot md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT)); // collection info md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); } if (options.isScrapeCollectionInfo() && tmdbMd != null) { md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } return md; }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
private void parseCast(Elements el, MediaCastMember.CastType type, MediaMetadata md) { if (el != null && !el.isEmpty()) { Element castEl = null;/*from ww w.j a v a 2s .c om*/ for (Element element : el) { if (!element.tagName().equals("option")) { // we get more, just do not take the optionbox castEl = element; } } if (castEl == null) { LOGGER.debug("meh, no " + type.name() + " found"); return; } // walk up to table TR... while (!((castEl == null) || (castEl.tagName().equalsIgnoreCase("tr")))) { castEl = castEl.parent(); } // ... and take the next table row ^^ Element tr = castEl.nextElementSibling(); if (tr != null) { for (Element a : tr.getElementsByAttributeValue("valign", "middle")) { String act = a.toString(); String aname = StrgUtils.substr(act, "alt=\"(.*?)\""); if (!aname.isEmpty()) { MediaCastMember cm = new MediaCastMember(); cm.setName(aname); String id = StrgUtils.substr(act, "id=(.*?)[^\"]\">"); if (!id.isEmpty()) { cm.setId(id); // thumb // http://www.ofdb.de/thumbnail.php?cover=images%2Fperson%2F7%2F7689.jpg&size=6 // fullsize ;) http://www.ofdb.de/images/person/7/7689.jpg try { String imgurl = URLDecoder .decode(StrgUtils.substr(act, "images%2Fperson%2F(.*?)&size"), "UTF-8"); if (!imgurl.isEmpty()) { imgurl = BASE_URL + "/images/person/" + imgurl; } cm.setImageUrl(imgurl); } catch (Exception e) { } } String arole = StrgUtils.substr(act, "\\.\\.\\. (.*?)</font>").replaceAll("<[^>]*>", ""); cm.setCharacter(arole); cm.setType(type); md.addCastMember(cm); } } } } }
From source file:org.wikipedia.language.TranslationTests.java
/**
 * Records a mismatch when the given plural element has no child whose
 * {@code quantity} attribute equals {@code "other"}.
 *
 * @param lang the language label used as the prefix of the reported message
 * @param elem the plural element to inspect; its {@code name} attribute is reported
 */
private void checkPluralHasOther(String lang, Element elem) {
    boolean hasOther = !elem.getElementsByAttributeValue("quantity", "other").isEmpty();
    if (hasOther) {
        return;
    }

    final String report = lang + ":" + elem.attr("name") + " plural is missing 'other'";
    L.e(report);
    mismatches.append(report).append("\n");
}
From source file:wo.trade.SearchPageScraper.java
public List<TradeItem> parse() { List<TradeItem> tradeItems = new LinkedList<>(); Document doc = Jsoup.parse(page, "UTF-8"); Element content = doc.getElementById("content"); Elements items = null;/*from ww w . j a v a 2 s . com*/ if (content == null) { items = doc.getElementsByClass("item"); } else { items = content.getElementsByClass("item"); } for (Element element : items) { TradeItem item = new TradeItem(); item.id = element.attr("id"); item.id = StringUtils.remove(item.id, "item-container-"); item.seller = element.attr("data-seller"); item.thread = element.attr("data-thread"); item.sellerid = element.attr("data-sellerid"); item.buyout = element.attr("data-buyout"); item.ign = element.attr("data-ign"); item.league = element.attr("data-league"); item.name = element.attr("data-name"); item.corrupted = element.getElementsByClass("corrupted").size() > 0; item.identified = element.getElementsByClass("item-unid").size() == 0; // System.out.println(String.format("Now parsing item id %s name %s", item.id, item.name)); Element sockElem = element.getElementsByClass("sockets-raw").get(0); item.socketsRaw = sockElem.text(); Elements accntAgeElement = element.getElementsByAttributeValue("title", "account age and highest level"); if (accntAgeElement != null && !accntAgeElement.isEmpty()) { item.ageAndHighLvl = accntAgeElement.get(0).text(); } // ----- Requirements ----- // Element reqElem = element.getElementsByClass("requirements").get(0); List<TextNode> reqNodes = reqElem.textNodes(); for (TextNode reqNode : reqNodes) { // sample [ Level: 37 , Strength: 42 , Intelligence: 42 ] String req = StringUtils.trimToEmpty(reqNode.getWholeText()); req = req.replaceAll(regex_horizontal_whitespace, ""); req = Util.removeThoseDamnWhiteSpace(req); String separator = ":"; String reqType = trim(substringBefore(req, separator)); switch (reqType) { case "Level": item.reqLvl = trim(substringAfter(req, separator)); break; case "Strength": item.reqStr = trim(substringAfter(req, separator)); 
break; case "Intelligence": item.reqInt = trim(substringAfter(req, separator)); break; case "Dexterity": item.reqDex = trim(substringAfter(req, separator)); break; } } item.mapQuantity = element.getElementsByAttributeValue("data-name", "mapq").stream().findFirst() .map(n -> n.text()).map(s -> substringAfter(s, "Item quantity:")) .map(s -> StringUtils.removePattern(s, "[^\\d]")).orElse("") .replaceAll(regex_horizontal_whitespace, "").trim(); // ----- Rarity by checking the item name link class ----- // // itemframe0 - normal // itemframe1 - magic // itemframe2 - rare // itemframe3 - unique // itemframe4 - gems // itemframe5 - currency // itemframe6 - divination card String itemframeStr = element.getElementsByClass("title").stream().findFirst().map(n -> n.attr("class")) .orElse(null); itemframeStr = Util.regexMatch("itemframe(\\d)", itemframeStr, 1); if (itemframeStr != null) { int frame = Integer.parseInt(itemframeStr); item.rarity = Rarity.valueOf(frame); } else { item.rarity = Rarity.unknown; } // ----- Verify ----- // item.dataHash = element.getElementsByAttributeValue("onclick", "verify_modern(this)").stream() .findFirst().map(n -> n.attr("data-hash")).orElse("").trim(); // ----- Mods ----- // Elements itemModsElements = element.getElementsByClass("item-mods"); if (itemModsElements != null && itemModsElements.size() > 0) { Element itemMods = itemModsElements.get(0); if (itemMods.getElementsByClass("bullet-item").size() != 0) { Element bulletItem = itemMods.getElementsByClass("bullet-item").get(0); Elements ulMods = bulletItem.getElementsByTag("ul"); if (ulMods.size() == 2) { // implicit mod Elements implicitLIs = ulMods.get(0).getElementsByTag("li"); Element implicitLi = implicitLIs.last(); Mod impMod = new Mod(implicitLi.attr("data-name"), implicitLi.attr("data-value")); item.implicitMod = impMod; } int indexOfExplicitMods = ulMods.size() - 1; Elements modsLi = ulMods.get(indexOfExplicitMods).getElementsByTag("li"); for (Element modLi : modsLi) { // explicit 
mods Mod mod = new Mod(modLi.attr("data-name"), modLi.attr("data-value")); item.explicitMods.add(mod); } } } // ----- Properties ----- // // this is the third column data (the first col is the image, second is the mods, reqs) item.quality = element.getElementsByAttributeValue("data-name", "q").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgRangeAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pd").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmgRange = element.getElementsByAttributeValue("data-name", "ed").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.attackSpeed = element.getElementsByAttributeValue("data-name", "aps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.dmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_dps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.physDmgAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_pdps").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.eleDmg = element.getElementsByAttributeValue("data-name", "edps").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.armourAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_armour").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.evasionAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_evasion").get(0) .text().replaceAll(regex_horizontal_whitespace, "").trim(); item.energyShieldAtMaxQuality = element.getElementsByAttributeValue("data-name", "quality_shield") .get(0).text().replaceAll(regex_horizontal_whitespace, "").trim(); item.block = element.getElementsByAttributeValue("data-name", "block").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.crit = element.getElementsByAttributeValue("data-name", "crit").get(0).text() 
.replaceAll(regex_horizontal_whitespace, "").trim(); item.level = element.getElementsByAttributeValue("data-name", "level").get(0).text() .replaceAll(regex_horizontal_whitespace, "").trim(); item.imageUrl = element.getElementsByAttributeValue("alt", "Item icon").get(0).attr("src"); item.stackSize = asList(split(trimToEmpty(item.imageUrl), '&')).stream() .filter(t -> t.startsWith("stackSize=")).findFirst().map(s -> substringAfter(s, "=")) .orElse(null); Elements onlineSpans = element.getElementsMatchingText("online"); if (!onlineSpans.isEmpty()) { item.online = "Online"; } else { item.online = ""; } tradeItems.add(item); } // System.out.println("DONE --- Items"); return tradeItems; }