List of usage examples for org.jsoup.select.Elements.toString()
@Override
public String toString()
From source file:org.sbs.goodcrawler.extractor.selector.IntegerElementCssSelector.java
@Override public Integer getContent() throws ExtractException { Elements elements = null; try {//from www . ja v a2 s .c o m // content???document2+?? if (null != content && !newDoc) { return content; } if (null != document) { elements = super.document.select(value); if (elements.isEmpty()) return null; String temp; switch ($Attr) { case text: temp = CharMatcher.DIGIT.retainFrom(getExtractText(elements)); break; default: temp = CharMatcher.DIGIT.retainFrom(getExtractAttr(elements, attr)); break; } if (StringUtils.isNotBlank(temp)) { Integer integer = Integer.parseInt(temp); if (null != actions && actions.size() > 0) { for (IntegerSelectorAction action : actions) { this.content = action.doAction(integer); } } else { this.content = integer; } newDoc = false; return content; } } } catch (Exception e) { e.printStackTrace(); log.error(elements.toString()); throw new ExtractException("????:" + e.getMessage()); } return null; }
From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java
/** * Get movie meta data from aebn.net.//from w w w . ja v a 2s . c o m * */ @Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("AEBN: getMetadata() {}", options); // check if there is already meta data present in the result if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) { LOGGER.debug("AEBN: return metadata from cache"); return options.getResult().getMediaMetadata(); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); Elements elements = null; Element element = null; Integer aebnId = 0; // get AebnId from previous search result if ((options.getResult() != null) && (options.getResult().getId() != null)) { aebnId = Integer.parseInt(options.getResult().getId()); LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId); // preset some values from search result (if there is one) // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy". 
md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, StrgUtils.removeCommonSortableName(options.getResult().getTitle())); } // or get AebnId from options if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) { LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID)); aebnId = Integer.parseInt(options.getId(AEBNID)); } if (!isValidAebnId(aebnId)) { LOGGER.warn("AEBN: no or incorrect aebnId, aborting"); return md; } // ID md.setId(providerInfo.getId(), aebnId); LOGGER.debug("AEBN: aebnId({})", aebnId); // Base download url for data scraping String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId; String locale = options.getLanguage().name(); if (!StringUtils.isBlank(locale)) { downloadUrl = downloadUrl + "&locale=" + locale; LOGGER.debug("AEBN: used locale({})", locale); } // begin download and scrape try { LOGGER.debug("AEBN: download movie detail page"); Url url = new Url(downloadUrl); InputStream in = url.getInputStream(); Document document = Jsoup.parse(in, "UTF-8", ""); in.close(); // Title // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1> LOGGER.debug("AEBN: parse title"); elements = document.getElementsByAttributeValue("class", "md-movieTitle"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieTitle = cleanString(element.text()); LOGGER.debug("AEBN: title({})", movieTitle); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // Poster // front cover: // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg"; md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl); // Fanart/Background // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg // <img class="sceneThumbnail" alt="Scene Thumbnail" 
title="Scene Thumbnail" onError="..." // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." /> LOGGER.debug("AEBN: parse fanart / scene thumbs"); elements = document.getElementsByAttributeValue("class", "SceneThumbnail"); LOGGER.debug("AEBN: {} elements found", elements.size()); int i = 1; for (Element anchor : elements) { String backgroundUrl = anchor.attr("src"); LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl); md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl); i++; } // Runtime LOGGER.debug("AEBN: parse runtime"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieRuntime = cleanString(element.attr("content")); movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M"); LOGGER.debug("AEBN: runtime({})", movieRuntime); md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime); } // Year LOGGER.debug("AEBN: parse year"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieYear = cleanString(element.attr("content")); movieYear = StrgUtils.substr(movieYear, "(\\d+)-"); LOGGER.debug("AEBN: year({})", movieYear); md.storeMetadata(MediaMetadata.YEAR, movieYear); } // Series (Collection) LOGGER.debug("AEBN: parse collection"); elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieCollection = cleanString(element.text()); // Fake a TMDB_SET based on the hash value of the collection name int movieCollectionHash = movieCollection.hashCode(); 
md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection); md.storeMetadata(MediaMetadata.TMDB_SET, movieCollectionHash); LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash); } // Studio LOGGER.debug("AEBN: parse studio"); elements = document.getElementsByAttributeValue("id", "md-details") .select("[itemprop=productionCompany]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String movieStudio = cleanString(elements.first().text()); LOGGER.debug("AEBN: studio({})", movieStudio); md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio); } // Genre LOGGER.debug("AEBN: parse genre"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]"); for (Element g : elements) { md.addGenre(getTmmGenre(g.text())); } // add basic genre, since all genres at AEBN could be summarised // into this one md.addGenre(MediaGenres.EROTIC); // Certification // no data scrapeable---but obviously it's adult only, so simply // generate it String movieCertification = null; Certification certification = null; String country = options.getCountry().getAlpha2(); LOGGER.debug("AEBN: generate certification for {}", country); // @formatter:off if (country.equals("DE")) { movieCertification = "FSK 18"; } if (country.equals("US")) { movieCertification = "NC-17"; } if (country.equals("GB")) { movieCertification = "R18"; } if (country.equals("FR")) { movieCertification = "18"; } if (country.equals("ES")) { movieCertification = "PX"; } if (country.equals("JP")) { movieCertification = "R18+"; } if (country.equals("IT")) { movieCertification = "V.M.18"; } if (country.equals("NL")) { movieCertification = "16"; } // @formatter:on certification = Certification.getCertification(options.getCountry(), movieCertification); if (certification != null) { LOGGER.debug("AEBN: certification({})", certification); md.addCertification(certification); } // Plot and Tagline 
LOGGER.debug("AEBN: parse plot"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=about]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String moviePlot = cleanString(elements.first().text()); md.storeMetadata(MediaMetadata.PLOT, moviePlot); // no separate tagline available, so extract the first sentence // from the movie plot String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])"); LOGGER.debug("AEBN: tagline(" + movieTagline + ")"); md.storeMetadata(MediaMetadata.TAGLINE, movieTagline); } // Actors LOGGER.debug("AEBN: parse actors"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]"); LOGGER.debug("AEBN: {} actors found", elements.size()); for (Element anchor : elements) { String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)"); String actorname = cleanString(anchor.select("[itemprop=name]").first().text()); String actordetailsurl = BASE_DATAURL + anchor.attr("href"); if (!actorname.isEmpty()) { LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname, actordetailsurl); MediaCastMember cm = new MediaCastMember(); cm.setType(MediaCastMember.CastType.ACTOR); cm.setName(actorname); if (!actorid.isEmpty()) { cm.setId(actorid); } // Actor detail page try { Url starurl = new Url(actordetailsurl); InputStream starurlstream = starurl.getInputStream(); Document stardocument = Jsoup.parse(starurlstream, "UTF-8", ""); starurlstream.close(); Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo"); if (elements2.size() == 0) { LOGGER.debug("AEBN: no additional actor details found"); } else { // Actor image String actorimage = elements2.select("[itemprop=image]").first().attr("src"); LOGGER.debug("AEBN: actor image({})", actorimage); if (!actorimage.isEmpty()) { cm.setImageUrl(actorimage); } // Actor 'fanart' images // unsure if this is ever shown in tmm elements2 = 
stardocument.getElementsByAttributeValue("class", "StarDetailGallery") .select("a"); LOGGER.debug("AEBN: {} gallery images found", elements2.size()); for (Element thumbnail : elements2) { LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href")); cm.addFanart(thumbnail.attr("href")); } } } catch (Exception e) { LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e); } md.addCastMember(cm); } } // Director LOGGER.debug("AEBN: parse director"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)"); String directorname = cleanString(elements.select("[itemprop=name]").first().text()); if (!directorname.isEmpty()) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(directorname); if (!directorid.isEmpty()) { cm.setId(directorid); } cm.setImageUrl(""); md.addCastMember(cm); LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname); } } // Original Title // if we have no original title, just copy the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } } catch (Exception e) { LOGGER.error("AEBN: Error parsing {}: {}", options.getResult().getUrl(), e); } return md; }
From source file:poe.trade.assist.UniquesListSearchGenerator.java
/**imgurl, reqLvl, base, mod * @param args// w w w. j a v a 2 s .c o m * @throws Exception */ public static void main(String[] args) throws Exception { List<String> outputLines = new LinkedList<>(); outputLines.add( "Name Art Req.Level Base Mods TaslismanSC TalismanHC Standard Hardcore poewiki"); for (String list : lists) { HttpResponse<String> response = Unirest.get("http://pathofexile.gamepedia.com/" + list) .header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0") .asString(); Document doc = Jsoup.parse(response.getBody()); Elements elems = doc.select("table.wikitable.sortable"); for (Element table : elems) { Elements rows = table.select("tr"); int ctr = 0; boolean hasRequiredLevel = false; for (Element row : rows) { if (ctr == 0) { // first row is headers hasRequiredLevel = !row.select("abbr[title=\"Required Level\"]").isEmpty(); ctr++; continue; } String name = row.child(0).child(0).attr("title"); System.out.println("Now processing: " + name); String imgurl = "=IMAGE(\"" + row.select("img").attr("src") + "\", 3)"; String base = row.child(1).child(0).attr("title"); String reqLvl = hasRequiredLevel ? row.child(2).text() : "0"; reqLvl = reqLvl.equalsIgnoreCase("n/a") ? "0" : reqLvl; String mod = "=\""; Elements mods = row.select("span.itemboxstatsgroup.text-mod"); if (!mods.isEmpty()) { if (mods.size() > 2) throw new Exception("mods.size() is > 2. " + name + " - " + mods.toString()); boolean hasImplicit = mods.size() > 1; String imp = hasImplicit ? mods.get(0).text() : ""; int expIdx = hasImplicit ? 
1 : 0; String lineSeparator = "\"&CHAR(10)&\""; String exp = mods.get(expIdx).textNodes().stream().map(n -> n.text().trim()) .filter(s -> !s.isEmpty()).collect(Collectors.joining(lineSeparator)); String additionalExp = mods.get(expIdx).children().stream().filter(e -> e.hasText()) .map(e -> e.text().trim()).collect(Collectors.joining(lineSeparator)); if (additionalExp != null && !additionalExp.isEmpty()) exp += lineSeparator + additionalExp; mod += imp; if (hasImplicit) mod += (lineSeparator + "--------------" + lineSeparator); mod += exp; } mod += "\""; String standard = "Standard"; String hardcore = "Hardcore"; String tempsc = "Talisman"; String temphc = "Talisman+Hardcore"; String nameenc = URLEncoder.encode(name, "UTF-8"); String sc = hyperlink(getSearchURL(standard, nameenc)); String hc = hyperlink(getSearchURL(hardcore, nameenc)); String tsc = hyperlink(getSearchURL(tempsc, nameenc)); String thc = hyperlink(getSearchURL(temphc, nameenc)); String poewikiurl = hyperlink("http://pathofexile.gamepedia.com/" + (name.replace(' ', '_'))); String s = format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", name, imgurl, reqLvl, base, mod, tsc, thc, sc, hc, poewikiurl); outputLines.add(s); Thread.sleep(1000); } } } FileUtils.writeLines(new File("uniqueslist.txt"), outputLines); }
From source file:ru.dmitry.mamishev.URLParse.HtmlString.java
public GazInfo getInfoBill() { String html = this.htmlString; Document doc = Jsoup.parse(html); Elements ul = doc.getElementsByTag("ul"); String numBill = ""; numBill = ul.eq(1).text();//w ww . j a v a2s . co m Document bElements = Jsoup.parseBodyFragment(ul.toString()); Elements b = bElements.getElementsByTag("b"); GazInfo billInfo = null; String date = ""; String pay = ""; if (b.size() > 2) { String[] ss = SPLIT.split(b.get(2).text()); if (ss.length > 0) { date = ss[0]; pay = ss[1]; } billInfo = new GazInfo(b.get(0).text(), b.get(1).text(), date, pay, numBill); } else { billInfo = new GazInfo("", "", date, pay, ""); } return billInfo; }