List of usage examples for org.jsoup.nodes.Element#nextElementSibling()
public Element nextElementSibling()
From source file:de.geeksfactory.opacclient.apis.Zones.java
/**
 * Loads the account overview for the given account and collects the lent and reserved items.
 *
 * <p>The overview page returned by {@code login(acc)} is scanned for the links to the
 * "lent items" and "reservations" pages as well as for the pending fees and the expiry date
 * of the library card. The linked pages are then fetched and handed to
 * {@code loadMediaList} / {@code loadResList}.
 *
 * @param acc the account to query
 * @return the account data, or {@code null} if the login yields no document or no link to the
 *         lent items could be found
 * @throws IOException        if one of the HTTP requests fails
 * @throws JSONException      declared for interface compatibility
 * @throws OpacErrorException declared for interface compatibility
 */
@Override
public AccountData account(Account acc) throws IOException, JSONException, OpacErrorException {
    Document login = login(acc);
    if (login == null) {
        return null;
    }

    AccountData res = new AccountData(acc.getId());

    String lentLink = null;
    String resLink = null;
    for (Element td : login.select(".AccountSummaryCounterNameCell, .AccountSummaryCounterNameCellStripe, "
            + ".CAccountDetailFieldNameCellStripe, .CAccountDetailFieldNameCell")) {
        String section = td.text().trim();
        // The value belonging to a name cell lives in the following cell; it may be missing.
        Element valueCell = td.nextElementSibling();
        if (section.contains("Entliehene Medien")) {
            lentLink = td.select("a").attr("href");
        } else if (section.contains("Vormerkungen")) {
            resLink = td.select("a").attr("href");
        } else if (section.contains("Kontostand")) {
            if (valueCell != null) {
                res.setPendingFees(valueCell.text().trim());
            }
        } else if (section.matches("Ausweis g.ltig bis")) {
            if (valueCell != null) {
                res.setValidUntil(valueCell.text().trim());
            }
        }
    }

    // Menu links, when present, take precedence over the links found in the summary table.
    for (Element a : login.select("a.AccountMenuLink")) {
        if (a.text().contains("Ausleihen")) {
            lentLink = a.attr("href");
        } else if (a.text().contains("Vormerkungen")) {
            resLink = a.attr("href");
        }
    }

    if (lentLink == null) {
        return null;
    }

    List<LentItem> lentItems = new ArrayList<>();
    String lentUrl = opac_url + "/" + lentLink.replace("utf-8?Method", "utf-8&Method");
    String lentHtml = httpGet(lentUrl, getDefaultEncoding());
    Document lentDoc = Jsoup.parse(lentHtml);
    lentDoc.setBaseUri(lentUrl);
    loadMediaList(lentDoc, lentItems);
    res.setLent(lentItems);

    // In Koeln, the reservations link does not show on the overview page,
    // so fall back to the menu of the lent-items page.
    if (resLink == null) {
        for (Element a : lentDoc.select("a.AccountMenuLink")) {
            if (a.text().contains("Vormerkungen")) {
                resLink = a.attr("href");
            }
        }
    }

    List<ReservedItem> reservedItems = new ArrayList<>();
    // Without a reservations link there is nothing to fetch; requesting ".../null" would be wrong.
    if (resLink != null) {
        String resHtml = httpGet(opac_url + "/" + resLink, getDefaultEncoding());
        Document resDoc = Jsoup.parse(resHtml);
        loadResList(resDoc, reservedItems);
    }
    res.setReservations(reservedItems);

    return res;
}
From source file:me.vertretungsplan.parser.UntisCommonParser.java
void parseDay(SubstitutionScheduleDay day, Element next, SubstitutionSchedule v, String klasse) throws JSONException, CredentialInvalidException { if (next.className().equals("subst") || next.select(".list").size() > 0 || next.text().contains("Vertretungen sind nicht freigegeben") || next.text().contains("Keine Vertretungen")) { //Vertretungstabelle if (next.text().contains("Vertretungen sind nicht freigegeben")) { return; }/* ww w. jav a 2 s . co m*/ parseSubstitutionScheduleTable(next, scheduleData.getData(), day, klasse); } else { //Nachrichten parseMessages(next, day); next = next.nextElementSibling().nextElementSibling(); parseSubstitutionScheduleTable(next, scheduleData.getData(), day, klasse); } v.addDay(day); }
From source file:me.vertretungsplan.parser.ESchoolParser.java
private void parseTable(Element table, SubstitutionScheduleDay day) { for (Element th : table.select("th[colspan=10]")) { String lesson;/*from w w w . j a v a 2s.co m*/ Pattern pattern = Pattern.compile("(\\d+)\\. Stunde"); Matcher matcher = pattern.matcher(th.text()); if (matcher.find()) { lesson = matcher.group(1); } else { lesson = th.text(); } // skip over table headers Element row = th.parent().nextElementSibling().nextElementSibling(); while (row != null && row.select("th").size() == 0) { Substitution subst = new Substitution(); subst.setLesson(lesson); Elements columns = row.select("td"); String[] classes = columns.get(0).text().split(", |\\+"); subst.setClasses(new HashSet<>(Arrays.asList(classes))); subst.setPreviousTeacher(getPreviousValue(columns.get(1))); subst.setTeacher(getNewValue(columns.get(1))); subst.setPreviousSubject(getPreviousValue(columns.get(2))); subst.setSubject(getNewValue(columns.get(2))); subst.setPreviousRoom(getPreviousValue(columns.get(3))); subst.setRoom(getNewValue(columns.get(3))); if (columns.get(4).text().isEmpty()) { subst.setType("Vertretung"); subst.setColor(colorProvider.getColor("Vertretung")); } else { String desc = columns.get(4).text(); subst.setDesc(desc); String recognizedType = recognizeType(desc); if (recognizedType == null) recognizedType = "Vertretung"; subst.setType(recognizedType); subst.setColor(colorProvider.getColor(recognizedType)); } day.addSubstitution(subst); row = row.nextElementSibling(); } } }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/** * ??:/* w ww. j a v a2s . c o m*/ * 1. ??? * 2. ??????? * 3. ?? * 4. ?? ?? * 5. ? * * @return * @throws XpathSyntaxErrorException */ private String getAuthor() throws XpathSyntaxErrorException { String author = ""; if (StringUtils.isBlank(srcTime)) { author = getAuthor(doc.body().html()); return author; } Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first(); if (cur == null) { LOG.warn("?srcTime=" + srcTime); author = getAuthor(doc.body().html()); return author; } if (!noText(cur)) { String arr[] = cur.html().split(srcTime); for (String text : arr) { author = getShortText(text); if (!StringUtils.isBlank(author)) return author; } } Element parent = cur.parent(); while (parent != null && noText(parent)) { cur = parent; parent = parent.parent(); } author = getAuthor(parent.html()); if (!StringUtils.isBlank(author)) return author; Element pre = cur.previousElementSibling(); while (pre != null && noText(pre)) { pre = pre.previousElementSibling(); } if (pre != null) { author = getShortText(pre.text()); } if (!StringUtils.isBlank(author)) return author; Element next = cur.nextElementSibling(); while (next != null && noText(next)) { next = next.nextElementSibling(); } if (next != null) { author = getShortText(next.text()); } if (!StringUtils.isBlank(author)) return author; author = getShortText(parent.html().replace(srcTime, " ")); if (!StringUtils.isBlank(author)) return author; author = getAuthor(doc.body().html()); if (StringUtils.isBlank(author)) { return author_bak; } return author; }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
@Override public List<SearchField> getSearchFields() throws IOException { List<SearchField> fields = new ArrayList<>(); HttpGet httpget;//from w w w . ja va2 s . c o m if (opacDir.contains("opax")) { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel.html.S"); } else { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel_main.S"); } HttpResponse response = http_client.execute(httpget); if (response.getStatusLine().getStatusCode() == 500) { throw new NotReachableException(response.getStatusLine().getReasonPhrase()); } String html = convertStreamToString(response.getEntity().getContent()); HttpUtils.consume(response.getEntity()); Document doc = Jsoup.parse(html); // get text fields Elements text_opts = doc.select("form select[name=REG1] option"); for (Element opt : text_opts) { TextSearchField field = new TextSearchField(); field.setId(opt.attr("value")); field.setDisplayName(opt.text()); field.setHint(""); fields.add(field); } // get media types Elements mt_opts = doc.select("form input[name~=(MT|MS)]"); if (mt_opts.size() > 0) { DropdownSearchField mtDropdown = new DropdownSearchField(); mtDropdown.setId(mt_opts.get(0).attr("name")); mtDropdown.setDisplayName("Medientyp"); for (Element opt : mt_opts) { if (!opt.val().equals("")) { String text = opt.text(); if (text.length() == 0) { // text is empty, check layouts: // Essen: <input name="MT"><img title="mediatype"> // Schaffenb: <input name="MT"><img alt="mediatype"> Element img = opt.nextElementSibling(); if (img != null && img.tagName().equals("img")) { text = img.attr("title"); if (text.equals("")) { text = img.attr("alt"); } } } if (text.length() == 0) { // text is still empty, check table layout, Example // Friedrichshafen // <td><input name="MT"></td> <td><img // title="mediatype"></td> Element td1 = opt.parent(); Element td2 = td1.nextElementSibling(); if (td2 != null) { Elements td2Children = td2.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if 
(text.length() == 0) { // text is still empty, check images in label layout, Example // Wiedenst // <input type="radio" name="MT" id="MTYP1" value="MTYP1"> // <label for="MTYP1"><img src="http://www.wiedenest.de/bib/image/books // .png" alt="Bcher" title="Bcher"></label> Element label = opt.nextElementSibling(); if (label != null) { Elements td2Children = label.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if (text.length() == 0) { // text is still empty: missing end tag like Offenburg text = parse_option_regex(opt); } mtDropdown.addDropdownValue(opt.val(), text); } } fields.add(mtDropdown); } // get branches Elements br_opts = doc.select("form select[name=ZW] option"); if (br_opts.size() > 0) { DropdownSearchField brDropdown = new DropdownSearchField(); brDropdown.setId(br_opts.get(0).parent().attr("name")); brDropdown.setDisplayName(br_opts.get(0).parent().parent().previousElementSibling().text() .replace("\u00a0", "").replace("?", "").trim()); for (Element opt : br_opts) { brDropdown.addDropdownValue(opt.val(), opt.text()); } fields.add(brDropdown); } return fields; }
From source file:net.niyonkuru.koodroid.html.SubscribersHandler.java
/**
 * Builds the content provider operations for all subscribers listed in the account selector.
 *
 * <p>Starting at the first {@code li} with a nested {@code div} inside {@code div#banSelector},
 * every following element sibling is read as one subscriber. An entry without an
 * {@code onClick} attribute is stored as the default subscriber in the settings.
 *
 * @param doc      the parsed page
 * @param resolver used to look up existing subscribers and to store the default subscriber
 * @return one insert or update operation per subscriber
 * @throws HandlerException if an entry has no 10-digit subscriber id, if a switcher url cannot
 *                          be extracted from {@code onClick}, or if no subscriber was found
 */
@Override
public ArrayList<ContentProviderOperation> parse(Document doc, ContentResolver resolver)
        throws HandlerException {
    final ArrayList<ContentProviderOperation> batch = new ArrayList<ContentProviderOperation>();

    Element subscriberLi = doc.select("div#banSelector li:has(div)").first();
    while (subscriberLi != null) {
        String text = subscriberLi.text();

        /* this assumes the name and phone number are separated by a space */
        int separator = text.lastIndexOf(' ') + 1;

        String subscriberId = text.substring(separator).replaceAll("\\D", "");
        if (subscriberId.length() != 10) {
            throw new HandlerException(getString(R.string.parser_error_unexpected_input));
        }

        final ContentProviderOperation.Builder builder;
        final Uri subscriberUri = Subscribers.buildSubscriberUri(subscriberId);
        if (subscriberExists(subscriberUri, resolver)) {
            builder = ContentProviderOperation.newUpdate(subscriberUri);
            builder.withValue(Subscribers.UPDATED, System.currentTimeMillis());
        } else {
            builder = ContentProviderOperation.newInsert(Subscribers.CONTENT_URI);
        }
        builder.withValue(Subscribers.SUBSCRIBER_ID, subscriberId);

        StringBuilder fullName = new StringBuilder();
        for (String name : text.substring(0, separator).split("\\s")) {
            fullName.append(ParserUtils.capitalize(name)).append(' ');
        }
        builder.withValue(Subscribers.SUBSCRIBER_FULL_NAME, fullName.toString().trim());

        if (subscriberLi.hasAttr("onClick")) {
            String switchUrl = subscriberLi.attr("onClick");

            /* extract only the url: from the first '/' up to the last single quote */
            int urlStart = switchUrl.indexOf('/');
            int urlEnd = switchUrl.lastIndexOf('\'');
            if (urlStart < 0 || urlEnd < urlStart) {
                // report malformed markup like any other unexpected input instead of
                // failing with a StringIndexOutOfBoundsException
                throw new HandlerException(getString(R.string.parser_error_unexpected_input));
            }
            builder.withValue(Subscribers.SUBSCRIBER_SWITCHER, switchUrl.substring(urlStart, urlEnd));
        } else {
            /* this is the default subscriber as it doesn't have a switcher url */
            ContentValues cv = new ContentValues(1);
            cv.put(Settings.SUBSCRIBER, subscriberId);

            resolver.insert(Settings.CONTENT_URI, cv);
        }
        builder.withValue(Subscribers.SUBSCRIBER_EMAIL, mParent);

        batch.add(builder.build());

        subscriberLi = subscriberLi.nextElementSibling();
    }

    if (batch.size() == 0) {
        throw new HandlerException(getString(R.string.parser_error_unexpected_input));
    }

    JSONObject metadata = new JSONObject();
    try {
        metadata.put("subscribers", batch.size());
        metadata.put("language", getString(R.string.locale));
    } catch (JSONException ignored) {
        // the metadata is only attached to crash reports; parsing must not fail because of it
    }
    Crittercism.setMetadata(metadata);
    Crittercism.setUsername(mParent);

    return batch;
}
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMetadata(); }//w w w. j a v a2 s . c o m MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(MediaMetadata.IMDBID, imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>( executor); // worker for imdb request (/combined) (everytime from akas.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/combined"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); Future<Document> futureCombined = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary = null; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (options.isScrapeImdbForeignLanguage() || 
options.isScrapeCollectionInfo()) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureCombined.get(); /* * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // parse title and year Element title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title Elements elements = title.getElementsByTag("h1"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // year elements = title.getElementsByTag("span"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); // search year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); Matcher matcher = yearPattern.matcher(content); while (matcher.find()) { if (matcher.group(1) != null) { String movieYear = matcher.group(1); md.storeMetadata(MediaMetadata.YEAR, movieYear); break; } } } // original title elements = title.getElementsByAttributeValue("class", "title-extra"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); content = content.replaceAll("\\(original title\\)", "").trim(); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content); } } // poster Element poster = doc.getElementById("primary-poster"); if (poster != null) { String posterUrl = poster.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating 
count Element ratingElement = doc.getElementById("tn15rating"); if (ratingElement != null) { Elements elements = ratingElement.getElementsByClass("starbar-meta"); if (elements.size() > 0) { Element div = elements.get(0); // rating comes in <b> tag Elements b = div.getElementsByTag("b"); if (b.size() == 1) { String ratingAsString = b.text(); Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10"); Matcher matcher = ratingPattern.matcher(ratingAsString); while (matcher.find()) { if (matcher.group(1) != null) { float rating = 0; try { rating = Float.valueOf(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.RATING, rating); break; } } } // count Elements a = div.getElementsByAttributeValue("href", "ratings"); if (a.size() == 1) { String countAsString = a.text().replaceAll("[.,]|votes", "").trim(); int voteCount = 0; try { voteCount = Integer.parseInt(countAsString); } catch (Exception e) { } md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount); } } // top250 elements = ratingElement.getElementsByClass("starbar-special"); if (elements.size() > 0) { Elements a = elements.get(0).getElementsByTag("a"); if (a.size() > 0) { Element anchor = a.get(0); Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(anchor.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { int top250 = 0; try { top250 = Integer.parseInt(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.TOP_250, top250); } } } } } // parse all items coming by <div class="info"> Elements elements = doc.getElementsByClass("info"); for (Element element : elements) { // only parse divs if (!"div".equals(element.tag().getName())) { continue; } // elements with h5 are the titles of the values Elements h5 = element.getElementsByTag("h5"); if (h5.size() > 0) { Element firstH5 = h5.first(); String h5Title = firstH5.text(); // release date /* * <div class="info"><h5>Release Date:</h5><div 
class="info-content">5 January 1996 (USA)<a class="tn15more inline" * href="/title/tt0114746/releaseinfo" * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a> </div></div> */ if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element releaseDateElement = div.first(); String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", "")); Pattern pattern = Pattern.compile("(.*)\\(.*\\)"); Matcher matcher = pattern.matcher(releaseDate); if (matcher.find()) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy"); Date parsedDate = sdf.parse(matcher.group(1)); sdf = new SimpleDateFormat("dd-MM-yyyy"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate)); } catch (Exception e) { } } } } /* * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline" * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See * more</a> » </div></div> */ // tagline if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*") && !options.isScrapeImdbForeignLanguage()) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.storeMetadata(MediaMetadata.TAGLINE, tagline); } } /* * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" 
href="/title/tt0472033/keywords" onClick= * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a> » </div> */ // genres are only scraped from akas.imdb.com if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Elements a = div.first().getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/Sections/Genres/.*")) { md.addGenre(getTmmGenre(anchor.ownText())); } } } } // } /* * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div> */ // runtime // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String first = taglineElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.storeMetadata(MediaMetadata.RUNTIME, runtime); } } /* * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div> */ // country if (h5Title.matches("(?i)Country.*")) { Elements a = element.getElementsByTag("a"); String countries = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/country/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String country = matcher.group(1); 
if (StringUtils.isNotEmpty(countries)) { countries += ", "; } countries += country.toUpperCase(); } } md.storeMetadata(MediaMetadata.COUNTRY, countries); } /* * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div> */ // Spoken languages if (h5Title.matches("(?i)Language.*")) { Elements a = element.getElementsByTag("a"); String spokenLanguages = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/language/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String langu = matcher.group(1); if (StringUtils.isNotEmpty(spokenLanguages)) { spokenLanguages += ", "; } spokenLanguages += langu; } } md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages); } /* * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div> */ // certification // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { // certification for the right country if (anchor.attr("href").matches( "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) { Pattern certificationPattern = Pattern.compile(".*:(.*)"); Matcher matcher = certificationPattern.matcher(anchor.ownText()); Certification certification = null; while (matcher.find()) { if (matcher.group(1) != null) { certification = Certification.getCertification(options.getCountry(), 
matcher.group(1)); } } if (certification != null) { md.addCertification(certification); break; } } } } } /* * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick= * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div> */ // director if ("director-info".equals(element.id())) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/name/nm.*")) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } } /* * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick= * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src= * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick= * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... 
</td><td * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick= * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick= * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table> */ // cast elements = doc.getElementsByClass("cast"); if (elements.size() > 0) { Elements tr = elements.get(0).getElementsByTag("tr"); for (Element row : tr) { Elements td = row.getElementsByTag("td"); MediaCastMember cm = new MediaCastMember(); for (Element column : td) { // actor thumb if (column.hasClass("hs")) { Elements img = column.getElementsByTag("img"); if (img.size() > 0) { String thumbUrl = img.get(0).attr("src"); if (thumbUrl.contains("no_photo.png")) { cm.setImageUrl(""); } else { thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", ""); cm.setImageUrl(thumbUrl); } } } // actor name if (column.hasClass("nm")) { cm.setName(cleanString(column.text())); } // character if (column.hasClass("char")) { cm.setCharacter(cleanString(column.text())); } } if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(CastType.ACTOR); md.addCastMember(cm); } } } Element content = doc.getElementById("tn15content"); if (content != null) { elements = content.getElementsByTag("table"); for (Element table : elements) { // writers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) { Elements anchors = table.getElementsByTag("a"); for (Element anchor : anchors) { if (anchor.attr("href").matches("/name/nm.*")) { 
MediaCastMember cm = new MediaCastMember(CastType.WRITER); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } // producers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { continue; } Elements columns = row.children(); if (columns.size() == 0) { continue; } MediaCastMember cm = new MediaCastMember(CastType.PRODUCER); String name = cleanString(columns.get(0).text()); if (StringUtils.isBlank(name)) { continue; } cm.setName(name); if (columns.size() >= 3) { cm.setPart(cleanString(columns.get(2).text())); } md.addCastMember(cm); } } } } // Production companies elements = doc.getElementsByClass("blackcatheader"); for (Element blackcatheader : elements) { if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) { Elements a = blackcatheader.nextElementSibling().getElementsByTag("a"); StringBuilder productionCompanies = new StringBuilder(); for (Element anchor : a) { if (StringUtils.isNotEmpty(productionCompanies)) { productionCompanies.append(", "); } productionCompanies.append(anchor.ownText()); } md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString()); break; } } /* * plot from /plotsummary */ // build the url doc = null; doc = futurePlotsummary.get(); // imdb.com has another site structure if (imdbSite == ImdbSiteDefinition.IMDB_COM) { Elements zebraList = doc.getElementsByClass("zebraList"); if (zebraList != null && !zebraList.isEmpty()) { Elements odd = zebraList.get(0).getElementsByClass("odd"); if (odd.isEmpty()) { odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even } if (odd.size() > 0) { Elements p = odd.get(0).getElementsByTag("p"); if (p.size() > 0) { String plot = cleanString(p.get(0).ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); 
if (wiki != null) { String plot = cleanString(wiki.ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } } } // } // get data from tmdb? if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) { MediaMetadata tmdbMd = futureTmdb.get(); if (options.isScrapeImdbForeignLanguage() && tmdbMd != null && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) { // tmdbid md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID)); // title md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE)); // original title md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE)); // tagline md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE)); // plot md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT)); // collection info md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); } if (options.isScrapeCollectionInfo() && tmdbMd != null) { md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) { /*/*w ww . j av a 2s . c om*/ * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // title Element title = doc.getElementsByAttributeValue("name", "title").first(); if (title != null) { String movieTitle = cleanString(title.attr("content")); int yearStart = movieTitle.lastIndexOf("("); if (yearStart > 0) { movieTitle = movieTitle.substring(0, yearStart - 1).trim(); md.setTitle(movieTitle); } } // original title and year Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first(); if (originalTitleYear != null) { String content = originalTitleYear.attr("content"); int startOfYear = content.lastIndexOf("("); if (startOfYear > 0) { // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page... 
// String originalTitle = content.substring(0, startOfYear - 1).trim(); // md.setOriginalTitle(originalTitle); String yearText = content.substring(startOfYear); // search year Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}"); Matcher matcher = yearPattern.matcher(yearText); while (matcher.find()) { if (matcher.group(0) != null) { String movieYear = matcher.group(0); try { md.setYear(Integer.parseInt(movieYear)); break; } catch (Exception ignored) { } } } } } // poster Element poster = doc.getElementsByAttributeValue("property", "og:image").first(); if (poster != null) { String posterUrl = poster.attr("content"); int fileStart = posterUrl.lastIndexOf("/"); if (fileStart > 0) { int parameterStart = posterUrl.indexOf("_", fileStart); if (parameterStart > 0) { int startOfExtension = posterUrl.lastIndexOf("."); if (startOfExtension > parameterStart) { posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension); } } } processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating count Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first(); if (ratingElement != null) { String ratingAsString = ratingElement.ownText().replace(",", "."); try { md.setRating(Float.valueOf(ratingAsString)); } catch (Exception ignored) { } Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first(); if (votesElement != null) { String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim(); try { md.setVoteCount(Integer.parseInt(countAsString)); } catch (Exception ignored) { } } } // top250 Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first(); if (topRatedElement != null) { Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(topRatedElement.ownText()); 
while (matcher.find()) { if (matcher.group(1) != null) { try { String top250Text = matcher.group(1); md.setTop250(Integer.parseInt(top250Text)); } catch (Exception ignored) { } } } } // releasedate Element releaseDateElement = doc .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo") .first(); if (releaseDateElement != null) { String releaseDateText = releaseDateElement.ownText(); int startOfCountry = releaseDateText.indexOf("("); if (startOfCountry > 0) { releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim(); } try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); Date parsedDate = sdf.parse(releaseDateText); md.setReleaseDate(parsedDate); } catch (ParseException ignored) { } } } Elements elements = doc.getElementsByClass("ipl-zebra-list__label"); for (Element element : elements) { // only parse tds if (!"td".equals(element.tag().getName())) { continue; } String elementText = element.ownText(); if (elementText.equals("Taglines")) { if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) { Element taglineElement = element.nextElementSibling(); if (taglineElement != null) { String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.setTagline(tagline); } } } if (elementText.equals("Genres")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/"); for (Element genreElement : genreElements) { String genreText = genreElement.ownText(); md.addGenre(getTmmGenre(genreText)); } } } /* * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) * | 178 min 
(extended cut)</div></div> */ if (elementText.equals("Runtime")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first(); if (runtimeElement != null) { String first = runtimeElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.setRuntime(runtime); } } } if (elementText.equals("Country")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/"); Pattern pattern = Pattern.compile("/country/(.*)"); for (Element countryElement : countryElements) { Matcher matcher = pattern.matcher(countryElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { md.addCountry(LanguageUtils.getLocalizedCountryForLanguage( options.getLanguage().getLanguage(), countryElement.text(), matcher.group(1))); } else { md.addCountry(matcher.group(1)); } } } } } if (elementText.equals("Language")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { Elements languageElements = nextElement.getElementsByAttributeValueStarting("href", "/language/"); Pattern pattern = Pattern.compile("/language/(.*)"); for (Element languageElement : languageElements) { Matcher matcher = pattern.matcher(languageElement.attr("href")); if (matcher.matches()) { if (ImdbMetadataProvider.providerInfo.getConfig() .getValueAsBool("scrapeLanguageNames")) { 
md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString( options.getLanguage(), languageElement.text(), matcher.group(1))); } else { md.addSpokenLanguage(matcher.group(1)); } } } } } if (elementText.equals("Certification")) { Element nextElement = element.nextElementSibling(); if (nextElement != null) { String languageCode = options.getCountry().getAlpha2(); Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=" + languageCode); boolean done = false; for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); done = true; break; } } if (!done && languageCode.equals("DE")) { certificationElements = nextElement.getElementsByAttributeValueStarting("href", "/search/title?certificates=XWG"); for (Element certificationElement : certificationElements) { String certText = certificationElement.ownText(); int startOfCert = certText.indexOf(":"); if (startOfCert > 0 && certText.length() > startOfCert + 1) { certText = certText.substring(startOfCert + 1); } Certification certification = Certification.getCertification(options.getCountry(), certText); if (certification != null) { md.addCertification(certification); break; } } } } } } // director Element directorsElement = doc.getElementById("directors"); while (directorsElement != null && directorsElement.tag().getName() != "header") { directorsElement = directorsElement.parent(); } if (directorsElement != null) { directorsElement = directorsElement.nextElementSibling(); } if (directorsElement != null) { for (Element directorElement : directorsElement.getElementsByClass("name")) { String 
director = directorElement.text().trim(); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR); cm.setName(director); md.addCastMember(cm); } } // actors Element castTableElement = doc.getElementsByClass("cast_list").first(); if (castTableElement != null) { Elements tr = castTableElement.getElementsByTag("tr"); for (Element row : tr) { MediaCastMember cm = parseCastMember(row); if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(cm); } } } // writers Element writersElement = doc.getElementById("writers"); while (writersElement != null && writersElement.tag().getName() != "header") { writersElement = writersElement.parent(); } if (writersElement != null) { writersElement = writersElement.nextElementSibling(); } if (writersElement != null) { Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element writerElement : writersElements) { String writer = cleanString(writerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER); cm.setName(writer); md.addCastMember(cm); } } // producers Element producersElement = doc.getElementById("producers"); while (producersElement != null && producersElement.tag().getName() != "header") { producersElement = producersElement.parent(); } if (producersElement != null) { producersElement = producersElement.nextElementSibling(); } if (producersElement != null) { Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/"); for (Element producerElement : producersElements) { String producer = cleanString(producerElement.ownText()); MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER); cm.setName(producer); md.addCastMember(cm); } } // producers Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title"); Element prodCompHeaderElement = null; for 
(Element possibleProdCompHeaderEl : prodCompHeaderElements) { if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) { prodCompHeaderElement = possibleProdCompHeaderEl; break; } } while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") { prodCompHeaderElement = prodCompHeaderElement.parent(); } if (prodCompHeaderElement != null) { prodCompHeaderElement = prodCompHeaderElement.nextElementSibling(); } if (prodCompHeaderElement != null) { Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href", "/company/"); for (Element prodCompElement : prodCompElements) { String prodComp = prodCompElement.ownText(); md.addProductionCompany(prodComp); } } return md; }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
private void parseCast(Elements el, MediaCastMember.CastType type, MediaMetadata md) { if (el != null && !el.isEmpty()) { Element castEl = null; for (Element element : el) { if (!element.tagName().equals("option")) { // we get more, just do not take the optionbox castEl = element;/* w ww.j a v a 2s.co m*/ } } if (castEl == null) { LOGGER.debug("meh, no " + type.name() + " found"); return; } // walk up to table TR... while (!((castEl == null) || (castEl.tagName().equalsIgnoreCase("tr")))) { castEl = castEl.parent(); } // ... and take the next table row ^^ Element tr = castEl.nextElementSibling(); if (tr != null) { for (Element a : tr.getElementsByAttributeValue("valign", "middle")) { String act = a.toString(); String aname = StrgUtils.substr(act, "alt=\"(.*?)\""); if (!aname.isEmpty()) { MediaCastMember cm = new MediaCastMember(); cm.setName(aname); String id = StrgUtils.substr(act, "id=(.*?)[^\"]\">"); if (!id.isEmpty()) { cm.setId(id); // thumb // http://www.ofdb.de/thumbnail.php?cover=images%2Fperson%2F7%2F7689.jpg&size=6 // fullsize ;) http://www.ofdb.de/images/person/7/7689.jpg try { String imgurl = URLDecoder .decode(StrgUtils.substr(act, "images%2Fperson%2F(.*?)&size"), "UTF-8"); if (!imgurl.isEmpty()) { imgurl = BASE_URL + "/images/person/" + imgurl; } cm.setImageUrl(imgurl); } catch (Exception e) { } } String arole = StrgUtils.substr(act, "\\.\\.\\. (.*?)</font>").replaceAll("<[^>]*>", ""); cm.setCharacter(arole); cm.setType(type); md.addCastMember(cm); } } } } }
From source file:solarrecorder.SolarRecorder.java
/**
 * Fetches {@code http://envoy/production} and adds the "Currently" and "Today" rows of the table following the first heading to
 * {@code envoyData}.
 *
 * <p>Only rows with exactly two cells are considered; the first cell is used as name, the second one as value.
 *
 * @throws IOException
 *           if the page cannot be fetched, or if it does not contain a {@code h1} followed by an element with a {@code tbody}
 */
private void getProdData() throws IOException {
    org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy/production").get();

    // first() and nextElementSibling() return null when nothing matches; report that instead of failing with a NullPointerException
    Element h1 = doc.getElementsByTag("h1").first();
    if (h1 == null) {
        throw new IOException("Unexpected page layout: no <h1> found");
    }
    Element table = h1.nextElementSibling();
    if (table == null) {
        throw new IOException("Unexpected page layout: no element after the first <h1>");
    }
    Element tbody = table.getElementsByTag("tbody").first();
    if (tbody == null) {
        throw new IOException("Unexpected page layout: no <tbody> after the first <h1>");
    }

    for (Element tr : tbody.getElementsByTag("tr")) {
        Elements alltd = tr.getElementsByTag("td");
        if (alltd.size() != 2) {
            continue;
        }

        String name = alltd.first().text();
        String value = alltd.last().text();
        switch (name) {
            case "Currently":
            case "Today":
                envoyData.add(new EnvoyData(name, value));
                break;
            default:
                // all other rows are not recorded
                break;
        }
    }
}