List of usage examples for org.jsoup.nodes.Element.toString()
public String toString()
From source file:org.codeexample.anchorlinks.CVAnchorContentIndexingFilter.java
public Map<String, String> parseAnchors(Document rootDoc) throws IOException { Map<String, String> anchorContents = new LinkedHashMap<String, String>(); Element rootElement = rootDoc; if (regexBodyRoot != null) { rootElement = rootDoc.select(regexBodyRoot).first(); }//from www .j a v a 2 s .c om if (rootElement == null) return anchorContents; Set<String> anchors = getAnchors(rootElement); if (anchors.isEmpty()) return anchorContents; StringBuilder remaining = new StringBuilder(rootElement.toString()); Iterator<String> it = anchors.iterator(); String current = it.next(); while (it.hasNext() && remaining.length() > 0) { String next = it.next(); anchorContents.put(current, getContentBetweenAnchorInWiki(remaining, current, next)); current = next; } // last one String lastTxt = Jsoup.parse(remaining.toString()).text(); if (StringUtils.isNotBlank(lastTxt)) { anchorContents.put(current, lastTxt); } return anchorContents; }
From source file:org.keionline.keionline.ArticleView.java
private String getContent(String url) throws IOException { Document doc = Jsoup.connect(url).userAgent("Mozilla").get(); Element data = doc.getElementsByClass("node").first();// get the third content div, Elements select = data.select("img"); // Change the links to absolute!! so that images work for (Element e : select) { e.attr("src", e.absUrl("src")); }//from w w w .jav a2 s. co m select = data.select("a"); for (Element e : select) { e.attr("href", e.absUrl("href")); } Element info = data.getElementsByClass("submitted").first(); info.after("<hr>"); String cont = data.toString(); cont = CSS + cont + "</body>"; content = cont; return cont; }
From source file:org.mar9000.space2latex.WikiPage.java
public static void downloadWikiPageImages(WikiPage page) throws MalformedURLException { String pageUrl = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); Document document = Jsoup.parseBodyFragment(page.storage); document.outputSettings().prettyPrint(false); Elements images = document.select("ac|image"); if (images.size() > 0) LOGGER.info(" Download images:"); for (Element element : images) { String downloadURL = null; String imageKey = null;/*from w ww. ja v a 2 s . c o m*/ // Attachment? Elements refs = element.select("ri|attachment"); WikiImage image = new WikiImage(); image.pageId = page.id; image.acImage = element.outerHtml(); // if (refs.size() > 0) { // Attachment. Element riAttachment = refs.get(0); imageKey = riAttachment.attr("ri:filename"); Elements riPages = riAttachment.select("ri|page"); // Thumbnails are not found with "child/attachment" URL schema. boolean isThumbnail = "true".equals(element.attr("ac:thumbnail")); String queryURL = null; if (!isThumbnail) { queryURL = pageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey); } else { // For thumbnail we construct directly the downloadURL without queryURL. /* Some pages have thumbnail images for better online reading. * Here we download always the attached file to embed readable imagesinto the pdf. downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/thumbnails/" + page.id + "/" + URLEncoder.encode(imageKey); */ downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/attachments/" + page.id + "/" + URLEncoder.encode(imageKey); } if (riPages.size() > 0) { // The attachment is related with another page. 
Element riPage = riPages.get(0); String space = riPage.attr("ri:space-key"); String contentTitle = riPage.attr("ri:content-title").replaceAll(" ", "%20"); String self = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); String newQueryURL = self.substring(0, self.lastIndexOf('/')) + "?title=" + contentTitle + "&spaceKey=" + space; JSONObject jsonNewQuery = ConfluenceRESTUtils.getURLResponse(newQueryURL); if (jsonNewQuery.getInt(JSON_SIZE_ATTR) == 0) throw new RuntimeException( "Page \"" + contentTitle + "\" in space " + space + " not found."); JSONObject jsonNewPage = (JSONObject) jsonNewQuery.getJSONArray(JSON_RESULTS_ATTR).get(0); image.pageId = jsonNewPage.getString(JSON_ID_ATTR); // Overwrite queryURL. String newPageUrl = jsonNewPage.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); queryURL = newPageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey); } if (!isThumbnail) downloadURL = getAttachmentDownloadURL(queryURL); } else { refs = element.select("ri|url"); if (refs.size() > 0) { // URL. downloadURL = refs.get(0).attr("ri:value"); URL tempURL = new URL(downloadURL); String urlPath = tempURL.getPath(); imageKey = urlPath.substring(urlPath.lastIndexOf('/') + 1); } else { throw new RuntimeException("Image format unknown: " + element.toString()); } } // Download the image data. image.filename = imageKey.replace(' ', '_'); // Space are not handled by LaTeX. if (downloadURL != null) { LOGGER.info(" about to download image {}/{}", new Object[] { image.pageId, image.filename }); image.data = IOUtils.getImageFromURL(downloadURL); } else { LOGGER.info(" NULL download URL for page/image: {}/{}", new Object[] { image.pageId, image.filename }); } page.images.put(imageKey, image); } }
From source file:org.opens.rules.doc.utils.exportdomtocsv.ExportDomToCsv.java
/**
 * Exports the tests of the reference document to a text file.
 *
 * <p>Before using it please set the FOLDER variable with the path where you
 * want to create your csv file.
 *
 * <p>Walks every {@code div.thematique} from the third one on, every
 * {@code h3} of it from the second one on, and every element whose id starts
 * with {@code test-} in the sibling following that {@code h3}. One line is
 * written per test. NOTE(review): the fields are concatenated without any
 * separator - confirm whether a delimiter was intended.
 *
 * @param args unused
 * @throws IOException if the output file cannot be written
 */
public static void main(String[] args) throws IOException {
    File ref = FileUtils.getFile(FOLDER);
    JsoupFunc jsf = new JsoupFunc();
    Document doc = jsf.getDocument();
    Elements thematiques = doc.select("div.thematique");
    StringBuilder sb = new StringBuilder();
    // Leading "1.2.3 : " style numbering of a test; compiled once, not per test.
    Pattern digitPattern = Pattern.compile("\\d+\\.\\d+\\.\\d+\\s?\\:?\\s?");
    // Deliberately declared outside the loops: a test whose text has no
    // numbering keeps the label of the previous test.
    String testLabel = "";
    for (int i = 2; i < thematiques.size(); i++) {
        String themeIndex = String.valueOf(i - 1);
        String theme = thematiques.get(i).child(0).text();
        Elements criteres = thematiques.get(i).select("h3");
        for (int j = 1; j < criteres.size(); j++) {
            Element critereLevel = criteres.get(j);
            String critereH3String = critereLevel.toString();
            // The level is whatever sits between the first '[' and the first ']'.
            String level = critereH3String.substring(critereH3String.indexOf("[") + 1,
                    critereH3String.indexOf("]"));
            Elements tests = critereLevel.nextElementSibling().select("[id^=test-]");
            String critere = boundedSuffix(critereLevel.id(), 8, 10);
            String[] critereArray = critereLevel.text().split("] ");
            String critereLabel = critereArray[1];
            for (Element el : tests) {
                Matcher matcher = digitPattern.matcher(el.text());
                if (matcher.find()) {
                    String testLabelReplace = el.html()
                            .replace("index.php", "http://www.accessiweb.org/index.php").replace("\n", "");
                    // NOTE(review): the offset comes from text() but is applied to html().
                    testLabel = testLabelReplace.substring(matcher.end());
                }
                String testCode = boundedSuffix(el.id(), 10, 12);
                sb.append(themeIndex).append(theme).append(critere).append(critereLabel)
                        .append(testCode).append(testLabel).append(level).append("\n");
            }
        }
    }
    FileUtils.writeStringToFile(ref, sb.toString());
}

/**
 * Returns {@code id.substring(5, end)} where {@code end} is the id length
 * clamped to {@code [minEnd, maxEnd]}. Replaces the former chain of nested
 * StringIndexOutOfBoundsException handlers with the same results, including
 * the exception thrown when the id is shorter than {@code minEnd}.
 */
private static String boundedSuffix(String id, int minEnd, int maxEnd) {
    int end = Math.max(minEnd, Math.min(id.length(), maxEnd));
    return id.substring(5, end);
}
From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java
/** * Search for movies at aebn.net./*from www . ja v a2 s .co m*/ * */ @Override public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception { LOGGER.debug("AEBN: search() {}", query); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); Elements movies = null; String searchString = ""; // Search for query if (StringUtils.isNotEmpty(query.get(MediaSearchOptions.SearchParam.QUERY))) { searchString = query.get(MediaSearchOptions.SearchParam.QUERY); } // Search String searchUrl = BASE_DATAURL + "/dispatcher/fts?userQuery=" + URLEncoder.encode(cleanSearchQuery(searchString), "UTF-8") + "&targetSearchMode=basic&isAdvancedSearch=true&isFlushAdvancedSearchCriteria=false" + "&count=" + SEARCH_COUNT.toString() + "&imageType=Large&sortType=Relevance"; try { LOGGER.info("========= BEGIN AEBN Scraper Search for: {}", searchString); Url url = new Url(searchUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); // only look for movie links like // <a id="FTSMovieSearch_link_title_detail_30" ... 
</a> movies = doc.getElementsByAttributeValueMatching("id", "FTSMovieSearch_link_title_detail_\\d+"); LOGGER.debug("AEBN: found {} search results", movies.size()); } catch (Exception e) { LOGGER.error("AEBN: failed to search for {}: ", searchString, e); } if (movies == null || movies.isEmpty()) { LOGGER.debug("AEBN: no movie found"); return resultList; } // there are search results, so fill media data structure HashSet<String> foundResultUrls = new HashSet<String>(); for (Element anchor : movies) { try { String movieUrl = BASE_DATAURL + StrgUtils.substr(anchor.toString(), "href=\\\"(.*?)\\\""); String movieId = StrgUtils.substr(anchor.toString(), "movieId=(\\d+)"); String movieName = StringEscapeUtils.unescapeHtml4(anchor.text()); String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + movieId + "_160w.jpg"; LOGGER.debug("AEBN: found movie {} (id{})", movieName, movieId); // check if it is a valid AEBN id if (!isValidAebnId(Integer.parseInt(movieId))) { LOGGER.error("AEBN: id({}) is not a valid aebn id", movieId); } MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); sr.setId(movieId); sr.setIMDBId(""); sr.setTitle(movieName); sr.setOriginalTitle(movieName); // sr.setYear not possible, no data at this point sr.setYear(null); sr.setMediaType(MediaType.MOVIE); sr.setUrl(movieUrl); sr.setPosterUrl(posterUrl); // compare score based on names float score = MetadataUtil.calculateScore(searchString, movieName); if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) { LOGGER.debug("AEBN: no poster - downgrading score by 0.01"); score = score - 0.01f; } sr.setScore(score); // check if result has at least a title and id if (StringUtils.isBlank(sr.getTitle()) || StringUtils.isBlank(sr.getId())) { LOGGER.warn("AEBN: no title nor id, skipping"); continue; } // check if the movie has been already added to the search results if (foundResultUrls.contains(sr.getUrl())) { continue; } foundResultUrls.add(sr.getUrl()); // populate extra arguments 
(deprecated) // MetadataUtil.copySearchQueryToSearchResult(query, sr); resultList.add(sr); } catch (Exception e) { LOGGER.warn("AEBN: error parsing search result: {}", e); } } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }
From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java
/** * Get movie meta data from aebn.net.//from w ww .ja v a2s. co m * */ @Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("AEBN: getMetadata() {}", options); // check if there is already meta data present in the result if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) { LOGGER.debug("AEBN: return metadata from cache"); return options.getResult().getMediaMetadata(); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); Elements elements = null; Element element = null; Integer aebnId = 0; // get AebnId from previous search result if ((options.getResult() != null) && (options.getResult().getId() != null)) { aebnId = Integer.parseInt(options.getResult().getId()); LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId); // preset some values from search result (if there is one) // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy". md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, StrgUtils.removeCommonSortableName(options.getResult().getTitle())); } // or get AebnId from options if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) { LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID)); aebnId = Integer.parseInt(options.getId(AEBNID)); } if (!isValidAebnId(aebnId)) { LOGGER.warn("AEBN: no or incorrect aebnId, aborting"); return md; } // ID md.setId(providerInfo.getId(), aebnId); LOGGER.debug("AEBN: aebnId({})", aebnId); // Base download url for data scraping String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId; String locale = options.getLanguage().name(); if (!StringUtils.isBlank(locale)) { downloadUrl = downloadUrl + "&locale=" + locale; LOGGER.debug("AEBN: used locale({})", locale); } // begin download and scrape try { LOGGER.debug("AEBN: 
download movie detail page"); Url url = new Url(downloadUrl); InputStream in = url.getInputStream(); Document document = Jsoup.parse(in, "UTF-8", ""); in.close(); // Title // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1> LOGGER.debug("AEBN: parse title"); elements = document.getElementsByAttributeValue("class", "md-movieTitle"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieTitle = cleanString(element.text()); LOGGER.debug("AEBN: title({})", movieTitle); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // Poster // front cover: // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg"; md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl); // Fanart/Background // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg // <img class="sceneThumbnail" alt="Scene Thumbnail" title="Scene Thumbnail" onError="..." // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." 
/> LOGGER.debug("AEBN: parse fanart / scene thumbs"); elements = document.getElementsByAttributeValue("class", "SceneThumbnail"); LOGGER.debug("AEBN: {} elements found", elements.size()); int i = 1; for (Element anchor : elements) { String backgroundUrl = anchor.attr("src"); LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl); md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl); i++; } // Runtime LOGGER.debug("AEBN: parse runtime"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieRuntime = cleanString(element.attr("content")); movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M"); LOGGER.debug("AEBN: runtime({})", movieRuntime); md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime); } // Year LOGGER.debug("AEBN: parse year"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]"); if (elements.size() > 0) { LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)"); element = elements.first(); String movieYear = cleanString(element.attr("content")); movieYear = StrgUtils.substr(movieYear, "(\\d+)-"); LOGGER.debug("AEBN: year({})", movieYear); md.storeMetadata(MediaMetadata.YEAR, movieYear); } // Series (Collection) LOGGER.debug("AEBN: parse collection"); elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); element = elements.first(); String movieCollection = cleanString(element.text()); // Fake a TMDB_SET based on the hash value of the collection name int movieCollectionHash = movieCollection.hashCode(); md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection); md.storeMetadata(MediaMetadata.TMDB_SET, 
movieCollectionHash); LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash); } // Studio LOGGER.debug("AEBN: parse studio"); elements = document.getElementsByAttributeValue("id", "md-details") .select("[itemprop=productionCompany]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String movieStudio = cleanString(elements.first().text()); LOGGER.debug("AEBN: studio({})", movieStudio); md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio); } // Genre LOGGER.debug("AEBN: parse genre"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]"); for (Element g : elements) { md.addGenre(getTmmGenre(g.text())); } // add basic genre, since all genres at AEBN could be summarised // into this one md.addGenre(MediaGenres.EROTIC); // Certification // no data scrapeable---but obviously it's adult only, so simply // generate it String movieCertification = null; Certification certification = null; String country = options.getCountry().getAlpha2(); LOGGER.debug("AEBN: generate certification for {}", country); // @formatter:off if (country.equals("DE")) { movieCertification = "FSK 18"; } if (country.equals("US")) { movieCertification = "NC-17"; } if (country.equals("GB")) { movieCertification = "R18"; } if (country.equals("FR")) { movieCertification = "18"; } if (country.equals("ES")) { movieCertification = "PX"; } if (country.equals("JP")) { movieCertification = "R18+"; } if (country.equals("IT")) { movieCertification = "V.M.18"; } if (country.equals("NL")) { movieCertification = "16"; } // @formatter:on certification = Certification.getCertification(options.getCountry(), movieCertification); if (certification != null) { LOGGER.debug("AEBN: certification({})", certification); md.addCertification(certification); } // Plot and Tagline LOGGER.debug("AEBN: parse plot"); elements = document.getElementsByAttributeValue("id", 
"md-details").select("[itemprop=about]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String moviePlot = cleanString(elements.first().text()); md.storeMetadata(MediaMetadata.PLOT, moviePlot); // no separate tagline available, so extract the first sentence // from the movie plot String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])"); LOGGER.debug("AEBN: tagline(" + movieTagline + ")"); md.storeMetadata(MediaMetadata.TAGLINE, movieTagline); } // Actors LOGGER.debug("AEBN: parse actors"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]"); LOGGER.debug("AEBN: {} actors found", elements.size()); for (Element anchor : elements) { String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)"); String actorname = cleanString(anchor.select("[itemprop=name]").first().text()); String actordetailsurl = BASE_DATAURL + anchor.attr("href"); if (!actorname.isEmpty()) { LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname, actordetailsurl); MediaCastMember cm = new MediaCastMember(); cm.setType(MediaCastMember.CastType.ACTOR); cm.setName(actorname); if (!actorid.isEmpty()) { cm.setId(actorid); } // Actor detail page try { Url starurl = new Url(actordetailsurl); InputStream starurlstream = starurl.getInputStream(); Document stardocument = Jsoup.parse(starurlstream, "UTF-8", ""); starurlstream.close(); Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo"); if (elements2.size() == 0) { LOGGER.debug("AEBN: no additional actor details found"); } else { // Actor image String actorimage = elements2.select("[itemprop=image]").first().attr("src"); LOGGER.debug("AEBN: actor image({})", actorimage); if (!actorimage.isEmpty()) { cm.setImageUrl(actorimage); } // Actor 'fanart' images // unsure if this is ever shown in tmm elements2 = stardocument.getElementsByAttributeValue("class", "StarDetailGallery") .select("a"); 
LOGGER.debug("AEBN: {} gallery images found", elements2.size()); for (Element thumbnail : elements2) { LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href")); cm.addFanart(thumbnail.attr("href")); } } } catch (Exception e) { LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e); } md.addCastMember(cm); } } // Director LOGGER.debug("AEBN: parse director"); elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]"); if (elements.size() > 0) { LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size()); String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)"); String directorname = cleanString(elements.select("[itemprop=name]").first().text()); if (!directorname.isEmpty()) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(directorname); if (!directorid.isEmpty()) { cm.setId(directorid); } cm.setImageUrl(""); md.addCastMember(cm); LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname); } } // Original Title // if we have no original title, just copy the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } } catch (Exception e) { LOGGER.error("AEBN: Error parsing {}: {}", options.getResult().getUrl(), e); } return md; }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
private void parseCast(Elements el, MediaCastMember.CastType type, MediaMetadata md) { if (el != null && !el.isEmpty()) { Element castEl = null;/* w ww. j a va2 s.c o m*/ for (Element element : el) { if (!element.tagName().equals("option")) { // we get more, just do not take the optionbox castEl = element; } } if (castEl == null) { LOGGER.debug("meh, no " + type.name() + " found"); return; } // walk up to table TR... while (!((castEl == null) || (castEl.tagName().equalsIgnoreCase("tr")))) { castEl = castEl.parent(); } // ... and take the next table row ^^ Element tr = castEl.nextElementSibling(); if (tr != null) { for (Element a : tr.getElementsByAttributeValue("valign", "middle")) { String act = a.toString(); String aname = StrgUtils.substr(act, "alt=\"(.*?)\""); if (!aname.isEmpty()) { MediaCastMember cm = new MediaCastMember(); cm.setName(aname); String id = StrgUtils.substr(act, "id=(.*?)[^\"]\">"); if (!id.isEmpty()) { cm.setId(id); // thumb // http://www.ofdb.de/thumbnail.php?cover=images%2Fperson%2F7%2F7689.jpg&size=6 // fullsize ;) http://www.ofdb.de/images/person/7/7689.jpg try { String imgurl = URLDecoder .decode(StrgUtils.substr(act, "images%2Fperson%2F(.*?)&size"), "UTF-8"); if (!imgurl.isEmpty()) { imgurl = BASE_URL + "/images/person/" + imgurl; } cm.setImageUrl(imgurl); } catch (Exception e) { } } String arole = StrgUtils.substr(act, "\\.\\.\\. (.*?)</font>").replaceAll("<[^>]*>", ""); cm.setCharacter(arole); cm.setType(type); md.addCastMember(cm); } } } } }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); if (options.getMediaType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getMediaType()); }/* w w w .j a v a 2 s . c o m*/ List<MediaSearchResult> resultList = new ArrayList<>(); String searchString = ""; String searchQuery = ""; String imdb = ""; Elements filme = null; int myear = options.getYear(); /* * Kat = All | Titel | Person | DTitel | OTitel | Regie | Darsteller | Song | Rolle | EAN| IMDb | Google * http://www.ofdb.de//view.php?page=suchergebnis &Kat=xxxxxxxxx&SText=yyyyyyyyyyy */ // 1. search with imdbId if (StringUtils.isNotEmpty(options.getImdbId()) && (filme == null || filme.isEmpty())) { try { imdb = options.getImdbId(); searchString = BASE_URL + "/view.php?page=suchergebnis&Kat=IMDb&SText=" + imdb; LOGGER.debug("search with imdbId: " + imdb); Url url = new Url(searchString); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); // only look for movie links filme = doc.getElementsByAttributeValueMatching("href", "film\\/\\d+,"); LOGGER.debug("found " + filme.size() + " search results"); } catch (Exception e) { LOGGER.error("failed to search for imdb Id " + imdb + ": " + e.getMessage()); } } // 2. 
search for search string if (StringUtils.isNotEmpty(options.getQuery()) && (filme == null || filme.isEmpty())) { try { String query = options.getQuery(); searchQuery = query; query = MetadataUtil.removeNonSearchCharacters(query); searchString = BASE_URL + "/view.php?page=suchergebnis&Kat=All&SText=" + URLEncoder.encode(cleanSearch(query), "UTF-8"); LOGGER.debug("search for everything: " + query); Url url = new Url(searchString); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); // only look for movie links filme = doc.getElementsByAttributeValueMatching("href", "film\\/\\d+,"); LOGGER.debug("found " + filme.size() + " search results"); } catch (Exception e) { LOGGER.error("failed to search for " + searchQuery + ": " + e.getMessage()); } } if (filme == null || filme.isEmpty()) { LOGGER.debug("nothing found :("); return resultList; } // <a href="film/22523,Die-Bourne-Identitt" // onmouseover="Tip('<img src="images/film/22/22523.jpg" // width="120" height="170">',SHADOW,true)">Bourne // Identitt, Die<font size="1"> / Bourne Identity, The</font> (2002)</a> HashSet<String> foundResultUrls = new HashSet<>(); for (Element a : filme) { try { MediaSearchResult sr = new MediaSearchResult(providerInfo.getId(), MediaType.MOVIE); if (StringUtils.isNotEmpty(imdb)) { sr.setIMDBId(imdb); } sr.setId(StrgUtils.substr(a.toString(), "film\\/(\\d+),")); // OFDB ID sr.setTitle(StringEscapeUtils.unescapeHtml4(StrgUtils .removeCommonSortableName(StrgUtils.substr(a.toString(), ".*>(.*?)(\\[.*?\\])?<font")))); LOGGER.debug("found movie " + sr.getTitle()); sr.setOriginalTitle(StringEscapeUtils.unescapeHtml4( StrgUtils.removeCommonSortableName(StrgUtils.substr(a.toString(), ".*> / (.*?)</font")))); try { sr.setYear(Integer.parseInt(StrgUtils.substr(a.toString(), "font> \\((.*?)\\)<\\/a"))); } catch (Exception ignored) { } sr.setUrl(BASE_URL + "/" + StrgUtils.substr(a.toString(), "href=\\\"(.*?)\\\"")); sr.setPosterUrl(BASE_URL + "/images" + 
StrgUtils.substr(a.toString(), "images(.*?)\\"")); // check if it has at least a title and url if (StringUtils.isBlank(sr.getTitle()) || StringUtils.isBlank(sr.getUrl())) { continue; } // OFDB could provide linke twice - check if that has been already added if (foundResultUrls.contains(sr.getUrl())) { continue; } foundResultUrls.add(sr.getUrl()); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchQuery, sr.getTitle()); if (yearDiffers(myear, sr.getYear())) { float diff = (float) Math.abs(myear - sr.getYear()) / 100; LOGGER.debug( "parsed year does not match search result year - downgrading score by " + diff); score -= diff; } sr.setScore(score); } resultList.add(sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // we can only work further if we got a search result on zelluloid.de if (options.getResult() == null) { throw new Exception("Scrape with Zelluloid.de without prior search is not supported"); }// ww w. jav a2 s .c om MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; // preset values from searchresult (if we have them) md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, Utils.removeSortableName(options.getResult().getOriginalTitle())); md.storeMetadata(MediaMetadata.TITLE, Utils.removeSortableName(options.getResult().getTitle())); md.storeMetadata(MediaMetadata.YEAR, options.getResult().getYear()); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, options.getResult().getOriginalTitle()); String id = ""; if (StringUtils.isEmpty(options.getResult().getId())) { id = StrgUtils.substr(options.getResult().getUrl(), "id=(.*?)"); } else { id = options.getResult().getId(); } String detailurl = options.getResult().getUrl(); if (StringUtils.isEmpty(detailurl)) { detailurl = BASE_URL + "/filme/index.php3?id=" + id; } Url url; try { LOGGER.debug("get details page"); url = new CachedUrl(detailurl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); // parse plot String plot = doc.getElementsByAttributeValue("class", "bigtext").text(); md.storeMetadata(MediaMetadata.PLOT, plot); md.storeMetadata(MediaMetadata.TAGLINE, plot.length() > 150 ? 
plot.substring(0, 150) : plot); // parse poster el = doc.getElementsByAttributeValueStarting("src", "/images/poster"); if (el.size() == 1) { md.storeMetadata(MediaMetadata.POSTER_URL, BASE_URL + el.get(0).attr("src")); } // parse year if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.YEAR))) { el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { md.storeMetadata(MediaMetadata.YEAR, el.get(0).text()); } } // parse cinema release el = doc.getElementsByAttributeValueContaining("href", "?v=w"); if (el.size() > 0) { try { SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy"); Date d = sdf.parse(el.get(0).text()); sdf = new SimpleDateFormat("yyyy-MM-dd"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(d)); } catch (Exception e) { LOGGER.warn("cannot parse cinema release date: " + el.get(0).text()); } } // parse original title if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<")); } if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } // parse runtime String rt = (StrgUtils.substr(doc.toString(), "ca. (.*?) min")); if (!rt.isEmpty()) { try { md.storeMetadata(MediaMetadata.RUNTIME, Integer.valueOf(rt)); } catch (Exception e2) { LOGGER.warn("cannot convert runtime: " + rt); } } // parse genres el = doc.getElementsByAttributeValueContaining("href", "az.php3?g="); for (Element g : el) { String gid = g.attr("href").substring(g.attr("href").lastIndexOf('=') + 1); md.addGenre(getTmmGenre(gid)); } // parse cert // FSK: ab 12, $230 Mio. 
Budget String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]"); if (!fsk.isEmpty()) { md.addCertification(Certification.findCertification(fsk)); } // parse rating Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable"); if (ratings.size() == 2) { // get user rating Element e = ratings.get(1); // <div>87%</div> String r = e.getElementsByTag("div").text().replace("%", ""); try { md.storeMetadata(MediaMetadata.RATING, Double.valueOf(r) / 10); // only 0-10 } catch (Exception e2) { LOGGER.warn("cannot convert rating: " + r); } } // details page doc = null; String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id; try { url = new CachedUrl(detailsUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get details: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(detailsUrl); } if (doc != null) { Element tab = doc.getElementById("ccdetails"); int header = 0; String lastRole = ""; for (Element tr : tab.getElementsByTag("tr")) { if (tr.toString().contains("dyngfx")) { // header gfx if (tr.toString().contains("Besetzung")) { header = 1; } else if (tr.toString().contains("Crew")) { header = 2; } else if (tr.toString().contains("Produktion")) { header = 3; } else if (tr.toString().contains("Verleih")) { header = 4; } else if (tr.toString().contains("Alternativtitel")) { header = 5; } continue; } else { // no header gfx, so data MediaCastMember mcm = new MediaCastMember(); el = tr.getElementsByTag("td"); if (header == 1) { // actors if (el.size() == 2) { mcm.setCharacter(el.get(0).text()); mcm.setName(el.get(1).getElementsByTag("a").text()); mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); mcm.setType(MediaCastMember.CastType.ACTOR); // System.out.println("Cast: " + mcm.getCharacter() + " - " + // mcm.getName()); md.addCastMember(mcm); // TODO: parse actor detail pages :/ } } else if (header == 2) { // crew 
if (el.size() == 2) { String crewrole = el.get(0).html().trim(); mcm.setName(el.get(1).getElementsByTag("a").text()); if (crewrole.equals(" ")) { mcm.setPart(lastRole); } else { mcm.setPart(crewrole); lastRole = crewrole; } if (crewrole.equals("Regie")) { mcm.setType(MediaCastMember.CastType.DIRECTOR); } else if (crewrole.equals("Drehbuch")) { mcm.setType(MediaCastMember.CastType.WRITER); } else { mcm.setType(MediaCastMember.CastType.OTHER); } mcm.setId(StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); // System.out.println("Crew: " + mcm.getPart() + " - " + // mcm.getName()); md.addCastMember(mcm); } } else if (header == 3) { // production md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, el.get(0).text()); } } } } // get links page doc = null; String linksUrl = BASE_URL + "/filme/links.php3?id=" + id; try { url = new CachedUrl(linksUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get links page: " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(linksUrl); } if (doc != null) { el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com"); if (el != null && el.size() > 0) { String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})"); if (imdb.isEmpty()) { imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)"); } md.setId(MediaMetadata.IMDBID, imdb); } } } catch (Exception e) { LOGGER.error("Error parsing " + options.getResult().getUrl()); // clear cache CachedUrl.removeCachedFileForUrl(detailurl); throw e; } return md; }
From source file:SeedGenerator.MainForm.java
private void calculateEndpointTagsTF() { try {/*from ww w.j a v a 2 s . c om*/ try { PreparedStatement createtablepstmt = con.prepareStatement( "CREATE TABLE `recommender_endpoints_tf` (\n" + " `id` int(11) NOT NULL AUTO_INCREMENT,\n" + " `word` varchar(45) DEFAULT NULL,\n" + " `endpointid` int(11) DEFAULT NULL,\n" + " `queryid` int(11) DEFAULT NULL,\n" + " `count` int(11) DEFAULT NULL,\n" + " `idf` int(11) DEFAULT NULL,\n" + " `idfid` int(11) DEFAULT NULL,\n" + " `totalNumberofWords` int(11) DEFAULT NULL,\n" + " PRIMARY KEY (`id`),\n" + " KEY `word` (`word`),\n" + " KEY `idf` (`idfid`),\n" + " KEY `endpointid` (`endpointid`)\n" + ") ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;"); createtablepstmt.execute(); } catch (Exception ex) { } PreparedStatement pstmtendpoints = con.prepareStatement( "SELECT * from crawler.endpoints where sourceCodeHTML is not null and source != 'spendold' and source != 'spendnew' group by domain ORDER BY id asc;"); ResultSet rs1 = pstmtendpoints.executeQuery(); String htmlsource = ""; HashMap<String, Integer> wordCount = new HashMap(); while (rs1.next()) { htmlsource = rs1.getString("sourceCodeHTML"); HashMap<String, Integer> localwordCount = new HashMap(); org.jsoup.nodes.Document doc = Jsoup.parse(htmlsource);//.connect("http://en.wikipedia.org/").get(); //Elements newsHeadlines = doc.select("#mp-itn b a"); Elements links = doc.getElementsByTag("a"); Elements labels = doc.getElementsByTag("Label"); Elements spans = doc.getElementsByTag("span"); Elements titles = doc.getElementsByTag("title"); Elements meta = doc.getElementsByTag("meta"); Elements h2 = doc.getElementsByTag("h2"); Elements h1 = doc.getElementsByTag("h1"); Elements h3 = doc.getElementsByTag("h3"); Elements li = doc.getElementsByTag("li"); Elements dt = doc.getElementsByTag("dt"); Elements p = doc.getElementsByTag("p"); Elements option = doc.getElementsByTag("option"); links.addAll(labels); links.addAll(spans); links.addAll(titles); links.addAll(meta); 
links.addAll(h2); links.addAll(h1); links.addAll(h3); links.addAll(li); links.addAll(dt); links.addAll(p); links.addAll(option); for (Element link : links) { String word = link.toString(); if (wordCount.containsKey(word) && !localwordCount.containsKey(word)) { wordCount.replace(word, wordCount.get(word) + 1); } else if (!wordCount.containsKey(word)) { wordCount.put(word, 1); } if (localwordCount.containsKey(word)) { // wordCount.replace(word, wordCount.get(word) + 1); } else { localwordCount.put(word, 1); } String linkHref = link.attr("href"); // String linkText = link.text(); } // // String words[] = htmlsource.split("\n");//\\s+"); // for (String word : words) { // String cleanword; // // cleanword = word.replaceAll("\r", "");//"[^\\p{L}\\p{Nd}]+", ""); // if (!cleanword.equals("")) { // if (!word.equals(cleanword)) { // word = cleanword;//System.out.println(word+"--"+cleanword); // } // word = word.toLowerCase().replace("", "i"); // // if (wordCount.containsKey(word) && !localwordCount.containsKey(word)) { // wordCount.replace(word, wordCount.get(word) + 1); // } else if (!wordCount.containsKey(word)) { // wordCount.put(word, 1); // } // // if (localwordCount.containsKey(word)) { //// wordCount.replace(word, wordCount.get(word) + 1); // } else { // localwordCount.put(word, 1); // } // // } else { // } // // } } pstmtendpoints.close(); rs1.close(); Iterator it = wordCount.entrySet().iterator(); while (it.hasNext()) { Map.Entry pair = (Map.Entry) it.next(); if (Integer.parseInt(pair.getValue().toString()) > 1) { PreparedStatement insertpstmt = con .prepareStatement("insert into recommender_endpoints_tf (word,count) values(?,?);"); if (pair.getKey().toString().length() > 44) { insertpstmt.setString(1, pair.getKey().toString()); } else { insertpstmt.setString(1, pair.getKey().toString()); } insertpstmt.setInt(2, Integer.parseInt(pair.getValue().toString())); insertpstmt.executeUpdate(); insertpstmt.close(); } it.remove(); // avoids a ConcurrentModificationException 
} } catch (Exception e) { //System.err.println("Got an exception! "); System.err.println(e.getMessage()); } }