List of usage examples for org.jsoup.nodes.Element.hasClass
public boolean hasClass(String className)
From source file:de.geeksfactory.opacclient.apis.Heidi.java
@Override public DetailledItem getResultById(String id, final String homebranch) throws IOException { if (sessid == null) { start();/*from w ww. java2s . c o m*/ } // Homebranch if (homebranch != null && !"".equals(homebranch)) { cookieStore.addCookie(new BasicClientCookie("zweig", homebranch)); } String html = httpGet(opac_url + "/titel.cgi?katkey=" + id + "&sess=" + sessid, ENCODING, false, cookieStore); Document doc = Jsoup.parse(html); DetailledItem item = new DetailledItem(); item.setId(id); Elements table = doc.select(".titelsatz tr"); for (Element tr : table) { if (tr.select("th").size() == 0 || tr.select("td").size() == 0) { continue; } String d = tr.select("th").first().text(); String c = tr.select("td").first().text(); if (d.equals("Titel:")) { item.setTitle(c); } else if ((d.contains("URL") || d.contains("Link")) && tr.select("td a").size() > 0) { item.addDetail(new Detail(d, tr.select("td a").first().attr("href"))); } else { item.addDetail(new Detail(d, c)); } } if (doc.select(".ex table tr").size() > 0) { table = doc.select(".ex table tr"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (Element tr : table) { if (tr.hasClass("exueber") || tr.select(".exsig").size() == 0 || tr.select(".exso").size() == 0 || tr.select(".exstatus").size() == 0) { continue; } Copy copy = new Copy(); copy.setShelfmark(tr.select(".exsig").first().text()); copy.setBranch(tr.select(".exso").first().text()); String status = tr.select(".exstatus").first().text(); if (status.contains("entliehen bis")) { copy.setReturnDate(fmt.parseLocalDate(status.replaceAll("entliehen bis ([0-9.]+) .*", "$1"))); copy.setReservations(status.replaceAll(".*\\(.*Vormerkungen: ([0-9]+)\\)", "$1")); copy.setStatus("entliehen"); } else { copy.setStatus(status); } item.addCopy(copy); } } for (Element a : doc.select(".status1 a")) { if (a.attr("href").contains("bestellung.cgi")) { item.setReservable(true); item.setReservation_info(id); break; } } for 
(Element a : doc.select(".titelsatz a")) { if (a.text().trim().matches("B.+nde")) { Map<String, String> volumesearch = new HashMap<>(); volumesearch.put("query", getQueryParamsFirst(a.attr("href")).get("query")); item.setVolumesearch(volumesearch); } } return item; }
From source file:com.weavers.duqhan.business.impl.ProductServiceImpl.java
@Override public void loadTempProducts(List<StatusBean> statusBeans) { boolean isSuccess = true; String startDate = new Date().toString(); Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==I==)DATE: " + startDate + "Store product details in temp product table start....."); try {/*w w w . j a va 2s. co m*/ String status = ""; for (StatusBean statusBean : statusBeans) { status = "Link duplicate"; Temtproductlinklist temtproductlinklist = temtproductlinklistDao.loadById(statusBean.getId()); if (temtproductlinklist != null && temtproductlinklist.getStatus() == 0) { Product testProduct = productDao.getProductByExternelLink(temtproductlinklist.getLink()); if (testProduct == null) { String value = ""; Elements detailMain; Elements detailSub; Elements specifics; double votes = 0.0; double stars = 0.0; double feedback = 0.0; String url = temtproductlinklist.getLink(); try { testProduct = new Product(); Product savedTestProduct; //=================== Random sleep START ===================// // TimeUnit.SECONDS.sleep(30 + (int) (Math.random() * 100)); Random randomObj = new Random(); TimeUnit.SECONDS.sleep(randomObj.ints(30, 60).findFirst().getAsInt()); //=================== Random sleep END =====================// Document doc = Jsoup.connect(url).get(); detailMain = doc.select("#j-detail-page"); if (!detailMain.isEmpty()) { //=================== Criteria Block START==================// detailMain = doc.select(".rantings-num"); if (!detailMain.isEmpty()) { votes = Double.valueOf(detailMain.text().split(" votes")[0].split("\\(")[1]); } detailMain = doc.select(".percent-num"); if (!detailMain.isEmpty()) { stars = Double.valueOf(detailMain.text()); } detailMain = doc.select("ul.ui-tab-nav li[data-trigger='feedback'] a"); if (!detailMain.isEmpty()) { feedback = Double.valueOf(detailMain.text().split("\\(")[1].split("\\)")[0]); } //=================== Criteria Block END==================// if (votes > 10.0 && stars > 4.0 && feedback > 4.0) { detailMain = 
doc.select(".detail-wrap .product-name"); testProduct.setName(detailMain .text());/*.substring(0, Math.min(detailMain.text().length(), 50))*/ detailMain = doc.select(".detail-wrap .product-name"); testProduct.setDescription(detailMain.text()); testProduct.setExternalLink(url); testProduct.setVendorId(1l);//?????????????????????? //=================== Packaging block START==================// Double weight = 1.0; Double width = 1.0; Double height = 1.0; Double length = 1.0; detailMain = doc.select( "div#j-product-desc div.pnl-packaging-main ul li.packaging-item"); for (Element element : detailMain) { String packagingTitle = element.select("span.packaging-title").text(); String packagingDesc = element.select("span.packaging-des").text(); if (packagingTitle.trim().equals("Package Weight:")) { String str = packagingDesc; str = str.replaceAll("[^.?0-9]+", " "); if (Arrays.asList(str.trim().split(" ")) != null) { if (!Arrays.asList(str.trim().split(" ")).isEmpty()) { try { weight = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(0)); } catch (Exception e) { weight = 1.0; } } } System.out.println("weight == " + weight); } else if (packagingTitle.trim().equals("Package Size:")) { String str = packagingDesc; str = str.replaceAll("[^.?0-9]+", " "); if (Arrays.asList(str.trim().split(" ")) != null) { if (!Arrays.asList(str.trim().split(" ")).isEmpty()) { try { width = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(0)); height = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(1)); length = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(2)); } catch (Exception e) { width = 1.0; height = 1.0; length = 1.0; } } } System.out.println("width == " + width); System.out.println("height == " + height); System.out.println("length == " + length); } } //=================== Packaging block END==================// //=================== Category block START==================// detailMain = doc.select("div.ui-breadcrumb div.container a"); 
Long productCategoryId = 0L; String parentPath = ""; String thisCategory = detailMain.last().text().trim(); System.out.println("thisCategory == " + thisCategory); Category parentCategory = new Category(); parentCategory.setId(0L); parentCategory.setParentPath(""); for (Element element : detailMain) { String newCategory; newCategory = element.text().trim(); System.out.println("newCategory======" + newCategory); if (newCategory.equals("Home") || newCategory.equals("All Categories")) { } else { Category category = categoryDao.getCategoryByName(newCategory); if (category != null) { if (category.getName().equals(thisCategory)) { productCategoryId = category.getId(); parentPath = category.getParentPath(); } parentCategory = category; } else { category = new Category(); category.setId(null); category.setName(newCategory); category.setParentId(parentCategory.getId()); category.setParentPath(parentCategory.getParentPath() + parentCategory.getId() + "="); category.setQuantity(0); category.setImgUrl("-"); category.setDisplayText(newCategory); Category category2 = categoryDao.save(category); if (category.getName().equals(thisCategory)) { productCategoryId = category2.getId(); parentPath = category2.getParentPath(); } parentCategory = category2; } } } //=================== Category block END==================// //=============== Specifications block START==============// detailMain = doc.select(".product-property-list .property-item"); String specifications = ""; for (Element element : detailMain) { specifications = specifications + element.select(".propery-title").get(0).text().replace(",", "/") .replace(":", "-") + ":" + element.select(".propery-des").get(0).text() .replace(",", "/").replace(":", "-") + ",";//TODO:, check } //=============== Specifications Block END==============// //=============== Shipping Time Block START==============// String shippingTime = ""; detailMain = doc.select(".shipping-days[data-role='delivery-days']"); System.out.println("value detailMain" + 
detailMain.toString()); shippingTime = detailMain.text(); //=============== Shipping Time Block END==============// //=============== Shipping Cost Block START==============// detailMain = doc.select(".logistics-cost"); value = detailMain.text(); if (!value.equalsIgnoreCase("Free Shipping")) { // f = 0.00; } else { // f = Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1")); } //=============== Shipping Cost Block END==============// //=================Product save 1st START==============// testProduct.setCategoryId(productCategoryId); testProduct.setLastUpdate(new Date()); testProduct.setParentPath(parentPath); testProduct.setImgurl("-"); testProduct.setProperties("-"); testProduct.setProductWidth(width); testProduct.setProductLength(length); testProduct.setProductWeight(weight); testProduct.setProductHeight(height); testProduct.setShippingRate(0.0); testProduct.setShippingTime("45"); testProduct.setSpecifications(specifications); savedTestProduct = productDao.save(testProduct); //====================Product save 1st END==============// //========= Property, Property Value, Property Product Map Block START ========// double discountPrice = 0.0; double actualPrice = 0.0; double markupPrice = 0.0; String id = ""; String allProperties = ""; //------------------------Read Color css START---------------------// specifics = doc.select("#j-product-info-sku dl.p-property-item"); Elements cssdetailMain = doc.select("link[href]"); Document cssdoc = new Document(""); System.out.println( "====================================================cssdetailMain" + cssdetailMain.size()); for (Element element : cssdetailMain) { String cssurl = element.attr("abs:href"); if (cssurl.contains("??main-detail")) { try { cssdoc = Jsoup.connect(cssurl).get(); } catch (IOException ex) { } break; } } //-----------------------Read Color css END--------------------------// //-----------Product Property, Property Value START--------// Map<String, ProductPropertyvalues> propertyValuesMap = 
new HashMap<>(); if (!specifics.isEmpty()) { ProductProperties testPorperties; ProductProperties saveTestPorperties; ProductPropertyvalues testPropertyValues; for (Element specific : specifics) { System.out.println("head ==== " + specific.select("dt").text()); testPorperties = productPropertiesDao .loadByName(specific.select("dt").text()); if (testPorperties == null) { testPorperties = new ProductProperties(); testPorperties.setPropertyName(specific.select("dt").text()); saveTestPorperties = productPropertiesDao.save(testPorperties); } else { saveTestPorperties = testPorperties; } allProperties = allProperties + saveTestPorperties.getId().toString() + "-"; detailSub = specific.select("dd ul li"); String valu = "-"; for (Element element : detailSub) { testPropertyValues = new ProductPropertyvalues(); id = element.select("a[data-sku-id]").attr("data-sku-id").trim(); testPropertyValues.setRefId(id); if (element.hasClass("item-sku-image")) { valu = element.select("a img[src]").get(0).absUrl("src") .split(".jpg")[0] + ".jpg"; String title = element.select("a img").get(0).attr("title"); String imgUrl = GoogleBucketFileUploader .uploadProductImage(valu, savedTestProduct.getId()); valu = "<img src='" + imgUrl + "' title='" + title + "' style='height:40px; width:40px;'/>"; } else if (element.hasClass("item-sku-color")) { String style = cssdoc.html().split("sku-color-" + id)[1] .split("}")[0].substring(1); valu = "<span style='" + style + "' ; height:40px; width:40px; display:block;'></span>"; } else { valu = element.select("a span").toString(); } System.out.println("valu === " + valu); testPropertyValues.setProductId(savedTestProduct.getId()); testPropertyValues.setPropertyId(saveTestPorperties.getId()); testPropertyValues.setValueName(valu); propertyValuesMap.put(id, productPropertyvaluesDao.save(testPropertyValues)); } } savedTestProduct.setProperties(allProperties); } //-----------Product Property, Property Value END--------// //----------------------Read json 
START------------------// List<AxpProductDto> axpProductDtos = new ArrayList<>(); Elements scripts = doc.select("script"); // Get the script part for (Element script : scripts) { if (script.html().contains("var skuProducts=")) { String jsonData = ""; jsonData = script.html().split("var skuProducts=")[1] .split("var GaData")[0].trim(); jsonData = jsonData.substring(0, jsonData.length() - 1); Gson gsonObj = new Gson(); axpProductDtos = Arrays .asList(gsonObj.fromJson(jsonData, AxpProductDto[].class)); break; } } //----------------------Read json END------------------// //-------------Product Properties Map START------------// for (AxpProductDto thisAxpProductDto : axpProductDtos) { SkuVal skuVal = thisAxpProductDto.getSkuVal(); if (skuVal.getActSkuCalPrice() != null) { value = skuVal.getActSkuCalPrice().trim(); discountPrice = CurrencyConverter.usdTOinr( Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1"))); value = skuVal.getSkuCalPrice().trim(); actualPrice = CurrencyConverter.usdTOinr( Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1"))); markupPrice = discountPrice * 0.15 + 100; discountPrice = Math.ceil((discountPrice + markupPrice) / 10) * 10; actualPrice = Math.round(actualPrice + markupPrice); } else { discountPrice = 0.0; value = skuVal.getSkuCalPrice().trim(); actualPrice = CurrencyConverter.usdTOinr( Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1"))); markupPrice = actualPrice * 0.15 + 100; discountPrice = Math.round(actualPrice + markupPrice); actualPrice = Math.round(actualPrice + markupPrice); } ProductPropertiesMap productPropertyMap = new ProductPropertiesMap(); String myPropValueIds = ""; if (thisAxpProductDto.getSkuAttr() != null) { String[] skuPropIds = thisAxpProductDto.getSkuPropIds().split(","); for (String skuPropId : skuPropIds) { myPropValueIds = myPropValueIds + propertyValuesMap.get(skuPropId).getId().toString() + "_"; } productPropertyMap.setPropertyvalueComposition(myPropValueIds); } else { 
productPropertyMap.setPropertyvalueComposition("_"); } productPropertyMap.setDiscount(discountPrice); productPropertyMap.setPrice(actualPrice); productPropertyMap.setProductId(savedTestProduct); productPropertyMap.setQuantity(5l); productPropertiesMapDao.save(productPropertyMap); } //-------------Product Properties Map START------------// //========= Property, Property Value, Property Product Map Block END ========// //============= Multiple Image Block START =============// detailMain = doc.select("ul.image-thumb-list span.img-thumb-item img[src]"); int flg = 0; String imgUrl = ""; for (Element element : detailMain) { imgUrl = GoogleBucketFileUploader.uploadProductImage( element.absUrl("src").split(".jpg")[0] + ".jpg", savedTestProduct.getId()); if (flg == 0) { flg++; savedTestProduct.setImgurl(imgUrl); } else { ProductImg productImg = new ProductImg(); productImg.setId(null); productImg.setImgUrl(imgUrl); productImg.setProductId(savedTestProduct.getId()); productImgDao.save(productImg); } } //============= Multiple Image Block END =============// //=================Product save final START==============// if (productDao.save(savedTestProduct) != null) { temtproductlinklist.setStatus(1);// temtproductlinklistDao.save(temtproductlinklist); status = "Success"; } //=================Product save final START==============// } else { temtproductlinklist.setStatus(2);// temtproductlinklistDao.save(temtproductlinklist); status = "criteria mismatch"; } } else { status = "Page not found"; } } catch (Exception ex) { System.out.println( "=============================================================Exception1" + ex); temtproductlinklist.setStatus(4);// temtproductlinklistDao.save(temtproductlinklist); System.out.println("Exception === " + ex); status = "Failure"; Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==E==)DATE: " + new Date().toString() + "Store product details in temp product table get error in sub process.....\n Link Id: " + 
statusBean.getId() + "\n Started on" + startDate, ex); } } else { temtproductlinklist.setStatus(3);// temtproductlinklistDao.save(temtproductlinklist); status = "Product exsist"; } } // String body = "Id: " + temtproductlinklist.getId() + "<br/> Status: " + status; // MailSender.sendEmail("krisanu.nandi@pkweb.in", "Product captured", body, "subhendu.sett@pkweb.in"); statusBean.setStatus(status); } System.out.println("=============================================================status" + status); } catch (Exception e) { System.out.println("=============================================================Exception2" + e); isSuccess = false; String body = "(==E==)DATE: " + new Date().toString() + "Store product details in temp product table get error.....<br/> Started on" + startDate + "<br/>"; Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, body, e); // MailSender.sendEmail("krisanu.nandi@pkweb.in", "Stopped store product details", body + e.getLocalizedMessage(), "subhendu.sett@pkweb.in"); } if (isSuccess) { String body = "(==I==)DATE: " + new Date().toString() + "Store product details in temp product table end.....<br/> Started on" + startDate; Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, body); /*ObjectMapper mapper = new ObjectMapper(); try { MailSender.sendEmail("krisanu.nandi@pkweb.in", "Completed store product details", body + "=============<br/><br/>" + mapper.writeValueAsString(statusBeans), "subhendu.sett@pkweb.in"); } catch (JsonProcessingException ex) { Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, null, ex); }*/ } // return statusBeans; System.out.println("=============================================================end"); }
From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java
void extractWidget(JSONObject originalStructure, String contentId, Set<String> widgetsUsed, String ref, JSONObject currentPage, JSONObject currentRow, int leftSideColumn, Element widgetElement) throws JSONException { String[] widgetIdParts = widgetElement.attr("id").split("_"); String widgetType = widgetIdParts[1]; String widgetId = widgetIdParts.length > 2 ? widgetIdParts[2] : generateWidgetId(); int columnIndex; if (widgetElement.hasClass("block_image_left")) { columnIndex = 0;//from w w w .j a v a 2 s .c o m } else if (widgetElement.hasClass("block_image_right")) { columnIndex = leftSideColumn > 0 ? 2 : 1; } else { columnIndex = leftSideColumn > 0 ? 1 : 0; } generateNewCell(widgetId, widgetType, currentPage, currentRow, columnIndex, getJSONObjectOrNull(originalStructure, widgetId)); widgetsUsed.add(widgetId); if ("discussion".equals(widgetType)) { migrateDiscussionWidget(contentId, ref, currentPage, widgetId); } widgetElement.remove(); }
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMetadata(); }//from w ww. j av a 2s .co m MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(MediaMetadata.IMDBID, imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>( executor); // worker for imdb request (/combined) (everytime from akas.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/combined"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); Future<Document> futureCombined = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary = null; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (options.isScrapeImdbForeignLanguage() || 
options.isScrapeCollectionInfo()) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureCombined.get(); /* * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // parse title and year Element title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title Elements elements = title.getElementsByTag("h1"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // year elements = title.getElementsByTag("span"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); // search year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); Matcher matcher = yearPattern.matcher(content); while (matcher.find()) { if (matcher.group(1) != null) { String movieYear = matcher.group(1); md.storeMetadata(MediaMetadata.YEAR, movieYear); break; } } } // original title elements = title.getElementsByAttributeValue("class", "title-extra"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); content = content.replaceAll("\\(original title\\)", "").trim(); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content); } } // poster Element poster = doc.getElementById("primary-poster"); if (poster != null) { String posterUrl = poster.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating 
count Element ratingElement = doc.getElementById("tn15rating"); if (ratingElement != null) { Elements elements = ratingElement.getElementsByClass("starbar-meta"); if (elements.size() > 0) { Element div = elements.get(0); // rating comes in <b> tag Elements b = div.getElementsByTag("b"); if (b.size() == 1) { String ratingAsString = b.text(); Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10"); Matcher matcher = ratingPattern.matcher(ratingAsString); while (matcher.find()) { if (matcher.group(1) != null) { float rating = 0; try { rating = Float.valueOf(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.RATING, rating); break; } } } // count Elements a = div.getElementsByAttributeValue("href", "ratings"); if (a.size() == 1) { String countAsString = a.text().replaceAll("[.,]|votes", "").trim(); int voteCount = 0; try { voteCount = Integer.parseInt(countAsString); } catch (Exception e) { } md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount); } } // top250 elements = ratingElement.getElementsByClass("starbar-special"); if (elements.size() > 0) { Elements a = elements.get(0).getElementsByTag("a"); if (a.size() > 0) { Element anchor = a.get(0); Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(anchor.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { int top250 = 0; try { top250 = Integer.parseInt(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.TOP_250, top250); } } } } } // parse all items coming by <div class="info"> Elements elements = doc.getElementsByClass("info"); for (Element element : elements) { // only parse divs if (!"div".equals(element.tag().getName())) { continue; } // elements with h5 are the titles of the values Elements h5 = element.getElementsByTag("h5"); if (h5.size() > 0) { Element firstH5 = h5.first(); String h5Title = firstH5.text(); // release date /* * <div class="info"><h5>Release Date:</h5><div 
class="info-content">5 January 1996 (USA)<a class="tn15more inline" * href="/title/tt0114746/releaseinfo" * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a> </div></div> */ if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element releaseDateElement = div.first(); String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", "")); Pattern pattern = Pattern.compile("(.*)\\(.*\\)"); Matcher matcher = pattern.matcher(releaseDate); if (matcher.find()) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy"); Date parsedDate = sdf.parse(matcher.group(1)); sdf = new SimpleDateFormat("dd-MM-yyyy"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate)); } catch (Exception e) { } } } } /* * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline" * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See * more</a> » </div></div> */ // tagline if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*") && !options.isScrapeImdbForeignLanguage()) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.storeMetadata(MediaMetadata.TAGLINE, tagline); } } /* * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" 
href="/title/tt0472033/keywords" onClick= * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a> » </div> */ // genres are only scraped from akas.imdb.com if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Elements a = div.first().getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/Sections/Genres/.*")) { md.addGenre(getTmmGenre(anchor.ownText())); } } } } // } /* * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div> */ // runtime // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String first = taglineElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.storeMetadata(MediaMetadata.RUNTIME, runtime); } } /* * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div> */ // country if (h5Title.matches("(?i)Country.*")) { Elements a = element.getElementsByTag("a"); String countries = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/country/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String country = matcher.group(1); 
if (StringUtils.isNotEmpty(countries)) { countries += ", "; } countries += country.toUpperCase(); } } md.storeMetadata(MediaMetadata.COUNTRY, countries); } /* * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div> */ // Spoken languages if (h5Title.matches("(?i)Language.*")) { Elements a = element.getElementsByTag("a"); String spokenLanguages = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/language/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String langu = matcher.group(1); if (StringUtils.isNotEmpty(spokenLanguages)) { spokenLanguages += ", "; } spokenLanguages += langu; } } md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages); } /* * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div> */ // certification // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { // certification for the right country if (anchor.attr("href").matches( "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) { Pattern certificationPattern = Pattern.compile(".*:(.*)"); Matcher matcher = certificationPattern.matcher(anchor.ownText()); Certification certification = null; while (matcher.find()) { if (matcher.group(1) != null) { certification = Certification.getCertification(options.getCountry(), 
matcher.group(1)); } } if (certification != null) { md.addCertification(certification); break; } } } } } /* * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick= * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div> */ // director if ("director-info".equals(element.id())) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/name/nm.*")) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } } /* * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick= * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src= * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick= * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... 
</td><td * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick= * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick= * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table> */ // cast elements = doc.getElementsByClass("cast"); if (elements.size() > 0) { Elements tr = elements.get(0).getElementsByTag("tr"); for (Element row : tr) { Elements td = row.getElementsByTag("td"); MediaCastMember cm = new MediaCastMember(); for (Element column : td) { // actor thumb if (column.hasClass("hs")) { Elements img = column.getElementsByTag("img"); if (img.size() > 0) { String thumbUrl = img.get(0).attr("src"); if (thumbUrl.contains("no_photo.png")) { cm.setImageUrl(""); } else { thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", ""); cm.setImageUrl(thumbUrl); } } } // actor name if (column.hasClass("nm")) { cm.setName(cleanString(column.text())); } // character if (column.hasClass("char")) { cm.setCharacter(cleanString(column.text())); } } if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(CastType.ACTOR); md.addCastMember(cm); } } } Element content = doc.getElementById("tn15content"); if (content != null) { elements = content.getElementsByTag("table"); for (Element table : elements) { // writers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) { Elements anchors = table.getElementsByTag("a"); for (Element anchor : anchors) { if (anchor.attr("href").matches("/name/nm.*")) { 
MediaCastMember cm = new MediaCastMember(CastType.WRITER); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } // producers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { continue; } Elements columns = row.children(); if (columns.size() == 0) { continue; } MediaCastMember cm = new MediaCastMember(CastType.PRODUCER); String name = cleanString(columns.get(0).text()); if (StringUtils.isBlank(name)) { continue; } cm.setName(name); if (columns.size() >= 3) { cm.setPart(cleanString(columns.get(2).text())); } md.addCastMember(cm); } } } } // Production companies elements = doc.getElementsByClass("blackcatheader"); for (Element blackcatheader : elements) { if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) { Elements a = blackcatheader.nextElementSibling().getElementsByTag("a"); StringBuilder productionCompanies = new StringBuilder(); for (Element anchor : a) { if (StringUtils.isNotEmpty(productionCompanies)) { productionCompanies.append(", "); } productionCompanies.append(anchor.ownText()); } md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString()); break; } } /* * plot from /plotsummary */ // build the url doc = null; doc = futurePlotsummary.get(); // imdb.com has another site structure if (imdbSite == ImdbSiteDefinition.IMDB_COM) { Elements zebraList = doc.getElementsByClass("zebraList"); if (zebraList != null && !zebraList.isEmpty()) { Elements odd = zebraList.get(0).getElementsByClass("odd"); if (odd.isEmpty()) { odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even } if (odd.size() > 0) { Elements p = odd.get(0).getElementsByTag("p"); if (p.size() > 0) { String plot = cleanString(p.get(0).ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); 
if (wiki != null) { String plot = cleanString(wiki.ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } } } // } // get data from tmdb? if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) { MediaMetadata tmdbMd = futureTmdb.get(); if (options.isScrapeImdbForeignLanguage() && tmdbMd != null && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) { // tmdbid md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID)); // title md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE)); // original title md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE)); // tagline md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE)); // plot md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT)); // collection info md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); } if (options.isScrapeCollectionInfo() && tmdbMd != null) { md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } return md; }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void findRecommendations(@NotNull Document doc, @NotNull BilingualQueryResultBuilder resultBuilder) { // Determine all candidate nodes: Elements alternativeNodes = doc.select("div.cc > p > *"); Language currentLanguage = null;//from w w w.j ava 2s . c om for (Element node : alternativeNodes) { // If the next node is a flagicon, try to determine the language for the next entries from the class name if (node.tagName().equals("span") && node.hasClass("flagicon")) { Set<String> classNames = node.classNames(); classNames.remove("flagicon"); for (String className : classNames) { Language candidate = Language.getExistingLanguageById(className); if (candidate != null) { currentLanguage = candidate; break; } } } else if (node.tagName().equals("a")) { String recommendationText = node.text(); DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder(); objectBuilder.setLanguage(currentLanguage).setGeneralForm(recommendationText); resultBuilder.addSimilarRecommendation(objectBuilder.build()); } } }