List of usage examples for the org.jsoup.nodes.Element.absUrl method
public String absUrl(String attributeKey)
From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java
/** * ??/* w ww . j a va 2 s . c o m*/ * @param url * @return */ public HashMap<String, Object> getInformation(Page page) { HashMap<String, Object> map = Maps.newHashMap(); String url = page.getWebURL().getURL(); try { ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>(); epage.setUrl(page.getWebURL()); Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); Submitor.submitUrl(weburl); } } } if (url.contains("/show_page/")) { String title = doc.select(".title .name").text(); if (StringUtils.isBlank(title)) return null; map.put("title", title); String category = doc.select(".title .type a").text(); if (StringUtils.isBlank(category)) return null; map.put("category", category); String _year = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text()); if (StringUtils.isNotBlank(_year)) { int year = Integer.parseInt(_year); map.put("year", year); } String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text()); map.put("score", score); String alias = doc.select(".alias").text(); if (alias.contains(":")) { map.put("translation", alias.split(":")[1]); } String img = doc.select(".thumb img").attr("src"); if (StringUtils.isBlank(img)) return null; map.put("thumbnail", img); String area = doc.select(".row2 .area a").text(); if (StringUtils.isBlank(area)) return null; map.put("area", area); String[] type = doc.select(".row2 .type a").text().split(" "); if (null == type || type.length == 0) return null; map.put("type", Sets.newHashSet(type)); String director = doc.select(".row2 .director a").text(); map.put("director", director); String _duration = 
CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text()); if (StringUtils.isNotBlank(_duration)) { int duration = Integer.parseInt(_duration); map.put("duration", duration); } String _hot = CharMatcher.anyOf(",").removeFrom(doc.select(".row2 .vr .num").text()); _hot = CharMatcher.DIGIT.retainFrom(_hot); if (StringUtils.isNotBlank(_hot)) { int hot = Integer.parseInt(_hot); map.put("hot", hot); } String sumary = doc.select(".detail .long").text(); map.put("summary", sumary); // Elements elements = doc.select(".baseaction a"); HashMap<String, String> playList = Maps.newHashMap(); for (Element element : elements) { String n = element.text(); String urlString = element.attr("href"); if (StringUtils.isBlank(urlString)) return null; Document d2 = Jsoup.parse(new URL(urlString), 10000); if (null != d2) { String x = d2.select("#link2").attr("value"); if (StringUtils.isBlank(x)) return null; playList.put(n, x); } } map.put("online", playList); } else if (url.contains("/v_show/")) { Document d3 = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); // ???Url?Url Elements links2 = d3.getElementsByTag("a"); if (!links2.isEmpty()) { for (Element link : links2) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); try { pendingUrls.addUrl(weburl); } catch (QueueException e) { log.error(e.getMessage()); } } } } String p = d3.select("h1.title a").attr("href"); if (StringUtils.isBlank(p)) return null; return getInformation(p); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if (map != null && map.size() > 4) { if (null == map.get("year")) { map.put("year", 1800); } } return map; }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractYouku.java
private HashMap<String, Object> getInformation(String p) { HashMap<String, Object> map = Maps.newHashMap(); try {//from w w w. j ava 2s . c o m if (p.contains("/show_page/")) { Document doc = Jsoup.parse(new URL(p), 15000); // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (filterUrls(linkHref)) { WebURL weburl = new WebURL(); weburl.setURL(linkHref); weburl.setJobName(conf.jobName); try { pendingUrls.addUrl(weburl); } catch (QueueException e) { log.error(e.getMessage()); } } } } String title = doc.select(".title .name").text(); if (StringUtils.isBlank(title)) return null; map.put("title", title); String category = doc.select(".title .type a").text(); if (StringUtils.isBlank(category)) return null; map.put("category", category); String _year = CharMatcher.DIGIT.retainFrom(doc.select(".title .pub").text()); if (StringUtils.isNotBlank(_year)) { int year = Integer.parseInt(_year); map.put("year", year); } String score = CharMatcher.DIGIT.retainFrom(doc.select(".ratingstar .num").text()); map.put("score", score); String alias = doc.select(".alias").text(); if (alias.contains(":")) { map.put("translation", alias.split(":")[1]); } String img = doc.select(".thumb img").attr("src"); if (StringUtils.isBlank(img)) return null; map.put("thumbnail", Lists.newArrayList(img)); String area = doc.select(".row2 .area a").text(); if (StringUtils.isBlank(area)) return null; map.put("area", area); String[] type = doc.select(".row2 .type a").text().split(" "); if (null == type || type.length == 0) return null; map.put("type", Lists.newArrayList(type)); String director = doc.select(".row2 .director a").text(); map.put("director", director); String _duration = CharMatcher.DIGIT.retainFrom(doc.select(".row2 .duration").text()); if (StringUtils.isNotBlank(_duration)) { int duration = Integer.parseInt(_duration); map.put("duration", duration); } String _hot = 
CharMatcher.anyOf(",").removeFrom(doc.select(".row2 .vr .num").text()); if (StringUtils.isNotBlank(_hot)) { int hot = Integer.parseInt(_hot); map.put("hot", hot); } String sumary = doc.select(".detail .long").text(); map.put("summary", sumary); // Elements elements = doc.select(".baseaction a"); HashMap<String, String> playList = Maps.newHashMap(); for (Element element : elements) { String n = element.text(); String urlString = element.attr("href"); if (StringUtils.isBlank(urlString)) return null; Document d2 = Jsoup.parse(new URL(urlString), 10000); if (null != d2) { String x = d2.select("#link2").attr("value"); if (StringUtils.isBlank(x)) return null; playList.put(n, x); } } map.put("online", playList); } else return null; } catch (Exception e) { return map; } return map; }
From source file:webcralwerproject1.Webcrawler.java
public int spider(String url, String word) { try {/*from w w w . j a v a 2 s . c om*/ Connection connection = Jsoup.connect(url); Document htmlDocument = connection.timeout(0).get(); //make connection this.htmlDocument = htmlDocument; //download page int httpStatuscode = connection.response().statusCode(); connection.ignoreHttpErrors(false);//ignoreHttpErrors - - false (default) if HTTP errors should be ignored. if (connection.response().statusCode() == 200) // 200 is the HTTP OK status code , indicating that everything is great. { System.out.println("\n**Visiting** Received web page at " + url); } else { System.out.println("\nHttpStstaus code" + httpStatuscode); } if (!connection.response().contentType().contains("text/html")) { System.out.println("**Failure** Retrieved something other than HTML"); return 0; } Elements linksOnPage = htmlDocument.select("a[href]"); System.out.println("Found (" + linksOnPage.size() + ") links"); for (Element link : linksOnPage) { if (word == "") { //searchword is null this.links.add(link.absUrl("href"));//copy all links word_found = 1; } else { if (link.attr("href").contains(word)) {//copy links that contain searchword this.links.add(link.absUrl("href")); word_found = 1; } else { word_notfound = 1; //search word not present set flag } } } if (word_found == 1) { //after copying all links write the downloaded content if (htmlDocument != null) { String path = writeContent(htmlDocument); writeReportHtml(url, path, httpStatuscode); } else { System.out.println("Inside Spider - HTMLDOCUMENt null"); } } if (word_notfound == 1 && word_found == 0) { return 0; //search word not present in any link } return 1;//word found } catch (IOException ioe) { // We were not successful in our HTTP request System.out.println("Inside Spider - excpetion occured: " + ioe); return -1; } }
From source file:webscrap.WebScrap.java
/** * @param args the command line arguments *///from www.j a va 2s . c om public static void main(String[] args) { // TODO code application logic here Document doc; try { doc = Jsoup.connect( "http://www.metmuseum.org/collection/the-collection-online/search/15538?pos=1&rpp=30&pg=1&rndkey=20150122&ft=*&deptids=2") .get(); File jsonFile = new File("Records.json"); FileWriter output = new FileWriter(jsonFile); JSONArray store = new JSONArray(); //Declarations for JSON output String nameTag = "Name"; String name; String artistTag = "Artist"; String artistName; String imgURLTag = "imgURL"; String imgsrc; String dateTag = "Date"; String date; String geoTag = "Geography"; String geoVal; String cultureTag = "Culture"; String culture; String mediumTag = "Medium"; String medium; String dimTag = "Dimension"; String dim; String classTag = "Classification"; String classification; String credit_line_tag = "Credit_Line"; String credit_line; String accessNumTag = "Accession_Number"; String accessNum; String RnRTag = "Rights_and_Reproduction"; String RnR; //trying to load the next urls String next = "http://www.metmuseum.org/collection/the-collection-online/search/11432?pos=1&rpp=30&pg=1&rndkey=20150123&ft=*&deptids=2"; int i = 500; while (i != 0) { name = ""; artistName = ""; imgsrc = ""; date = ""; //geoVal = "not available"; //culture = "not available"; medium = ""; dim = ""; classification = ""; credit_line = ""; accessNum = ""; //RnR = "not available"; doc = Jsoup.connect(next).get(); String o_title = doc.getElementsByTag("h2").text(); String[] part_o = o_title.split("Email"); String part_o1 = part_o[0]; String part_o2 = part_o[1]; //System.out.println(o_title); name = part_o1; //String artist = doc.getElementsByTag("h3").text(); //System.out.println(artist); //artistName = artist; Elements imgdiv = doc.select("div#inner-image-container img"); for (Element e : imgdiv) { imgsrc = e.absUrl("src"); } Elements divs; divs = doc.select("div.tombstone"); Elements divchild; divchild 
= divs.select("div"); int count = 0; for (Element div : divchild) { String info = div.text(); if (count != 0) { String[] parts = info.split(":"); String part1 = parts[0]; String part2 = parts[1]; switch (part1) { case "Artist": artistName = part2; break; case "Date": date = part2; break; case "Geography": geoVal = part2; break; case "Culture": culture = part2; break; case "Medium": medium = part2; break; case "Dimensions": dim = part2; break; case "Classification": classification = part2; break; case "Credit Line": credit_line = part2; break; case "Accession Number": accessNum = part2; break; case "Rights and Reproduction": RnR = part2; break; } } count++; } if (classification.equals(" Paintings")) { //System.out.println(nameTag+name); //System.out.println(artistTag+artistName); //System.out.println(imgURLTag+imgsrc); //System.out.println(dateTag+date); //System.out.println(mediumTag+medium); //System.out.println(dimTag+dim); //System.out.println(classTag+classification); //System.out.println(credit_line_tag+credit_line); //System.out.println(accessNumTag+accessNum); //System.out.println(i); //json writing JSONObject jsonObj = new JSONObject(); jsonObj.put(nameTag, name); jsonObj.put(artistTag, artistName); jsonObj.put(imgURLTag, imgsrc); jsonObj.put(dateTag, date); jsonObj.put(mediumTag, medium); jsonObj.put(dimTag, dim); jsonObj.put(classTag, classification); jsonObj.put(credit_line_tag, credit_line); jsonObj.put(accessNumTag, accessNum); store.add(jsonObj); i--; } //going to next page Element link = doc.select("a.next").first(); next = link.attr("abs:href"); } output.write(store.toJSONString()); output.write("\n"); output.flush(); output.close(); } catch (IOException e) { } }