List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:com.screenslicer.core.util.Util.java
public static WebElement toElement(RemoteWebDriver driver, HtmlNode htmlNode, Element body) throws ActionFailed { if (body == null) { body = Util.openElement(driver, null, null, null); }//from w w w . jav a 2 s. c om if (!CommonUtil.isEmpty(htmlNode.id)) { WebElement element = toElement(driver, body.getElementById(htmlNode.id)); if (element != null) { return element; } } List<Elements> selected = new ArrayList<Elements>(); if (!CommonUtil.isEmpty(htmlNode.tagName)) { selected.add(body.getElementsByTag(htmlNode.tagName)); } else if (!CommonUtil.isEmpty(htmlNode.href)) { selected.add(body.getElementsByTag("a")); } if (!CommonUtil.isEmpty(htmlNode.name)) { selected.add(body.getElementsByAttributeValue("name", htmlNode.name)); } if (!CommonUtil.isEmpty(htmlNode.type)) { selected.add(body.getElementsByAttributeValue("type", htmlNode.type)); } if (!CommonUtil.isEmpty(htmlNode.value)) { selected.add(body.getElementsByAttributeValue("value", htmlNode.value)); } if (!CommonUtil.isEmpty(htmlNode.title)) { selected.add(body.getElementsByAttributeValue("title", htmlNode.title)); } if (htmlNode.classes != null && htmlNode.classes.length > 0) { Map<Element, Integer> found = new HashMap<Element, Integer>(); for (int i = 0; i < htmlNode.classes.length; i++) { Elements elements = body.getElementsByClass(htmlNode.classes[i]); for (Element element : elements) { if (!found.containsKey(element)) { found.put(element, 0); } found.put(element, found.get(element) + 1); } } Elements elements = new Elements(); for (int i = htmlNode.classes.length; i > 0; i--) { for (Map.Entry<Element, Integer> entry : found.entrySet()) { if (entry.getValue() == i) { elements.add(entry.getKey()); } } if (!elements.isEmpty()) { break; } } selected.add(elements); } if (!CommonUtil.isEmpty(htmlNode.href)) { Elements hrefs = body.getElementsByAttribute("href"); Elements toAdd = new Elements(); String currentUrl = driver.getCurrentUrl(); String hrefGiven = htmlNode.href; for (Element href : hrefs) { String 
hrefFound = href.attr("href"); if (hrefGiven.equalsIgnoreCase(hrefFound)) { toAdd.add(href); } else { String uriGiven = Util.toCanonicalUri(currentUrl, hrefGiven); String uriFound = Util.toCanonicalUri(currentUrl, hrefFound); if (uriGiven.equalsIgnoreCase(uriFound)) { toAdd.add(href); } } } selected.add(toAdd); } if (!CommonUtil.isEmpty(htmlNode.innerText)) { selected.add(body.getElementsMatchingText(Pattern.quote(htmlNode.innerText))); } if (htmlNode.multiple != null) { selected.add(body.getElementsByAttribute("multiple")); } Map<Element, Integer> votes = new HashMap<Element, Integer>(); for (Elements elements : selected) { for (Element element : elements) { if (!Util.isHidden(element)) { if (!votes.containsKey(element)) { votes.put(element, 0); } votes.put(element, votes.get(element) + 1); } } } int maxVote = 0; Element maxElement = null; for (Map.Entry<Element, Integer> entry : votes.entrySet()) { if (entry.getValue() > maxVote) { maxVote = entry.getValue(); maxElement = entry.getKey(); } } return toElement(driver, maxElement); }
From source file:org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java
/** * Parse the outlinks from a href tags in the document, and return them as a * list/* w w w .j a va 2s. co m*/ * * @param doc The document to parse * @return The list of outlinks as URL strings. */ private List<String> getOutlinksFromDocument(Document doc) { List<String> outlinks = new ArrayList<String>(); Elements linkElements = doc.select("a[href]"); for (Element linkElement : linkElements) { outlinks.add(linkElement.attr("href").trim()); } return outlinks; }
From source file:com.pemikir.youtubeplus.youtube.YoutubeExtractor.java
private VideoInfoItem extractVideoInfoItem(Element li) { VideoInfoItem info = new VideoInfoItem(); info.webpage_url = li.select("a[class*=\"content-link\"]").first().attr("abs:href"); try {//from ww w . j a v a 2 s . co m Pattern p = Pattern.compile("v=([0-9a-zA-Z-]*)"); Matcher m = p.matcher(info.webpage_url); m.find(); info.id = m.group(1); } catch (Exception e) { e.printStackTrace(); } info.title = li.select("span[class=\"title\"]").first().text(); info.uploader = li.select("span[class=\"g-hovercard\"]").first().text(); info.duration = li.select("span[class=\"video-time\"]").first().text(); Element img = li.select("img").first(); info.thumbnail_url = img.attr("abs:src"); // Sometimes youtube sends links to gif files witch somehow seam to not exist // anymore. Items with such gif also offer a secondary image source. So we are going // to use that if we caught such an item. if (info.thumbnail_url.contains(".gif")) { info.thumbnail_url = img.attr("data-thumb"); } return info; }
From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java
/** * ???//w ww. ja v a 2 s . c o m */ @Override public Collection<HttpSeed> findPageSeed(Collection<HttpSeed> seeds) throws Exception { if (CollectionUtils.isEmpty(seeds)) { return null; } Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>(); for (HttpSeed seed : seeds) { Document doc = parse(seed.getHtml()); // ?URL Elements page_form_elements = doc.select("#pageForm"); if (page_form_elements.isEmpty()) { return null; } Element page_form_e = page_form_elements.get(0); // URL String url = DOMAIN + page_form_e.attr("action"); Elements param_elements = page_form_e.select("input"); // int totalPageNum = this.getTotalPageNum(doc); for (int pageNo = 1; pageNo <= totalPageNum; pageNo++) { // ? Map<String, String> params = new HashMap<String, String>(); for (Element param_e : param_elements) { params.put(param_e.attr("name"), param_e.attr("value")); } // params.put("curstart", String.valueOf(pageNo)); HttpSeed httpSeed = this.initListHttpSeed(url, params); seedGroups.add(httpSeed); } } return seedGroups; }
From source file:org.confab.PhpBB3Parser.java
public void postForumThread(Forum targetForum, Post newPost, User user) { Utilities.debug("postForumThread"); try {// w ww . ja v a 2s. c om String reply_page = targetForum.rootURL() + "newthread.php?do=newthread&f=" + targetForum.id; Utilities.debug("GET: " + reply_page); HttpGet httpget = new HttpGet(reply_page); HttpResponse response = httpclient.execute(httpget, user.httpContext); HttpEntity entity = response.getEntity(); Document page = Jsoup.parse(EntityUtils.toString(entity)); EntityUtils.consume(entity); assert page != null; // TODO: need check to make sure we're on the right page. HttpEntity's // can just contain garbage and jsoup will still consume it // Make sure we're logged in before going any further Element username_box = page.select("input[name=vb_login_username]").first(); assert username_box == null; Element password_box = page.select("input[name=vb_login_password]").first(); assert password_box == null; // Construct POST HttpPost httppost = new HttpPost(targetForum.rootURL() + "newthread.php"); List<NameValuePair> nvps = new ArrayList<NameValuePair>(); // TODO: fix subject nvps.add(new BasicNameValuePair("subject", "hello world")); nvps.add(new BasicNameValuePair("message", newPost.message)); // Find the form - we can parse the rest of the needed elements from it Element reply_form = page.select("form[action*=newthread.php?do=postthread&f=]").first(); assert reply_form != null; String[] vals_array = { "s", "securitytoken", "f", "do", "posthash", "poststarttime", "loggedinuser" }; List<String> vals = Arrays.asList(vals_array); for (String val : vals) { Element el = reply_form.select("input[name=" + val + "]").first(); assert el != null : val; nvps.add(new BasicNameValuePair(val, el.attr("value"))); } httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8)); // Execute the POST Utilities.debug("Executing POST"); response = httpclient.execute(httppost, user.httpContext); Utilities.debug("POST response: " + response.getStatusLine()); assert 
response.getStatusLine().getStatusCode() == 302; } catch (IOException e) { System.out.println(e); } Utilities.debug("end postForumThread"); }
From source file:tkbautobooking.BookingSystem.java
/**
 * Parses {@code BookingPageHTML} and rebuilds {@code classMap} from the options of the
 * {@code class_selector} element.
 *
 * <p>Each option with a non-empty {@code value} attribute is stored as
 * value -> display text. Options with an empty value are skipped.
 *
 * @throws Exception if the page contains no element with id {@code class_selector}
 */
private void praseBookingPage() throws Exception {
    Document doc = Jsoup.parse(BookingPageHTML);
    Element class_selector = doc.getElementById("class_selector");

    if (class_selector == null) {
        throw new Exception("Prase Booking Page fail !");
    }

    classMap = new TreeMap<>();
    for (Element option : class_selector.getElementsByTag("option")) {
        String value = option.attr("value");
        if (value.isEmpty()) {
            continue;
        }
        // The previous replace("", " ") matched the empty string and so inserted a
        // space between every character of the label.
        // NOTE(review): the search literal appears to have lost a non-printing
        // character; a non-breaking space (U+00A0) is assumed here - confirm.
        classMap.put(value, option.text().replace('\u00a0', ' '));
    }
}
From source file:org.confab.PhpBB3Parser.java
public void createPost(Post replyTo, Post newPost, User user) { Utilities.debug("createPost"); try {/*from www.j a va2 s . co m*/ String reply_page = replyTo.rootURL() + "newreply.php?do=newreply&noquote=1&p=" + replyTo.id; HttpGet httpget = new HttpGet(reply_page); HttpResponse response = httpclient.execute(httpget, user.httpContext); HttpEntity entity = response.getEntity(); Document page = Jsoup.parse(EntityUtils.toString(entity)); EntityUtils.consume(entity); assert page != null; // TODO: need check to make sure we're on the right page. HttpEntity's // can just contain garbage and jsoup will still consume it // Make sure we're logged in before going any further Element username_box = page.select("input[name=vb_login_username]").first(); assert username_box == null; Element password_box = page.select("input[name=vb_login_password]").first(); assert password_box == null; // Construct POST HttpPost httppost = new HttpPost(replyTo.rootURL() + "newreply.php"); List<NameValuePair> nvps = new ArrayList<NameValuePair>(); // There is a title param but think it's optional.. 
//nvps.add(new BasicNameValuePair("title", ""); nvps.add(new BasicNameValuePair("message", newPost.message)); // Find the form - we can parse the rest of the needed elements from it Element reply_form = page.select("form[action*=newreply.php?do=postreply&t=]").first(); assert reply_form != null; String[] vals_array = { "s", "securitytoken", "do", "t", "p", "specifiedpost", "posthash", "poststarttime", "loggedinuser", "multiquoteempty" }; List<String> vals = Arrays.asList(vals_array); for (String val : vals) { Element el = reply_form.select("input[name=" + val + "]").first(); assert el != null : val; nvps.add(new BasicNameValuePair(val, el.attr("value"))); } httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8)); // Execute the POST Utilities.debug("Executing POST"); response = httpclient.execute(httppost, user.httpContext); Utilities.debug("POST response: " + response.getStatusLine()); assert response.getStatusLine().getStatusCode() == 302; } catch (IOException e) { System.out.println(e); } Utilities.debug("end createPost"); }
From source file:org.apache.karaf.cave.server.storage.CaveRepositoryImpl.java
/** * Proxy a HTTP URL locally./* ww w. j a v a 2s .c o m*/ * * @param url the HTTP URL to proxy. * @param filter regex filter. Only artifacts URL matching the filter will be considered. * @throws Exception in case of proxy failure. */ private void proxyHttp(String url, String filter) throws Exception { LOGGER.debug("Proxying HTTP URL {}", url); HttpClient httpClient = new DefaultHttpClient(); HttpGet httpGet = new HttpGet(url); HttpResponse response = httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); if (entity != null) { if (entity.getContentType().getValue().equals("application/java-archive") || entity.getContentType().getValue().equals("application/octet-stream")) { // I have a jar/binary, potentially a resource try { if ((filter == null) || (url.matches(filter))) { Resource resource = new DataModelHelperImpl().createResource(new URL(url)); if (resource != null) { obrRepository.addResource(resource); obrRepository.setLastModified(System.currentTimeMillis()); } } } catch (IllegalArgumentException e) { LOGGER.warn(e.getMessage()); } } else { // try to find link to "browse" try { Document document = Jsoup.connect(url).get(); Elements links = document.select("a"); if (links.size() > 1) { for (int i = 1; i < links.size(); i++) { Element link = links.get(i); String absoluteHref = link.attr("abs:href"); this.proxyHttp(absoluteHref, filter); } } } catch (UnsupportedMimeTypeException e) { // ignore } } } }
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Method to get the number of links (total, internal) * @param link_html the url to parse// www. ja v a2 s. c om * @return the number of links */ public int[] getnlinks(String link_html) { int[] nlinks = new int[2]; nlinks[0] = 0;//total number of links nlinks[1] = 0;//number of internal links try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); Elements links = doc.select("a[href]"); nlinks[0] = links.size(); //----we check if a link is internal or not (abs is used to get the whole link (abs stands for abs) for (Element link : links) { if (link.attr("abs:href").contains(link_html)) { nlinks[1]++; } } return nlinks; } catch (Exception ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); return nlinks; } }
From source file:crawler.AScraper.java
/**
 * Converts a link element into an {@link Artwork}.
 *
 * <p>The element text is matched against the {@code patter} field; the named groups
 * {@code year}, {@code month}, {@code day}, {@code id} and {@code model} supply the
 * artwork data and the element's {@code href} supplies the link. The date is parsed as
 * {@code yyyy-MM-dd} in the GMT+8 time zone.
 *
 * @param payload the element to convert
 * @return the artwork, or {@code null} (after logging the text as an error) when the
 *         text does not match
 * @throws ParseException        if the extracted date cannot be parsed
 * @throws MalformedURLException if the {@code href} value is not a valid URL
 */
@Transformer(inputChannel = "channel3", outputChannel = "channel4")
public Artwork convert(Element payload) throws ParseException, MalformedURLException {
    Matcher matcher = patter.matcher(payload.text());
    if (!matcher.find()) {
        LOG.error(payload.text());
        return null;
    }

    String year = matcher.group("year");
    String month = matcher.group("month");
    String day = matcher.group("day");
    int id = Integer.parseInt(matcher.group("id"));
    // Keep only the part of the model before the first whitespace or square bracket.
    String[] modelParts = matcher.group("model").split("[\\s\\[\\]]");
    String model = modelParts[0];
    URL link = new URL(payload.attr("href"));

    DateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    dayFormat.setTimeZone(TimeZone.getTimeZone("GMT+8"));
    String isoDay = String.format("%s-%s-%s", year, month, day);
    Date date = dayFormat.parse(isoDay);

    String thread_title = payload.text();
    return new Artwork(thread_title, id, -1, -1, null, link, null, model, date);
}