List of usage examples for org.jsoup.nodes.Element.attr
public String attr(String attributeKey)
From source file:com.dajodi.scandic.JSoupScraper.java
@Override public Map<String, String> scrapeFormInputFields(InputStream inStream) { try {//from w w w .ja v a2s . c om Document doc = Jsoup.parse(inStream, HTTP.UTF_8, ""); Element form = doc.body().getElementById("aspnetForm"); Elements inputNodes = form.getElementsByTag("input"); Map<String, String> inputMap = new HashMap<String, String>(); for (Element element : inputNodes) { String name = element.attr("name"); String value = element.attr("value"); if (name != null) { inputMap.put(name, value == null ? "" : value); } else { //TODO: remove me Log.d("Something weird"); } } doc.empty(); return inputMap; } catch (Exception e) { throw new ScandicHtmlException(e); } }
From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java
public Article extractArticle(String html) throws ParseException, IOException { Article result = new Article(); Document doc = Jsoup.parse(html, getBaseName()); Element element;// www . j a v a2s . com try { element = doc.select("article.rfd").iterator().next(); } catch (NoSuchElementException exception) { throw new IOException("Cannot find article.rfd element"); } // System.out.println(element); String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", ""); // time try { DateFormat df = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } catch (ParseException e) { // June 24, 2015 DateFormat df = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } // title result.setTitle(TextCleaningUtils.normalizeWithParagraphs(element.select("h1").text())); // text StringBuilder sb = new StringBuilder(); for (Element p : element.select("div.nytint-post > p")) { sb.append(p.text()); sb.append("\n"); } result.setText(TextCleaningUtils.normalizeWithParagraphs(sb.toString())); // debate title result.setDebateTitle(TextCleaningUtils .normalizeWithParagraphs(doc.select("div.nytint-discussion-overview > h2").text())); // debate url result.setDebateUrl(doc.select("div.nytint-discussion-overview > h2 > a").iterator().next().attr("href")); // document url result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content")); // debate description result.setDebateDescription(TextCleaningUtils .normalizeWithParagraphs(((TextNode) doc.select("div.nytint-discussion-overview > p").iterator() .next().childNodes().iterator().next()).text())); // aurhor result.setAuthor(element.select("div.nytint-mugshots > img").iterator().next().attr("alt")); // topics for (Element a : element.select("p.nytint-tags > a")) { result.getTopics().add(a.attr("href")); } return result; }
From source file:org.wallride.web.support.Posts.java
protected String parse(String html) { Document document = Jsoup.parse(html); Elements elements = document.select("img"); for (Element element : elements) { String src = element.attr("src"); if (src.startsWith(wallRideProperties.getMediaUrlPrefix())) { String style = element.attr("style"); Pattern pattern = Pattern.compile("width: ([0-9]+)px;"); Matcher matcher = pattern.matcher(element.attr("style")); if (matcher.find()) { String replaced = src + "?w=" + Integer.parseInt(matcher.group(1)) * 2; element.attr("src", replaced); }//ww w. ja v a 2 s. c o m } } return document.body().html(); }
From source file:web.analyzer.utils.Utils.java
public LinkResult getLinks(Document doc, String hostName) throws IOException { List<Link> linksInfo = new ArrayList<Link>(); int totalInternalLink = 0; int totalExternalLink = 0; Elements links = doc.select("a[href]"); for (Element link : links) { String href = link.attr("abs:href"); if (isValidUrl(href)) { URL url = new URL(href); String linkHostName = url.getHost(); String linkType = ""; if (linkHostName.equalsIgnoreCase(hostName)) { linkType = "internal"; totalInternalLink++;//from w ww. j a v a 2 s .c o m } else { linkType = "external"; totalExternalLink++; } linksInfo.add(new Link(href, linkType)); } } return new LinkResult(linksInfo, totalInternalLink, totalExternalLink); }
From source file:com.liato.bankdroid.banking.banks.Bioklubben.java
@Override protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException { urlopen = new Urllib(context, CertificateReader.getCertificates(context, R.raw.cert_bioklubben)); urlopen.setAllowCircularRedirects(true); response = urlopen.open("http://bioklubben.sf.se/Start.aspx"); Document d = Jsoup.parse(response); Element e = d.getElementById("__VIEWSTATE"); if (e == null || e.attr("value") == null) { throw new BankException(res.getText(R.string.unable_to_find).toString() + " ViewState."); }//from w ww . jav a 2s . c om String viewState = e.attr("value"); e = d.getElementById("__EVENTVALIDATION"); if (e == null || e.attr("value") == null) { throw new BankException(res.getText(R.string.unable_to_find).toString() + " EventValidation."); } String eventValidation = e.attr("value"); List<NameValuePair> postData = new ArrayList<NameValuePair>(); postData.add( new BasicNameValuePair("__EVENTTARGET", "ctl00$ContentPlaceHolder1$LoginUserControl$LogonButton")); postData.add(new BasicNameValuePair("__EVENTARGUMENT", "")); postData.add(new BasicNameValuePair("__VIEWSTATE", viewState)); postData.add(new BasicNameValuePair("__EVENTVALIDATION", eventValidation)); postData.add(new BasicNameValuePair("ctl00_toolkitscriptmanager_HiddenField", "")); postData.add(new BasicNameValuePair("ctl00$toolkitscriptmanager", "ctl00$UpdatePanel|ctl00$ContentPlaceHolder1$LoginUserControl$LogonButton")); postData.add( new BasicNameValuePair("ctl00$ContentPlaceHolder1$LoginUserControl$LoginNameTextBox", username)); postData.add( new BasicNameValuePair("ctl00$ContentPlaceHolder1$LoginUserControl$PasswordTextBox", password)); return new LoginPackage(urlopen, postData, response, "http://bioklubben.sf.se/Start.aspx"); }
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/**
 * Removes from the tree every element matched by {@code *[gravityScore]} on
 * the top node whose {@code gravityScore} attribute parses to an integer
 * below 1.
 */
private void removeNodesWithNegativeScores() {
    for (Element scored : this.topNode.select("*[gravityScore]")) {
        boolean keep = Integer.parseInt(scored.attr("gravityScore")) >= 1;
        if (keep) {
            continue;
        }
        scored.remove();
    }
}
From source file:com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl.java
/** * Runs around all the internal links and pulls out all the URLs * @param doc/* w w w . j a v a 2s .c o m*/ * @param baseUri */ private void setAnchors(Document doc, String baseUri) { Elements links = doc.select("a[href]"); for (Element link : links) { if (baseUri != null) link.setBaseUri(baseUri); String newLink = link.attr("abs:href"); if (newLink.indexOf("#") != -1) newLink = newLink.substring(0, newLink.indexOf("#")); anchors.add(newLink); } }
From source file:cn.cuizuoli.appranking.service.GooglePlayService.java
/**
 * Loads the ranking list for a feed and category from the
 * {@code .card-list>.card} elements of the fetched page.
 *
 * <p>Any exception raised while fetching or scraping is logged, and whatever
 * entries were collected up to that point are returned.
 *
 * @param feedType feed to load; a feed whose media type is not
 *                 {@code MediaType.GOOGLE} yields an empty list
 * @param category {@code Category.ALL} uses {@code getHotUrl}, any other
 *                 category uses {@code getUrl}
 * @return ranking entries in page order, ranked from 1
 */
public List<AppRanking> getAppRankingList(FeedType feedType, Category category) {
    List<AppRanking> rankings = new ArrayList<AppRanking>();
    try {
        if (feedType.getMediaType() != MediaType.GOOGLE) {
            return rankings;
        }

        String url = (category == Category.ALL) ? getHotUrl(feedType) : getUrl(feedType, category);
        log.info("Google Play -> " + url);

        if (StringUtils.isNotBlank(url)) {
            Document doc = appRankingRestTemplate.getForObject(url, Document.class);
            int rank = 1;
            for (Element card : doc.select(".card-list>.card")) {
                Elements titleNode = card.select(".details .title");
                Elements coverNode = card.select(".cover .cover-image");

                String appId = card.attr("data-docid");
                String name = titleNode.attr("title");
                String uri = titleNode.attr("href");
                String artist = card.select(".details .subtitle").attr("title");
                String price = card.select(".details button.price.buy>span").text();
                String smallCover = coverNode.attr("data-cover-small");
                String largeCover = coverNode.attr("data-cover-large");

                AppRanking entry = new AppRanking();
                entry.setAppId(appId);
                entry.setDeviceType(DeviceType.ANDROID);
                entry.setCountry(Country.JAPAN);
                entry.setMediaType(MediaType.GOOGLE);
                entry.setFeedType(feedType);
                entry.setRanking(rank);
                entry.setTitle(name + " - " + artist);
                entry.setCategory(category.getCode());
                entry.setUri(GOOGLE_PLAY_DOMAIN + uri);
                entry.setName(name);
                entry.setArtist(artist);
                entry.setPrice(price);
                // The small cover fills both smaller image slots.
                entry.setImage53(smallCover);
                entry.setImage75(smallCover);
                entry.setImage100(largeCover);
                rankings.add(entry);

                rank++;
            }
        }
    } catch (Exception e) {
        // HTTP status errors and all other failures are logged the same way.
        log.error(ExceptionUtils.getFullStackTrace(e));
    }
    return rankings;
}
From source file:blackman.matt.board.Post.java
/** * Formats the HTML on the post text to accurately display it on the post. * * @param post The unformatted text of the post. * @return A formatted version of the post. *///w w w . j a v a2s .co m private String formatPostBody(String post) { Document formattedText = Jsoup.parse(post); Pattern p = Pattern.compile("^/.*/index\\.html"); // Red Text Elements redTexts = formattedText.getElementsByClass("heading"); for (Element text : redTexts) { text.wrap("<font color=\"#AF0A0F\"><strong></strong></font>"); } // Green text Elements greenTexts = formattedText.getElementsByClass("quote"); for (Element text : greenTexts) { text.wrap("<font color=\"#789922\"></font>"); } // Board Links Elements boardLinks = formattedText.select("a"); for (Element link : boardLinks) { String url = link.attr("href"); Matcher m = p.matcher(url); if (m.matches()) { link.attr("href", "http://8chan.co" + url); } } // Reply links Elements replyLinks = formattedText.select("a[onclick^=highlightReply"); for (Element reply : replyLinks) { repliedTo.add(reply.attr("href").split("#")[1]); boardLinks.attr("href", "http://8chan.co" + reply.attr("href")); } // Post too long text removal Elements tooLongs = formattedText.getElementsByClass("toolong"); for (Element text : tooLongs) { text.text(""); } return formattedText.toString(); }
From source file:jobhunter.dice.Client.java
/**
 * Downloads the page at {@code url} and builds a {@link Job} from its
 * {@code <meta>} tags.
 *
 * <p>The {@code twitter:text:*} metas supply the position, company name,
 * address and salary. The HTML-unescaped contents of the
 * {@code job_description_web} and {@code skills} metas are concatenated, in
 * document order, into the description. Progress is reported through
 * {@code update} in three steps.
 *
 * @return the populated job
 * @throws IOException if the page cannot be fetched
 * @throws URISyntaxException declared by the signature; not raised directly here
 */
public Job execute() throws IOException, URISyntaxException {
    l.debug("Connecting to {}", url);
    update("Connecting", 1L);
    final Document doc = Jsoup.connect(url).get();

    update("Parsing HTML", 2L);
    final Job job = Job.of();
    job.setPortal(DicePlugin.portal);
    job.setLink(url);

    final StringBuilder description = new StringBuilder();
    for (Element meta : doc.getElementsByTag("meta")) {
        l.debug("Checking {}", meta.toString());

        final String name = meta.attr("name");
        final String content = meta.attr("content");

        if ("twitter:text:job_title".equals(name)) {
            job.setPosition(content);
        } else if ("twitter:text:company".equals(name)) {
            job.getCompany().setName(content);
        } else if ("twitter:text:city".equals(name)) {
            job.setAddress(content);
        } else if ("twitter:text:salary".equals(name)) {
            job.setSalary(content);
        } else if ("twitter:text:job_description_web".equals(name)
                || "twitter:text:skills".equals(name)) {
            description.append(StringEscapeUtils.unescapeHtml4(content));
        }
    }

    job.setDescription(description.toString());
    update("Done", 3L);
    return job;
}