List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:org.apache.karaf.cave.server.storage.CaveRepositoryImpl.java
/** * Populate the Cave repository using the given URL. * * @param url the "source" HTTP URL.//from w w w.j a va 2s . c o m * @param filter regex filter. Only artifacts URL matching the filter will be considered. * @param update true if the OBR metadata should be updated, false else. * @throws Exception in case of populate failure. */ private void populateFromHttp(String url, String filter, boolean update) throws Exception { LOGGER.debug("Populating from HTTP URL {}", url); HttpClient httpClient = new DefaultHttpClient(); HttpGet httpGet = new HttpGet(url); HttpResponse response = httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); if (entity != null) { if (entity.getContentType().getValue().equals("application/java-archive") || entity.getContentType().getValue().equals("application/octet-stream")) { // I have a jar/binary, potentially a resource try { if ((filter == null) || (url.matches(filter))) { ResourceImpl resource = (ResourceImpl) new DataModelHelperImpl() .createResource(new URL(url)); if (resource != null) { LOGGER.debug("Copy {} into the Cave repository storage", url); int index = url.lastIndexOf("/"); if (index > 0) { url = url.substring(index); } File destination = new File(new File(this.getLocation()), url); FileOutputStream outputStream = new FileOutputStream(destination); entity.writeTo(outputStream); outputStream.flush(); outputStream.close(); if (update) { resource = (ResourceImpl) new DataModelHelperImpl() .createResource(destination.toURI().toURL()); LOGGER.debug("Update OBR metadata with {}", resource.getId()); this.addResource(resource); } } } } catch (IllegalArgumentException e) { LOGGER.warn(e.getMessage()); } } else { // try to find link to "browse" Document document = Jsoup.connect(url).get(); Elements links = document.select("a"); if (links.size() > 1) { for (int i = 1; i < links.size(); i++) { Element link = links.get(i); String absoluteHref = link.attr("abs:href"); this.populateFromHttp(absoluteHref, filter, update); 
} } } } }
From source file:org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java
/** * Parse the keywords out of the meta tag if one exists. Otherwise, return an * empty list./*from w ww. ja v a 2 s . c o m*/ * * @param doc The Document ot parse * @return The list of keywords. */ private List<String> getKeywordsFromDocument(Document doc) { List<String> keywords = new ArrayList<String>(); Elements keywordsElements = doc.select("meta[name=keywords]"); for (Element keywordsElement : keywordsElements) { for (String keyword : keywordsElement.attr("content").split(",")) { keywords.add(keyword.trim()); } } return keywords; }
From source file:org.confab.PhpBB3Parser.java
/** * Constructs and submits a POST with the appropriate parameters to login to a vbulletin. * @param rootURL Base or root URL for the site to log into * @param username User's login name/* w w w. ja v a 2 s . com*/ * @param password User's password * @return User object initialised with a HttpContext */ public User login(String rootURL, String username, String password) { Utilities.debug("login"); User ret = new User(username, password); CookieStore cookieStore = new BasicCookieStore(); HttpContext localContext = new BasicHttpContext(); localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); try { // set up the POST HttpPost httppost = new HttpPost(rootURL + "login.php"); List<NameValuePair> nvps = new ArrayList<NameValuePair>(); nvps.add(new BasicNameValuePair("do", "login")); nvps.add(new BasicNameValuePair("vb_login_username", username)); nvps.add(new BasicNameValuePair("vb_login_password", "")); nvps.add(new BasicNameValuePair("s", "")); nvps.add(new BasicNameValuePair("securitytoken", "guest")); nvps.add(new BasicNameValuePair("do", "login")); nvps.add(new BasicNameValuePair("vb_login_md5password", Utilities.md5(password))); nvps.add(new BasicNameValuePair("vb_login_md5password_utf", Utilities.md5(password))); httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8)); // execute the POST Utilities.debug("Executing POST"); HttpResponse response = httpclient.execute(httppost, localContext); Utilities.debug("POST response: " + response.getStatusLine()); assert response.getStatusLine().getStatusCode() == 200; //TODO: store the cookies //http://bit.ly/e7yY5i (CookieStore javadoc) Utilities.printCookieStore(cookieStore); // confirm we are logged in HttpGet httpget = new HttpGet(rootURL); response = httpclient.execute(httpget, localContext); HttpEntity entity = response.getEntity(); Document page = Jsoup.parse(EntityUtils.toString(entity)); EntityUtils.consume(entity); assert page != null; Utilities.debug("Checking that we are logged in.."); Element 
username_box = page.select("input[name=vb_login_username]").first(); assert username_box == null; Element password_box = page.select("input[name=vb_login_password]").first(); assert password_box == null; // parse the user's new securitytoken Element el_security_token = page.select("input[name=securitytoken]").first(); assert el_security_token != null; String security_token = el_security_token.attr("value"); assert security_token != null; String[] token_array = security_token.split("-"); assert token_array.length == 2; ret.vb_security_token = token_array[1]; assert ret.vb_security_token.length() == 40; Utilities.debug("securitytoken: " + ret.vb_security_token); Utilities.debug("Login seems ok"); ret.httpContext = localContext; } catch (IOException e) { System.out.println(e); } Utilities.debug("end login"); return ret; }
From source file:me.vertretungsplan.parser.UntisInfoParser.java
@Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); Document navbarDoc = Jsoup.parse(getNavbarDoc().replace(" ", "")); Element select = navbarDoc.select("select[name=week]").first(); SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); String info = navbarDoc.select(".description").text(); String lastChange;/*from w w w .ja v a 2 s . co m*/ try { lastChange = info.substring(info.indexOf("Stand:") + "Stand:".length()).trim(); } catch (Exception e) { try { String infoHtml = httpGet(baseUrl + "/frames/title.htm", data.optString(PARAM_ENCODING, null)); Document infoDoc = Jsoup.parse(infoHtml); String info2 = infoDoc.select(".description").text(); lastChange = info2.substring(info2.indexOf("Stand:") + "Stand:".length()).trim(); } catch (Exception e1) { lastChange = ""; } } int successfulWeeks = 0; HttpResponseException lastException = null; for (Element option : select.children()) { String week = option.attr("value"); String weekName = option.text(); if (data.optBoolean(PARAM_SINGLE_CLASSES, data.optBoolean("single_classes", false)) // backwards compatibility || data.optString(PARAM_SCHEDULE_TYPE, "substitution").equals("timetable")) { int classNumber = 1; for (String klasse : getAllClasses()) { String url = getScheduleUrl(week, classNumber, data); try { parsePage(v, lastChange, klasse, url, weekName); } catch (HttpResponseException e) { if (e.getStatusCode() == 500) { // occurs in Hannover_MMBS classNumber++; continue; } else { throw e; } } classNumber++; } successfulWeeks++; } else { String url = getScheduleUrl(week, 0, data); try { parsePage(v, lastChange, null, url, weekName); successfulWeeks++; } catch (HttpResponseException e) { lastException = e; } } } if (successfulWeeks == 0 && lastException != null) { throw lastException; } v.setClasses(getAllClasses()); 
v.setTeachers(getAllTeachers()); v.setWebsite(baseUrl + "/default.htm"); return v; }
From source file:net.kevxu.purdueassist.course.CatalogDetail.java
/**
 * Builds a catalog entry for the current {@code subject}/{@code cnbr} by slicing the
 * text of the course-detail table found in the given document.
 *
 * <p>Fields are extracted from the flattened text by searching for section labels
 * ("Levels:", "Schedule Types:", "Offered By:", ...) and cutting fixed character
 * offsets around them. A {@link StringIndexOutOfBoundsException} raised by any of
 * these cuts is swallowed, so the returned entry then contains only the fields that
 * were set before the failing cut.
 *
 * @param document the parsed course-detail page
 * @return the (possibly partially filled) catalog entry
 * @throws HtmlParseException if a schedule-type link text does not map to a {@code Type}
 * @throws CourseNotFoundException if the page contains no course-detail table
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
private CatalogDetailEntry parseDocument(Document document)
        throws HtmlParseException, CourseNotFoundException, IOException {
    CatalogDetailEntry entry = new CatalogDetailEntry(subject, cnbr);
    // The detail table is identified by its fixed "summary" attribute text.
    Elements tableElements = document.getElementsByAttributeValue("summary",
            "This table lists the course detail for the selected term.");
    if (tableElements.isEmpty() != true) {
        // get name
        try {
            Element body = tableElements.first().select("tbody").first();
            String nameBlock = body.select("tr td.nttitle").first().text();
            // The title cell is split on "<subject> <cnbr>"; the last piece minus its
            // first 3 characters is the name (presumably a " - " separator - TODO confirm).
            String[] temp = nameBlock.split(subject.name() + " " + String.valueOf(cnbr));
            String name = temp[temp.length - 1].substring(3);
            entry.setName(name);

            // get description
            body = body.select(".ntdefault").first();
            String text = body.text();
            int split = text.indexOf("Levels:");
            String description = text.substring(0, split);
            // NOTE(review): drops a fixed 20-character prefix; what that prefix is
            // cannot be told from here.
            description = description.substring(20);
            entry.setDescription(description);

            // get levels: the text between "Levels:" and "Schedule Types:", split on
            // spaces and commas with empty pieces discarded
            int begin = split;
            int end = text.indexOf("Schedule Types:");
            String levels = text.substring(begin + 8, end);
            temp = levels.split("[ ,]");
            List<String> lvs = new ArrayList<String>();
            for (String s : temp)
                if (!s.equals("")) {
                    lvs.add(s);
                }
            entry.setLevels(lvs);

            // get type and prerequisites: classified by what each link's href contains
            List<Type> types = new ArrayList<Type>();
            List<String> preq = new ArrayList<String>();
            Elements parsing_A = body.select("a");
            for (Element e : parsing_A) {
                if (e.attr("href").contains("schd_in") && !(e.attr("href").contains("%"))) {
                    try {
                        // Link text with spaces removed must match a Type constant name.
                        types.add(Type.valueOf(e.text().replace(" ", "")));
                    } catch (Exception exception) {
                        throw new HtmlParseException();
                    }
                } else if (e.attr("href").contains("sel_attr=")) {
                    preq.add(e.text());
                }
            }
            // Type and prerequisites are only set when at least one was found.
            if (types.size() > 0)
                entry.setType(types);
            if (preq.size() > 0)
                entry.setPrerequisites(preq);

            // get offered by: ends at "Department:" or, failing that, "Course Attributes:"
            begin = text.indexOf("Offered By:");
            end = text.indexOf("Department:");
            if (end < 0)
                end = text.indexOf("Course Attributes:");
            if (end > 0) {
                entry.setOfferedBy(text.substring(begin + 12, end - 1));
            }

            // get department
            begin = text.indexOf("Department:");
            if (begin > 0) {
                // NOTE(review): if "Course Attributes:" is absent, end is -1 and the
                // substring below throws; the outer catch then skips campuses and
                // restrictions as well.
                end = text.indexOf("Course Attributes:");
                entry.setDepartment((text.substring(begin + 12, end - 1)));
            }

            // get campus: the section ends at the first of the following labels that
            // is present, tried in this order
            begin = text.indexOf("May be offered at any of the following campuses:");
            String campuses;
            end = text.indexOf("Repeatable for Additional Credit:");
            if (end < 0)
                end = text.indexOf("Learning Objectives:");
            if (end < 0)
                end = text.indexOf("Restrictions:");
            if (end < 0)
                end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            if (end < 0) {
                campuses = text
                        .substring(begin + "May be offered at any of the following campuses:".length() + 5);
            } else {
                campuses = text.substring(
                        begin + "May be offered at any of the following campuses:".length() + 5, end - 1);
            }
            // NOTE(review): as written this splits on every single space, so multi-word
            // campus names are broken apart - confirm the separator literal is intended.
            temp = campuses.replace(" ", "#").split("#");
            List<String> camps = new ArrayList<String>();
            for (String s : temp) {
                // Pieces of length 0 or 1 are dropped.
                if (s.length() > 1) {
                    camps.add(s);
                }
            }
            entry.setCampuses(camps);

            // get restrictions: runs to "Corequisites:"/"Prerequisites:" if present,
            // otherwise to the end of the text
            begin = text.indexOf("Restrictions:");
            end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            if (begin > 0 && end < 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length()).replace(" ", "\n"));
            } else if (begin > 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length(), end).replace(" ", "\n"));
            }
        } catch (StringIndexOutOfBoundsException e) {
            // no type, not available
            // Deliberately ignored: a failed cut means the page lacks that section; the
            // entry is returned with whatever fields were set so far.
        }
    } else {
        throw new CourseNotFoundException();
    }
    return entry;
}
From source file:com.serphacker.serposcope.scraper.google.scraper.GoogleScraper.java
protected String extractLink(Element element) { if (element == null) { return null; }//from w w w.j a v a 2s . co m String attr = element.attr("href"); if (attr == null) { return null; } if ((attr.startsWith("http://www.google") || attr.startsWith("https://www.google"))) { if (attr.contains("/aclk?")) { return null; } } if (attr.startsWith("http://") || attr.startsWith("https://")) { return attr; } if (attr.startsWith("/url?")) { try { List<NameValuePair> parse = URLEncodedUtils.parse(attr.substring(5), Charset.forName("utf-8")); Map<String, String> map = parse.stream() .collect(Collectors.toMap(NameValuePair::getName, NameValuePair::getValue)); return map.get("q"); } catch (Exception ex) { return null; } } return null; }
From source file:com.liato.bankdroid.banking.banks.coop.Coop.java
/**
 * Refreshes all accounts: logs in, scrapes the balance page of every account type and
 * then queries the refund summary service.
 *
 * <p>For each account type the transaction paging parameters (page GUID, min/max date)
 * are recorded, and every name/value pair of the "Saldo" list becomes an account. The
 * account whose name contains "disponibelt" provides the bank-level balance and
 * currency; accounts after the first of a type are marked as aliases of the first.
 *
 * @throws BankException on I/O or parsing failures, or when no account was found
 * @throws LoginException when username or password is missing
 * @throws BankChoiceException propagated from the superclass/login
 */
@Override
public void update() throws BankException, LoginException, BankChoiceException {
    super.update();
    if (username == null || password == null || username.length() == 0 || password.length() == 0) {
        throw new LoginException(res.getText(R.string.invalid_username_password).toString());
    }
    login();

    try {
        for (AccountType at : AccountType.values()) {
            response = urlopen.open(at.getUrl());
            Document d = Jsoup.parse(response);

            // Remember what is needed later to page through this type's transactions.
            Elements historik = d.select("#historik section");
            TransactionParams params = new TransactionParams();
            mTransactionParams.put(at, params);
            if (historik != null && !historik.isEmpty()) {
                String data = historik.first().attr("data-controller");
                Matcher m = rePageGuid.matcher(data);
                if (m.find()) {
                    params.setPageGuid(m.group(1));
                }
            }
            Element date = d.getElementById("dateFrom");
            if (date != null) {
                params.setMinDate(date.hasAttr("min") ? date.attr("min") : null);
                params.setMaxDate(date.hasAttr("max") ? date.attr("max") : null);
            }

            // The balance list pairs <dt> names with <dd> values by position.
            Elements es = d.select(".List:contains(Saldo)");
            if (es != null && !es.isEmpty()) {
                List<String> names = new ArrayList<String>();
                List<String> values = new ArrayList<String>();
                for (Element e : es.first().select("dt")) {
                    names.add(e.text().replaceAll(":", "").trim());
                }
                for (Element e : es.first().select("dd")) {
                    values.add(e.text().trim());
                }
                for (int i = 0; i < Math.min(names.size(), values.size()); i++) {
                    Account a = new Account(names.get(i), Helpers.parseBalance(values.get(i)),
                            String.format("%s%d", at.getPrefix(), i));
                    a.setCurrency(Helpers.parseCurrency(values.get(i), "SEK"));
                    if (a.getName().toLowerCase().contains("disponibelt")) {
                        a.setType(Account.REGULAR);
                        balance = a.getBalance();
                        setCurrency(a.getCurrency());
                    } else {
                        a.setType(Account.OTHER);
                    }
                    if (i > 0) {
                        a.setAliasfor(String.format("%s%d", at.getPrefix(), 0));
                    }
                    accounts.add(a);
                }
            }
        }
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so one handler covers both.
        e.printStackTrace();
        throw new BankException(e.getMessage());
    }

    try {
        RefundSummaryRequest refsumReq = new RefundSummaryRequest(mUserId, mToken, APPLICATION_ID);
        HttpEntity requestBody = new StringEntity(getObjectmapper().writeValueAsString(refsumReq));
        InputStream is = urlopen
                .openStream("https://www.coop.se/ExternalServices/RefundService.svc/RefundSummary", requestBody, true);
        RefundSummaryResponse refsumResp;
        try {
            refsumResp = readJsonValue(is, RefundSummaryResponse.class);
        } finally {
            // Release the response stream even when JSON parsing fails.
            if (is != null) {
                is.close();
            }
        }
        if (refsumResp != null && refsumResp.getRefundSummaryResult() != null) {
            // "Återbäring på ditt kort" - escapes keep the name intact regardless of
            // the source file encoding.
            Account a = new Account("\u00c5terb\u00e4ring p\u00e5 ditt kort",
                    BigDecimal.valueOf(refsumResp.getRefundSummaryResult().getAccountBalance()), "refsummary");
            a.setCurrency("SEK");
            if (accounts.isEmpty()) {
                // No regular account was scraped: the refund balance becomes the main balance.
                balance = a.getBalance();
                setCurrency(a.getCurrency());
            }
            accounts.add(a);
            // "Återbäring för <month>"
            a = new Account(
                    String.format("\u00c5terb\u00e4ring f\u00f6r %s", refsumResp.getRefundSummaryResult().getMonthName()),
                    BigDecimal.valueOf(refsumResp.getRefundSummaryResult().getTotalRefund()), "refsummary_month");
            accounts.add(a);
        }
    } catch (IOException e) {
        // JsonParseException and ClientProtocolException are both IOExceptions.
        e.printStackTrace();
        throw new BankException(e.getMessage());
    }

    if (accounts.isEmpty()) {
        throw new BankException(res.getText(R.string.no_accounts_found).toString());
    }
    super.updateComplete();
}
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Parse the url and get all the content * @param link_html the url to parse/* w w w. j a va2 s. com*/ * @return The content parsed */ public String cleanhtml(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) { link_html = link_html.substring(0, link_html.length() - 1); } if (link_html.substring(0, 5).equalsIgnoreCase("https")) { link_html = link_html.substring(8); } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) { link_html = link_html.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element link : links) { String str_check = link.attr("abs:href").toString(); if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) { anchortext = anchortext + link.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { 
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java
/**
 * Collects the name/value pair of every {@code input} element inside
 * {@code div#actionsView}, in document order.
 *
 * @param doc the parsed page
 * @return one pair per input; empty when the page has no such inputs
 */
private List<BasicNameValuePair> getActionsViewButtonInfo(Document doc) {
    List<BasicNameValuePair> buttons = new ArrayList<BasicNameValuePair>(0);
    for (Element input : doc.select("div#actionsView input")) {
        String name = input.attr("name");
        String value = input.attr("value");
        buttons.add(new BasicNameValuePair(name, value));
    }
    return buttons;
}
From source file:de.geeksfactory.opacclient.apis.WebOpacNet.java
@Override public List<SearchField> getSearchFields() throws IOException, JSONException { List<SearchField> fields = new ArrayList<>(); // Text fields String html = httpGet(opac_url + "/de/mobile/default.aspx", getDefaultEncoding()); Document doc = Jsoup.parse(html); Elements options = doc.select("#drpOptSearchT option"); for (Element option : options) { TextSearchField field = new TextSearchField(); field.setDisplayName(option.text()); field.setId(option.attr("value")); field.setData(new JSONObject("{\"filter\":false}")); field.setHint(""); fields.add(field);/*from w w w . j ava 2 s .c om*/ } // Dropdowns String text = httpGet(opac_url + "/de/mobile/GetRestrictions.ashx", getDefaultEncoding()); JSONArray filters = new JSONObject(text).getJSONArray("restrcontainers"); for (int i = 0; i < filters.length(); i++) { JSONObject filter = filters.getJSONObject(i); if (filter.getString("querytyp").equals("EJ")) { // Querying by year also works for other years than the ones // listed // -> Make it a text field instead of a dropdown TextSearchField field = new TextSearchField(); field.setDisplayName(filter.getString("kopf")); field.setId(filter.getString("querytyp")); field.setData(new JSONObject("{\"filter\":true}")); field.setHint(""); fields.add(field); } else { DropdownSearchField field = new DropdownSearchField(); field.setId(filter.getString("querytyp")); field.setDisplayName(filter.getString("kopf")); JSONArray restrictions = filter.getJSONArray("restrictions"); field.addDropdownValue("", "Alle"); for (int j = 0; j < restrictions.length(); j++) { JSONObject restriction = restrictions.getJSONObject(j); field.addDropdownValue(restriction.getString("id"), restriction.getString("bez")); } field.setData(new JSONObject("{\"filter\":true}")); fields.add(field); } } return fields; }