Usage examples for org.jsoup.nodes.Element.attr
public String attr(String attributeKey)
From source file:feedzilla.Feed.java
@Override public void run() { try {//from w w w . ja va 2 s.c o m Thread.sleep((new Random()).nextInt(60 * 1000)); } catch (InterruptedException ex) { Log.warn("Could not sleep Thread", ex); } Document doc = null; boolean get = true; int trysCount = 0; do { get = true; try { doc = Jsoup.connect(this.link).timeout(60 * 1000).userAgent( "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6") .referrer("http://www.google.com").get(); } catch (IOException ex) { Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex); Log.warn("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - Could not get Feed page from FeedZilla", ex); get = false; if (++trysCount > 5) { Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - " + "Five attempts and has not yet been possible to " + "retrieve the page from filezilla. Ignoring this news."); return; } } } while (!get); Elements elements = doc.body().select("iframe"); for (Element element : elements) { try { this.link = URLDecoder.decode(element.attr("src"), "UTF-8"); } catch (UnsupportedEncodingException ex) { Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex); Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - " + "Could not get the news link from FeedZilla pages"); return; } } this.link = getUrlInParams(this.link); try { this.news = (new NewsCrawler(this.link)).getNews(); } catch (Exception ex) { Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - " + "Could not retrieve news from link " + this.link, ex); return; } newsXMLFile.getParentFile().mkdirs(); try { FileUtils.writeStringToFile(newsXMLFile, this.toXML()); Log.info("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - Successfuly saved!"); System.out.println("News " + this.category + "/" + this.subcategory + "/" + 
newsXMLFile.getName() + " - Successfuly saved!"); } catch (IOException ex) { Log.error("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - Could not save news into file", ex); } }
From source file:com.megatome.j2d.support.JavadocSupport.java
private static List<SearchIndexValue> indexFile(File f) throws BuilderException { final List<SearchIndexValue> values = new ArrayList<>(); final Elements elements = loadAndFindLinks(f); for (final Element e : elements) { Element parent = e.parent(); if (!parent.child(0).equals(e)) { continue; }//from www. ja v a2 s . co m final String parentTagName = parent.tagName(); if (parentPattern.matcher(parentTagName).matches()) { parent = parent.parent(); if (!parent.child(0).equals(e.parent())) { continue; } } if (!containsIgnoreCase(parentTagName, "dt")) { continue; } final String text = parent.text(); final String name = e.text(); final String className = parent.className(); final MatchType type = getMatchingType(text, className); if (null == type) { System.err.println(String.format( "Unknown type found. Please submit a bug report. (Text: %s, Name: %s, className: %s)", text, name, className)); continue; } try { final String linkPath = URLDecoder.decode(e.attr("href"), "UTF-8"); values.add(new SearchIndexValue(name, type, linkPath)); } catch (UnsupportedEncodingException ex) { throw new BuilderException("Error decoding a link", ex); } } return values; }
From source file:org.commonjava.maven.galley.transport.htcli.internal.HttpListing.java
@Override public ListingResult call() { request = new HttpGet(url); // return null if something goes wrong, after setting the error. // What we should be doing here is trying to retrieve the html directory // listing, then parse out the filenames from that... ///* ww w.ja v a 2 s .c o m*/ // They'll be links, so that's something to key in on. // // I'm wondering about this: // http://jsoup.org/cookbook/extracting-data/selector-syntax // the dependency is: org.jsoup:jsoup:1.7.2 ListingResult result = null; InputStream in = null; String oldName = Thread.currentThread().getName(); try { String newName = oldName + ": LIST " + url; Thread.currentThread().setName(newName); if (executeHttp()) { in = response.getEntity().getContent(); String listing = IOUtils.toString(in); Logger logger = LoggerFactory.getLogger(getClass()); logger.debug("Got raw listing content:\n\n{}\n\n", listing); final ArrayList<String> al = new ArrayList<>(); // TODO: Charset!! Document doc = Jsoup.parse(listing, url); // try // { // } // catch ( final IOException e ) // { // this.error = // new TransferLocationException( resource.getLocation(), "Invalid HTML in: {}. Reason: {}", e, url, e.getMessage() ); // } if (doc != null) { for (final Element link : doc.select("a")) { String linkText = link.text(); String linkHref = link.attr("href"); URL url = new URL(this.url); boolean sameServer = isSameServer(url, linkHref); boolean subpath = isSubpath(url, linkHref); if ((sameServer && subpath) && (linkHref.endsWith(linkText) || linkHref.endsWith(linkText + '/')) && !EXCLUDES.contains(linkText)) { al.add(linkText); } } result = new ListingResult(resource, al.toArray(new String[al.size()])); } } } catch (final TransferException e) { this.error = e; } catch (final IOException e) { this.error = new TransferException("Failed to construct directory listing for: {}. 
Reason: {}", e, url, e.getMessage()); } finally { closeQuietly(in); cleanup(); if (oldName != null) { Thread.currentThread().setName(oldName); } } return error == null ? result : null; }
From source file:gov.medicaid.screening.dao.impl.PharmacyLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param firstOrBusinessName First or Business name. * @param lastName Last name./*from w w w . ja v a 2 s.c o m*/ * @param licenseNumber License number. * @return the search result for licenses * @throws URISyntaxException When an error occurs while building the URL. * @throws ClientProtocolException When client does not support protocol used. * @throws IOException When an error occurs while parsing response. * @throws ParseException When an error occurs while parsing response. */ private SearchResult<License> getAllResults(String firstOrBusinessName, String lastName, String licenseNumber) throws URISyntaxException, ClientProtocolException, IOException, ParseException { DefaultHttpClient client = new DefaultHttpClient(); client.setRedirectStrategy(new LaxRedirectStrategy()); String path = "/mnbop/GLSuiteWeb/Clients/MNBOPharm/Public/"; URIBuilder builder = new URIBuilder(getSearchURL()).setPath(path + "LicenseeSearch.aspx"); HttpGet httpget = new HttpGet(builder.build()); HttpEntity entity = client.execute(httpget).getEntity(); Document page = Jsoup.parse(EntityUtils.toString(entity)); HttpPost httppost = new HttpPost(builder.build()); List<NameValuePair> parameters = new ArrayList<NameValuePair>(); parameters.add(new BasicNameValuePair("__VIEWSTATE", page.select("#__VIEWSTATE").first().val())); parameters.add( new BasicNameValuePair("__VIEWSTATEENCRYPTED", page.select("#__VIEWSTATEENCRYPTED").first().val())); parameters .add(new BasicNameValuePair("__EVENTVALIDATION", page.select("#__EVENTVALIDATION").first().val())); parameters.add(new BasicNameValuePair("ObjectID", page.select("#ObjectID").first().val())); parameters.add(new BasicNameValuePair("ObjectTypeID", page.select("#ObjectTypeID").first().val())); parameters.add(new BasicNameValuePair("waFirstName", Util.defaultString(firstOrBusinessName))); parameters.add(new BasicNameValuePair("waLastName", Util.defaultString(lastName))); parameters.add(new 
BasicNameValuePair("waLicenseNumber", Util.defaultString(licenseNumber))); httppost.setEntity(new UrlEncodedFormEntity(parameters, Charset.forName("UTF-8"))); HttpResponse postResponse = client.execute(httppost); entity = postResponse.getEntity(); // licenses list List<License> licenseList = new ArrayList<License>(); if (entity != null) { page = Jsoup.parse(EntityUtils.toString(entity)); Elements trs = page.select("table#DataTable a"); if (trs != null) { for (Element element : trs) { String href = element.attr("href"); HttpGet detailsGet = new HttpGet(getSearchURL() + path + href); HttpResponse detailsResponse = client.execute(detailsGet); HttpEntity detailsEntity = detailsResponse.getEntity(); if (detailsEntity != null) { Document details = Jsoup.parse(EntityUtils.toString(detailsEntity)); licenseList.add(parseLicense(details)); } } } } SearchResult<License> result = new SearchResult<License>(); result.setItems(licenseList); return result; }
From source file:org.keycloak.testsuite.util.saml.LoginBuilder.java
/** * Prepares a GET/POST request for logging the given user into the given login page. The login page is expected * to have at least input fields with id "username" and "password". * * @param user/*from ww w . ja v a2s . c o m*/ * @param loginPage * @return */ private HttpUriRequest handleLoginPage(String loginPage, URI currentURI) { if (idpAlias != null) { org.jsoup.nodes.Document theLoginPage = Jsoup.parse(loginPage); Element zocialLink = theLoginPage.getElementById("zocial-" + this.idpAlias); assertThat("Unknown idp: " + this.idpAlias, zocialLink, Matchers.notNullValue()); final String link = zocialLink.attr("href"); assertThat("Invalid idp link: " + this.idpAlias, link, Matchers.notNullValue()); return new HttpGet(currentURI.resolve(link)); } return handleLoginPage(user, loginPage); }
From source file:eu.masconsult.bgbanking.banks.sgexpress.SGExpressClient.java
private RawBankAccount obtainBankAccountFromHtmlTableRow(String type, Element row) { if ("detail".equalsIgnoreCase(row.attr("class"))) { // detail row return null; }//from w ww . j av a2 s .c o m if ("bg0".equalsIgnoreCase(row.attr("class"))) { Log.v(TAG, "working row(" + type + "): " + row.html()); if ("Current Accounts".equalsIgnoreCase(type)) { return new RawBankAccount().setServerId(row.child(2).text()).setName(row.child(0).child(0).text()) .setIBAN(row.child(2).text()).setCurrency(row.child(1).text()) .setBalance(Convert.strToFloat(row.child(3).text())) .setAvailableBalance(Convert.strToFloat(row.child(4).text())); } else if ("Cards".equalsIgnoreCase(type)) { // skip cards for now return null; } else { // unknown type return null; } } else { return null; } }
From source file:nl.phanos.liteliveresultsclient.LoginHandler.java
public List<NameValuePair> getFormParams(String html, String username, String password) throws UnsupportedEncodingException { Document doc = Jsoup.parse(html); // Google form id Element loginform = doc.getElementById("primarycontent"); Elements inputElements = loginform.getElementsByTag("input"); List<NameValuePair> paramList = new ArrayList<NameValuePair>(); for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (key.equals("email")) { value = username;// w ww . j a va2 s . co m } else if (key.equals("password")) { value = password; } paramList.add(new BasicNameValuePair(key, value)); } return paramList; }
From source file:me.vertretungsplan.parser.UntisSubstitutionParser.java
@Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); String encoding = data.optString(PARAM_ENCODING, null); SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); int successfulSchedules = 0; HttpResponseException lastExceptionSchedule = null; for (String baseUrl : ParserUtils.handleUrlsWithDateFormat(urls)) { try {//from w w w . j a v a2 s . c o m Document doc = Jsoup.parse(this.httpGet(baseUrl, encoding)); Elements classes = doc.select("td a"); String lastChange = doc.select("td[align=right]:not(:has(b))").text(); int successfulClasses = 0; HttpResponseException lastExceptionClass = null; for (Element klasse : classes) { try { Document classDoc = Jsoup.parse( httpGet(baseUrl.substring(0, baseUrl.lastIndexOf("/")) + "/" + klasse.attr("href"), encoding)); parseSubstitutionTable(v, lastChange, classDoc); successfulClasses++; } catch (HttpResponseException e) { lastExceptionClass = e; } } if (successfulClasses == 0 && lastExceptionClass != null) { throw lastExceptionClass; } successfulSchedules++; } catch (HttpResponseException e) { lastExceptionSchedule = e; } } if (successfulSchedules == 0 && lastExceptionSchedule != null) { throw lastExceptionSchedule; } if (data.has(PARAM_WEBSITE)) { v.setWebsite(data.getString(PARAM_WEBSITE)); } else { v.setWebsite(urls.get(0)); } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }
From source file:net.acesinc.convergentui.ConvergentUIResponseFilter.java
@Override public Object run() { String origBody = contentManager.getDownstreamResponse(); if (origBody == null || origBody.isEmpty()) { return null; }/*from w w w . ja va 2 s . c o m*/ String composedBody = null; log.trace("Response from downstream server: " + origBody); Document doc = Jsoup.parse(origBody); if (hasReplaceableElements(doc)) { log.debug("We have replaceable elements. Let's get em!"); Elements elementsToUpdate = doc.select("div[data-loc]"); for (Element e : elementsToUpdate) { StringBuilder content = new StringBuilder(); String location = e.dataset().get("loc"); String fragmentName = e.dataset().get("fragment-name"); String cacheName = e.dataset().get("cache-name"); boolean useCaching = !Boolean.valueOf(e.dataset().get("disable-caching")); boolean failQuietly = Boolean.valueOf(e.dataset().get("fail-quietly")); URL url = null; try { url = new URL(location); String protocol = url.getProtocol(); String service = url.getHost(); log.debug("Fetching content at location [ " + location + " ] with cacheName = [ " + cacheName + " ]"); try { RequestContext context = RequestContext.getCurrentContext(); ContentResponse response = contentManager.getContentFromService(location, cacheName, useCaching, context); log.trace(response.toString()); if (!response.isError()) { Object resp = response.getContent(); if (String.class.isAssignableFrom(resp.getClass())) { String subContentResponse = (String) resp; //TODO You better trust the source of your downstream HTML! // String cleanedContent = Jsoup.clean(subContentResponse, Whitelist.basic()); //this totally stripped the html out... 
Document subDocument = Jsoup.parse(subContentResponse); if (fragmentName != null) { Elements fragments = subDocument .select("div[data-fragment-name=\"" + fragmentName + "\"]"); if (fragments != null && fragments.size() > 0) { if (fragments.size() == 1) { Element frag = fragments.first(); //need to see if there are images that we need to replace the urls on Elements images = frag.select("img"); for (Element i : images) { String src = i.attr("src"); if (src.startsWith("/") && !src.startsWith("//")) { i.attr("src", "/cui-req://" + protocol + "://" + service + src); } //else what do we do about relative urls? } content.append(frag.toString()); } else { for (Element frag : fragments) { content.append(frag.toString()).append("\n\n"); } } } else { log.debug("Found no matching fragments for [ " + fragmentName + " ]"); if (failQuietly) { content.append("<div class='cui-error'></div>"); } else { content.append( "<span class='cui-error'>Failed getting content from remote service. Possible reason in reponse below</span>"); content.append(subDocument.toString()); } } } else { //take the whole thing and cram it in there! content.append(subDocument.toString()); } } else { //not text... if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: content was not text</span>"); } else { content.append("<div class='cui-error'></div>"); } } } else { if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: " + response.getMessage() + "</span>"); } else { content.append("<div class='cui-error'></div>"); } } //now append it to the page if (!content.toString().isEmpty()) { e.html(content.toString()); } } catch (Throwable t) { if (!failQuietly) { e.html("<span class='cui-error'>Failed getting content from remote service. 
Reason: " + t.getMessage() + "</span>"); } log.warn("Failed replacing content", t); } } catch (MalformedURLException ex) { log.warn("location was invalid: [ " + location + " ]", ex); if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: data-loc was an invalid location.</span>"); } else { content.append("<div class='cui-error'></div>"); } } } composedBody = doc.toString(); } else { log.debug("Document has no replaeable elements. Skipping"); } try { addResponseHeaders(); if (composedBody != null && !composedBody.isEmpty()) { writeResponse(composedBody, getMimeType(RequestContext.getCurrentContext())); } else { writeResponse(origBody, getMimeType(RequestContext.getCurrentContext())); } } catch (Exception ex) { log.error("Error sending response", ex); } return null; }
From source file:me.vertretungsplan.parser.DSBMobileParser.java
/**
 * Loads the page at {@code url}, detects which schedule software produced it, parses it
 * into {@code v}, and then follows a {@code meta http-equiv=refresh} redirect, if any,
 * to the next page.
 *
 * <p>Detection order: Untis, DaVinci, Indiware, then the "no content provided"
 * placeholder page, which is silently ignored. The configured {@code PARAM_TYPE} can
 * force a format. Redirect targets are resolved against the directory of {@code url};
 * a target already present in {@code usedUrls} is not visited again.
 *
 * @param v schedule that receives the parsed data
 * @param url page to load
 * @param usedUrls URLs visited so far; {@code url} is added to it
 * @throws IOException if loading fails or a DaVinci page has differing numbers of
 *         headings and tables
 * @throws IncompatibleScheduleException if the page format is not recognised
 */
private void loadScheduleFromUrl(SubstitutionSchedule v, String url, List<String> usedUrls)
        throws IOException, JSONException, CredentialInvalidException, IncompatibleScheduleException {
    usedUrls.add(url);
    String html = httpGet(url, data.has(PARAM_ENCODING) ? data.optString(PARAM_ENCODING, null) : "UTF-8");
    Document doc = Jsoup.parse(html);

    // Lower-case with a fixed locale so detection does not depend on the default
    // locale (e.g. the Turkish dotless i).
    final String type = data.optString(PARAM_TYPE, "");
    final String lowerTitle = doc.title().toLowerCase(java.util.Locale.ROOT);
    final String lowerHtml = doc.html().toLowerCase(java.util.Locale.ROOT);

    if (lowerTitle.contains("untis") || lowerHtml.contains("untis") || type.equals("untis")) {
        parseMultipleMonitorDays(v, doc, data);
    } else if (lowerHtml.contains("created by davinci") || type.equals("davinci")) {
        Elements titles = doc.select("h2");
        Elements tables = doc.select("h2 + p + table");
        if (titles.size() != tables.size()) {
            // "Number of headings != number of tables"
            throw new IOException("Anzahl \u00dcberschriften != Anzahl Tabellen");
        }
        for (int i = 0; i < titles.size(); i++) {
            SubstitutionScheduleDay day = new SubstitutionScheduleDay();
            String date = titles.get(i).text();
            day.setDateString(date);
            day.setDate(ParserUtils.parseDate(date));
            DaVinciParser.parseDaVinciTable(tables.get(i), v, day, colorProvider);
            v.addDay(day);
        }
    } else if (doc.select(".tdaktionen").size() > 0 || type.equals("indiware")) {
        new IndiwareParser(scheduleData, cookieProvider).parseIndiwarePage(v, doc.html());
    } else if (doc.text().matches(".*F\u00fcr diesen Bereich.*wurde kein Inhalt bereitgestellt\\.")) {
        // Placeholder page ("no content has been provided for this area"): nothing to parse.
        return;
    } else {
        throw new IncompatibleScheduleException();
    }

    Element meta = doc.select("meta[http-equiv=refresh]").first();
    if (meta != null) {
        String content = meta.attr("content");

        // Find "url=" case-insensitively but keep the target's original case:
        // lower-casing the whole attribute would break case-sensitive paths.
        int urlPos = -1;
        for (int i = 0; i + 4 <= content.length(); i++) {
            if (content.regionMatches(true, i, "url=", 0, 4)) {
                urlPos = i;
                break;
            }
        }

        if (urlPos >= 0) {
            String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + content.substring(urlPos + 4);
            if (!usedUrls.contains(redirectUrl)) {
                loadScheduleFromUrl(v, redirectUrl, usedUrls);
            }
        }
    }
}