List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:com.msds.km.service.Impl.YunmaiAPIDrivingLicenseRecognitionServcieiImpl.java
/** * html???//from w w w .j a v a 2 s .com * @param html ??xml?java * @return */ protected DrivingLicense parseDrivingLicense(String html) { if (html.isEmpty()) { throw new RecognitionException("the html content is empty"); } Document document = Jsoup.parse(html); if (document == null) { throw new RecognitionException( "the document prased from html content is null, please check the website"); } Elements fieldsets = document.select("div[class=left result] fieldset"); if (fieldsets.size() != 1) { throw new RecognitionException( "the document should has result filedset, the content of the web page may be changed."); } Element regResult = fieldsets.first(); String result = regResult.html().trim(); // String removedStr = "<legend></legend>"; if (result.startsWith(removedStr)) { result = result.substring(removedStr.length()); } //??xml result = StringEscapeUtils.unescapeXml(result); // result = "<drivingLicense>" + result + "</drivingLicense>"; return (DrivingLicense) stream.fromXML(result); }
From source file:net.liuxuan.Tools.signup.SignupQjvpn.java
public void getLoginForm() throws IOException { HttpGet httpget = new HttpGet("http://www.qjvpn.com/user/login.php"); CloseableHttpResponse response1 = httpclient.execute(httpget); try {/* w w w.j a v a 2 s . com*/ HttpEntity entity = response1.getEntity(); //?once String content = EntityUtils.toString(entity); // System.out.println(content); System.out.println("--------------"); System.out.println("--------------"); Document doc = Jsoup.parse(content); // Elements inputs = doc.select("input[type=text]"); Elements inputs = doc.select("input[type=hidden]"); for (int i = 0; i < inputs.size(); i++) { Element element = inputs.get(i); params.add(new BasicNameValuePair(element.attr("name"), element.attr("value"))); // params.put(element.attr("name"), element.attr("value")); System.out.println(element.toString()); System.out.println(element.attr("name")); System.out.println(element.attr("value")); } System.out.println("--------------"); System.out.println("--------------"); System.out.println("--------------"); System.out.println("--------------"); System.out.println("Login form get: " + response1.getStatusLine()); EntityUtils.consume(entity); System.out.println("Initial set of cookies:"); List<Cookie> cookies = cookieStore.getCookies(); if (cookies.isEmpty()) { System.out.println("None"); } else { for (int i = 0; i < cookies.size(); i++) { System.out.println("- " + cookies.get(i).toString()); } } } finally { response1.close(); } // HttpUriRequest login = RequestBuilder.post() // .setUri(new URI("http://v2ex.com/signin")) // .addParameter("u", "mosliu") // .addParameter("p", "mosesmoses") // .build(); // CloseableHttpResponse response2 = httpclient.execute(login); // try { // HttpEntity entity = response2.getEntity(); // // System.out.println("Login form get: " + response2.getStatusLine()); // // EntityUtils.consume(entity); // // System.out.println("Post logon cookies:"); // List<Cookie> cookies = cookieStore.getCookies(); // if (cookies.isEmpty()) { // 
System.out.println("None"); // } else { // for (int i = 0; i < cookies.size(); i++) { // System.out.println("- " + cookies.get(i).toString()); // } // } // // // // } finally { // response2.close(); // } // // // httpget = new HttpGet("http://v2ex.com/signin"); // response1 = httpclient.execute(httpget); // try { // HttpEntity entity = response1.getEntity(); // String content = EntityUtils.toString(entity); // System.out.println("-----------------content---------------------"); // System.out.println(content); // // EntityUtils.consume(entity); // } finally { // response1.close(); // } // // }
From source file:de.geeksfactory.opacclient.apis.Zones.java
/**
 * Builds one {@link LentItem} per loan table found in the given document.
 *
 * <p>Rows are matched by their label cell ({@code .LoanBrowseFieldNameCell}); the
 * renewal link, if exactly one is present, supplies the prolong data.
 *
 * @param doc the parsed page
 * @return the lent items, in document order
 */
static List<LentItem> parseMediaList(Document doc) {
    DateTimeFormatter dueDateFormat = DateTimeFormat.forPattern("dd/MM/yyyy").withLocale(Locale.GERMAN);
    Pattern renewPattern = Pattern.compile("javascript:renewItem\\('[0-9]+','(.*)'\\)");
    Pattern blockedPattern = Pattern.compile("javascript:CannotRenewLoan\\('[0-9]+','(.*)','[0-9]+'\\)");

    List<LentItem> result = new ArrayList<>();
    for (Element loanTable : doc
            .select(".LoansBrowseItemDetailsCellStripe table, .LoansBrowseItemDetailsCell table")) {
        LentItem item = new LentItem();

        for (Element row : loanTable.select("tr")) {
            String label = row.select(".LoanBrowseFieldNameCell").text().trim();
            String value = row.select(".LoanBrowseFieldDataCell").text().trim();

            // The labels are mutually exclusive, so at most one branch applies per row.
            if (label.equals("Titel")) {
                item.setTitle(value);
                if (row.select(".LoanBrowseFieldDataCell a[href]").size() > 0) {
                    String href = row.select(".LoanBrowseFieldDataCell a[href]").attr("href");
                    Map<String, String> query = getQueryParamsFirst(href);
                    if (query.containsKey("BACNO")) {
                        item.setId(query.get("BACNO"));
                    }
                }
            } else if (label.equals("Verfasser")) {
                item.setAuthor(value);
            } else if (label.equals("Mediennummer")) {
                item.setBarcode(value);
            } else if (label.equals("ausgeliehen in")) {
                item.setHomeBranch(value);
            } else if (label.matches("F.+lligkeits.*datum")) {
                // Only the part before the first space is parsed as the date.
                String datePart = value.split(" ")[0];
                try {
                    item.setDeadline(dueDateFormat.parseLocalDate(datePart));
                } catch (IllegalArgumentException e) {
                    e.printStackTrace();
                }
            }
        }

        if (loanTable.select(".button[Title~=Zum]").size() == 1) {
            String renewHref = loanTable.select(".button[Title~=Zum]").attr("href");
            Matcher renewMatch = renewPattern.matcher(renewHref);
            if (renewMatch.matches()) {
                item.setProlongData(renewMatch.group(1));
            }
        } else if (loanTable.select(".CannotRenewLink").size() == 1) {
            String blockedHref = loanTable.select(".CannotRenewLink").attr("href").trim();
            Matcher blockedMatch = blockedPattern.matcher(blockedHref);
            if (blockedMatch.matches()) {
                item.setProlongData("cannotrenew|" + blockedMatch.group(1));
            }
            item.setRenewable(false);
        }

        result.add(item);
    }
    return result;
}
From source file:gov.medicaid.screening.dao.impl.HealthOccupationsProgramCredentialDAOBean.java
/**
 * Runs the credential search for the given criteria and returns every result row.
 *
 * <p>The search page is requested once first (the response is verified, audited and
 * discarded); the search form is then posted to {@code credential_search.do} and each
 * row after the first of {@code table.formTable} is turned into a profile.
 *
 * @param criteria The search criteria.
 * @return the search result for licenses
 *
 * @throws URISyntaxException if an error occurs while building the URL.
 * @throws ClientProtocolException if client does not support protocol used.
 * @throws IOException if an error occurs while parsing response.
 * @throws ParseException if an error occurs while parsing response.
 * @throws ServiceException for any other problems encountered
 */
private SearchResult<ProviderProfile> getAllResults(HealthOccupationsProgramCredentialSearchCriteria criteria)
        throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException {
    // NOTE(review): the client is never shut down here; whether that is safe depends on who
    // owns the connection manager returned by getLaxSSLConnectionManager() - TODO confirm.
    DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager());
    client.setRedirectStrategy(new LaxRedirectStrategy());

    // Initial GET of the search page; its body is not needed.
    HttpGet getSearch = new HttpGet(new URIBuilder(getSearchURL()).build());
    HttpResponse landingResponse = client.execute(getSearch);
    verifyAndAuditCall(getSearchURL(), landingResponse);
    EntityUtils.consume(landingResponse.getEntity());

    String fullSearchURL = Util.replaceLastURLPart(getSearchURL(), "credential_search.do");
    HttpPost search = new HttpPost(new URIBuilder(fullSearchURL).build());

    String[][] formFields = {
        { "city", Util.defaultString(criteria.getCity()) },
        { "credentialNumber", Util.defaultLongString(criteria.getCredentialNumber()) },
        { "firstName", Util.defaultString(criteria.getFirstName()) },
        { "lastName", Util.defaultString(criteria.getLastName()) },
        { "county", getDefaultValue(criteria.getCounty()) },
        { "credentialStatus", getDefaultValue(criteria.getStatus()) },
        { "credentialType", getDefaultValue(criteria.getOccupationType()) },
        { "discipline", getDefaultValue(criteria.getDiscipline()) },
        { "state", getDefaultValue(criteria.getState()) },
        { "p_action", "search" }
    };
    HttpEntity entity = postForm(fullSearchURL, client, search, formFields, false);

    Document page = Jsoup.parse(EntityUtils.toString(entity));
    List<ProviderProfile> profiles = new ArrayList<ProviderProfile>();
    for (Element row : page.select("table.formTable tr:gt(0)")) {
        profiles.add(parseProfile(row.children()));
    }

    SearchResult<ProviderProfile> searchResult = new SearchResult<ProviderProfile>();
    searchResult.setItems(profiles);
    return searchResult;
}
From source file:me.vertretungsplan.parser.ESchoolParser.java
/**
 * Builds a substitution schedule from an ESchool page.
 *
 * <p>The "last change" timestamp is read from the second cell of the first table below
 * {@code #Content} when that cell exists and matches the expected text; otherwise the
 * timestamp is left unset. Each {@code table#DATA} is paired with the {@code center b}
 * title at the same position and parsed into one day, unless the first table reports
 * that no data is available.
 *
 * @param doc the parsed schedule page
 * @return the schedule, including classes and teachers
 * @throws IOException if the number of titles differs from the number of tables
 * @throws JSONException declared by the signature; not raised directly by the statements in this method
 */
@NotNull
SubstitutionSchedule parseESchoolSchedule(Document doc) throws IOException, JSONException {
    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    // The info table or its second cell may be missing; skip the timestamp instead of
    // failing with a NullPointerException / IndexOutOfBoundsException.
    Elements infoTables = doc.select("#Content table");
    if (!infoTables.isEmpty()) {
        Elements infoCells = infoTables.first().select("td");
        if (infoCells.size() > 1) {
            String infoString = infoCells.get(1).ownText();
            // Dots are escaped so that only real "dd.MM.yyyy" dates reach the date parser.
            Pattern pattern = Pattern
                    .compile("Letzte Aktualisierung:\u00a0(\\d{2}\\.\\d{2}\\.\\d{4} - \\d{2}:\\d{2})");
            Matcher matcher = pattern.matcher(infoString);
            if (matcher.find()) {
                LocalDateTime lastChange = DateTimeFormat.forPattern("dd.MM.yyyy - HH:mm")
                        .parseLocalDateTime(matcher.group(1));
                schedule.setLastChange(lastChange);
            }
        }
    }

    Elements titles = doc.select("center b");
    Elements tables = doc.select("table#DATA");

    // The umlauts had been lost from both German literals ("verfgbar", "berschriften"), so
    // the "no data" check could never match. They are restored as unicode escapes.
    // NOTE(review): confirm the page really says "Keine Daten verfügbar".
    boolean noDataAvailable = !tables.isEmpty()
            && tables.get(0).text().contains("Keine Daten verf\u00fcgbar");
    if (!noDataAvailable) {
        if (titles.size() != tables.size()) {
            throw new IOException("Anzahl \u00dcberschriften != Anzahl Tabellen");
        }
        for (int i = 0; i < titles.size(); i++) {
            SubstitutionScheduleDay day = new SubstitutionScheduleDay();
            day.setDate(ParserUtils.parseDate(titles.get(i).text()));
            parseTable(tables.get(i), day);
            schedule.addDay(day);
        }
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}
From source file:me.vertretungsplan.parser.DSBLightParser.java
/**
 * Loads one monitor page, parses it if it looks like an Untis page, and follows its
 * {@code meta refresh} redirect.
 *
 * <p>A page counts as an Untis page when its title or HTML contains "untis"
 * (case-insensitive) or it has a {@code .mon_list} element. The redirect target is
 * resolved against the directory of {@code url}; recursion stops when the target
 * equals {@code startUrl}.
 *
 * @param url      address of the page to load
 * @param referer  passed through to {@code httpGet}
 * @param schedule schedule that receives the parsed days
 * @param startUrl URL at which following redirects stops
 * @throws IOException                declared by the signature; presumably raised by
 *                                    {@code httpGet} - TODO confirm
 * @throws JSONException              declared by the signature; presumably raised while
 *                                    parsing the monitor days - TODO confirm
 * @throws CredentialInvalidException declared by the signature; its origin is not visible
 *                                    in this method
 */
private void parseDay(String url, Map<String, String> referer, SubstitutionSchedule schedule, String startUrl)
        throws IOException, JSONException, CredentialInvalidException {
    String html = httpGet(url, data.optString(PARAM_ENCODING, null), referer);
    Document doc = Jsoup.parse(html);

    // Locale.ROOT keeps the check stable under locales with special casing rules
    // (e.g. Turkish dotless i).
    boolean untisPage = doc.title().toLowerCase(java.util.Locale.ROOT).contains("untis")
            || doc.html().toLowerCase(java.util.Locale.ROOT).contains("untis")
            || doc.select(".mon_list").size() > 0;
    if (!untisPage) {
        return;
    }
    parseMultipleMonitorDays(schedule, doc, data);

    Element meta = doc.select("meta[http-equiv=refresh]").first();
    if (meta == null) {
        return;
    }

    // Find "url=" case-insensitively but cut the target out of the ORIGINAL attribute
    // value: lower-casing the whole value would corrupt case-sensitive paths.
    String content = meta.attr("content");
    int markerIndex = findRefreshUrlMarker(content);
    if (markerIndex < 0) {
        // No redirect target in the refresh directive (plain timed reload).
        return;
    }

    String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + content.substring(markerIndex + 4);
    if (!redirectUrl.equals(startUrl)) {
        parseDay(redirectUrl, referer, schedule, startUrl);
    }
}

/** Returns the index of the first case-insensitive occurrence of "url=" in {@code content}, or -1. */
private int findRefreshUrlMarker(String content) {
    final String marker = "url=";
    for (int i = 0; i + marker.length() <= content.length(); i++) {
        if (content.regionMatches(true, i, marker, 0, marker.length())) {
            return i;
        }
    }
    return -1;
}
From source file:com.illustrationfinder.process.post.HtmlPostProcessor.java
@Override public List<String> generateKeywords() { // TODO If two words are always close to each other, they should be considered as an expression and managed like one word if (this.url == null) return null; try {/*w ww.ja v a 2 s . c o m*/ // Retrieve the document and store it temporary try (final InputStream stream = this.url.openStream()) { final String rawText = IOUtils.toString(stream); // Retrieve useful HTML data final Document document = Jsoup.parse(rawText); String htmlTitle = document.title(); String htmlKeywords = document.select("meta[name=keywords]").text(); String htmlDescription = document.select("meta[name=description]").text(); // Extract the content of the raw text String content = ArticleExtractor.getInstance().getText(rawText); // Now we apply a simple algorithm to get keywords // 1) We remove all punctuation marks from the title // 2) We remove all words with less than 4 characters // 3) We remove excessive spacing and tabulations htmlTitle = htmlTitle.toLowerCase(); htmlTitle = htmlTitle.replaceAll(PUNCTUATION_REGEX, ""); htmlTitle = htmlTitle.replaceAll(WORD_WITH_LESS_THAN_4_CHARACTERS_REGEX, ""); htmlTitle = htmlTitle.replaceAll(EXCESSIVE_SPACING_REGEX, " "); final List<String> keywords = new ArrayList<>(); final List<String> keywordsList = Arrays.asList(htmlTitle.split(" ")); for (String tmp : keywordsList) { if (tmp.length() >= MINIMUM_WORD_LENGTH) { keywords.add(tmp); } } // If there is enough keywords, we return if (keywords.size() >= MINIMUM_KEYWORDS_COUNT) { return keywords; } else { // Otherwise, we look for more keywords from the text by taking the more frequent words content = content.toLowerCase(); content = content.replaceAll(PUNCTUATION_REGEX, ""); content = content.replaceAll(WORD_WITH_LESS_THAN_4_CHARACTERS_REGEX, ""); content = content.replaceAll(EXCESSIVE_SPACING_REGEX, " "); final Map<String, Integer> frequencies = new HashMap<>(); final String[] words = content.split(" "); // Count word frequencies for (final String word : 
words) { if (frequencies.containsKey(word)) { frequencies.put(word, frequencies.get(word) + 1); } else { frequencies.put(word, 1); } } // Sort the words per frequency final SortedMap<Integer, HashSet<String>> sortedWords = new TreeMap<>(); for (Map.Entry<String, Integer> entry : frequencies.entrySet()) { if (sortedWords.containsKey(entry.getValue())) { sortedWords.get(entry.getValue()).add(entry.getKey()); } else { final HashSet<String> set = new HashSet<>(); set.add(entry.getKey()); sortedWords.put(entry.getValue(), set); } } // Add the most frequent words until we reach the minimu keywords count while (keywords.size() < MINIMUM_KEYWORDS_COUNT) { final HashSet<String> set = sortedWords.get(sortedWords.lastKey()); final String keyword = set.iterator().next(); set.remove(keyword); if (set.size() == 0) { sortedWords.remove(sortedWords.lastKey()); } if (keyword.length() > MINIMUM_WORD_LENGTH) { keywords.add(keyword); } } return keywords; } } } catch (BoilerpipeProcessingException e) { // TODO e.printStackTrace(); } catch (IOException e) { // TODO e.printStackTrace(); } return null; }
From source file:web.analyzer.utils.Utils.java
/**
 * Collects every valid hyperlink of the document and classifies it by host.
 *
 * <p>A link is "internal" when the host of its absolute URL equals {@code hostName}
 * (ignoring case) and "external" otherwise. Links rejected by {@code isValidUrl} are
 * skipped.
 *
 * @param doc      the parsed page
 * @param hostName host name that internal links must match
 * @return the links together with the internal and external totals
 * @throws IOException if an accepted link cannot be turned into a {@link URL}
 */
public LinkResult getLinks(Document doc, String hostName) throws IOException {
    List<Link> collected = new ArrayList<Link>();
    int internalCount = 0;
    int externalCount = 0;

    for (Element anchor : doc.select("a[href]")) {
        String absoluteHref = anchor.attr("abs:href");
        if (!isValidUrl(absoluteHref)) {
            continue;
        }

        boolean sameHost = new URL(absoluteHref).getHost().equalsIgnoreCase(hostName);
        if (sameHost) {
            internalCount++;
        } else {
            externalCount++;
        }
        collected.add(new Link(absoluteHref, sameHost ? "internal" : "external"));
    }

    return new LinkResult(collected, internalCount, externalCount);
}
From source file:com.amastigote.xdu.query.module.WaterAndElectricity.java
private List<String> query_metInfo() throws IOException { Document document = getPage("", METINFO_SUFFIX); Elements elements = document.select("td"); List<String> stringArrayList = new ArrayList<>(); for (Element td : elements) { String tmp = td.text();// ww w . j ava 2s.co m if (!"".equals(tmp)) { stringArrayList.add(tmp); } } for (int i = 0; i < stringArrayList.size(); i++) { stringArrayList.set(i, stringArrayList.get(i).substring(stringArrayList.get(i).indexOf("") + 1)); } /* * (stringArrayList): * - 0, ???? * - ? [ ?? | ? | ? ] * - , (3n), n??? * * - ?: ?null! */ return stringArrayList; }
From source file:me.vertretungsplan.parser.UntisMonitorParser.java
public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { loginResponse = new LoginHandler(scheduleData, credential, cookieProvider).handleLoginWithResponse(executor, cookieStore);//from w w w. j a va2s .co m SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); JSONArray urls = scheduleData.getData().getJSONArray(PARAM_URLS); String encoding = scheduleData.getData().optString(PARAM_ENCODING, null); List<Document> docs = new ArrayList<>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); final String urlStr = url.getString(SUBPARAM_URL); for (String dateUrl : ParserUtils.handleUrlWithDateFormat(urlStr)) { loadUrl(dateUrl, encoding, url.getBoolean(SUBPARAM_FOLLOWING), docs); } } for (Document doc : docs) { if (scheduleData.getData().has(PARAM_EMBEDDED_CONTENT_SELECTOR)) { for (Element part : doc.select(scheduleData.getData().getString(PARAM_EMBEDDED_CONTENT_SELECTOR))) { SubstitutionScheduleDay day = parseMonitorDay(part, scheduleData.getData()); v.addDay(day); } } else if (doc.title().contains("Untis") || doc.html().contains("<!--<title>Untis")) { SubstitutionScheduleDay day = parseMonitorDay(doc, scheduleData.getData()); v.addDay(day); } // else Error if (scheduleData.getData().has(PARAM_LAST_CHANGE_SELECTOR) && doc.select(scheduleData.getData().getString(PARAM_LAST_CHANGE_SELECTOR)).size() > 0) { String text = doc.select(scheduleData.getData().getString(PARAM_LAST_CHANGE_SELECTOR)).first() .text(); String lastChange; Pattern pattern = Pattern.compile("\\d\\d\\.\\d\\d\\.\\d\\d\\d\\d,? 
\\d\\d:\\d\\d"); Matcher matcher = pattern.matcher(text); if (matcher.find()) { lastChange = matcher.group(); } else { lastChange = text; } v.setLastChangeString(lastChange); v.setLastChange(ParserUtils.parseDateTime(lastChange)); } } if (scheduleData.getData().has(PARAM_WEBSITE)) { v.setWebsite(scheduleData.getData().getString(PARAM_WEBSITE)); } else if (urls.length() == 1) { v.setWebsite(urls.getJSONObject(0).getString("url")); } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }