List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
/**
 * Collects the actors listed in the {@code .fm-minfo} definition list of a film page.
 *
 * <p>Every {@code dt} is treated as a label and the {@code dd} at the same index as its value.
 * For the first row whose label matches the actor label, each non-blank {@code a} link text
 * inside the value is passed to {@code createOrQueryActor}; non-null results are collected.
 * Scanning stops after that row.
 *
 * @param doc parsed film page
 * @return the actors found, in link order; an empty list when nothing matched
 */
private List<Actor> createOrGetActorList(Document doc) {
    List<Actor> actorList = Lists.newArrayList();
    Elements keyElements = doc.select(".fm-minfo dt");
    Elements valueElements = doc.select(".fm-minfo dd");
    if (!CollectionUtils.isNotEmpty(keyElements) || !CollectionUtils.isNotEmpty(valueElements)) {
        return actorList;
    }

    // Labels and values are paired by index. Stop at the shorter list so a dt without a
    // matching dd cannot cause an IndexOutOfBoundsException.
    int pairCount = Math.min(keyElements.size(), valueElements.size());
    for (int i = 0; i < pairCount; i++) {
        Element keyElement = keyElements.get(i);
        Element valueElement = valueElements.get(i);
        if (null == keyElement || null == valueElement) {
            continue;
        }

        String key = StringUtils.trimToEmpty(keyElement.text());
        if (!StringUtils.isNotBlank(key)) {
            continue;
        }

        // NOTE(review): the label literal is empty in this copy, so after the non-blank check
        // above this comparison can never succeed. It looks like a non-ASCII label was lost
        // in an encoding step - restore the intended label text and confirm.
        if (!StringUtils.equalsIgnoreCase(key, "")) {
            continue;
        }

        for (Element actorNameElement : valueElement.select("a")) {
            String actorName = StringUtils.trimToEmpty(actorNameElement.text());
            if (StringUtils.isNotBlank(actorName)) {
                Actor actor = createOrQueryActor(actorName);
                if (null != actor) {
                    actorList.add(actor);
                }
            }
        }
        // Only the first matching row is processed.
        break;
    }
    return actorList;
}
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
private void addFilmRegionList(Document doc, Film film) { Elements keyElements = doc.select(".fm-minfo dt"); Elements valueElements = doc.select(".fm-minfo dd"); if (CollectionUtils.isNotEmpty(keyElements) && CollectionUtils.isNotEmpty(valueElements)) { int keyI = 0; for (; keyI < keyElements.size(); keyI++) { Element keyElement = keyElements.get(keyI); Element valueElement = valueElements.get(keyI); if (null != keyElement && null != valueElement) { String key = StringUtils.trimToEmpty(keyElement.text().toString()); if (StringUtils.isNotBlank(key)) { String value = StringUtils.trimToEmpty(valueElement.text().toString()); if (StringUtils.equalsIgnoreCase(key, "") && StringUtils.isNotBlank(value)) { List<String> regionList = SLASH_SPLITTER.splitToList(value); if (CollectionUtils.isNotEmpty(regionList)) { regionList.forEach(region -> { EnumRegion queryRegion = new EnumRegion(); queryRegion.setUrlRegion(region); EnumRegion enumRegion = enumRegionMapper .queryEnumRegionByEnumRegion(queryRegion); if (null != enumRegion) { FilmRegion filmRegion = new FilmRegion(); filmRegion.setFilmCode(film.getCode()); filmRegion.setRegionId(enumRegion.getId()); Date now = new Date(); filmRegion.setCreateTime(now); filmRegion.setUpdateTime(now); filmRegionMapper.insertSelective(filmRegion); }//from w ww . ja v a2s .co m }); } break; } } } } } }
From source file:biz.shadowservices.DegreesToolbox.DataFetcher.java
/**
 * Logs in to the 2degrees customer site, scrapes the account summary and refreshes the local
 * "cache" table with the result.
 *
 * <p>Flow, as implemented below:
 * <ol>
 *   <li>Opens a writable database through {@code DBOpenHelper}.</li>
 *   <li>Returns {@code FetchResult.NOTONLINE} when {@code isOnline(context)} is false.</li>
 *   <li>Unless {@code force} is true, consults the default shared preferences and returns
 *       {@code LOGINFAILED} when "loginFailed" is set, or {@code NOTALLOWED} when automatic
 *       updates are disallowed by the "autoupdates", "obeyBackgroundData", "obeyAutoSync",
 *       "wifiUpdates", "2DData" or "roamingData" settings.</li>
 *   <li>Returns {@code USERNAMEPASSWORDNOTSET} when "username" or "password" is missing.</li>
 *   <li>Fetches the login page, posts the login form, and decides between pre-pay and
 *       post-paid by the presence of the post-paid portlet element id in the response.</li>
 *   <li>Fetches the matching home page; returns {@code LOGINFAILED} when it has no
 *       "accountSummary" element.</li>
 *   <li>Deletes all rows from "cache" and inserts one row per {@code tr} of the
 *       "tableAccountSummary" table (name, value, units, expires_value, expires_date).</li>
 *   <li>For pre-pay accounts, fetches the value-pack page and updates "plan_startamount" and
 *       "plan_name" on the cache rows of each enabled pack known to {@code Values.valuePacks}.</li>
 *   <li>Stores "updateDate", clears "loginFailed" and "networkError" in the preferences and
 *       returns {@code SUCCESS}.</li>
 * </ol>
 *
 * <p>{@code ClientProtocolException} and {@code IOException} raised in the fetch phase are
 * logged through {@code DBLog} and reported as {@code NETWORKERROR}.
 *
 * <p>NOTE(review): the database is closed only by the {@code finally} of the last
 * {@code try}. The early returns before it (NOTONLINE, LOGINFAILED, NOTALLOWED) leave
 * {@code db} open.
 * <p>NOTE(review): when the login page body is {@code null} the method falls through and
 * returns {@code null}, not a {@code FetchResult} constant - callers must handle that.
 * <p>NOTE(review): the value-pack update builds its WHERE clause by string concatenation of
 * {@code pack.type.id}; presumably that id is a trusted constant - confirm.
 *
 * @param context Android context used for connectivity checks, preferences, logging and DB access
 * @param force   when true, the preference-based "may we update now" checks are skipped
 * @return the outcome of the update, or {@code null} when the login page could not be read
 */
public FetchResult updateData(Context context, boolean force) { //Open database DBOpenHelper dbhelper = new DBOpenHelper(context); SQLiteDatabase db = dbhelper.getWritableDatabase(); // check for internet connectivity try { if (!isOnline(context)) { Log.d(TAG, "We do not seem to be online. Skipping Update."); return FetchResult.NOTONLINE; } } catch (Exception e) { exceptionReporter.reportException(Thread.currentThread(), e, "Exception during isOnline()"); } SharedPreferences sp = PreferenceManager.getDefaultSharedPreferences(context); if (!force) { try { if (sp.getBoolean("loginFailed", false) == true) { Log.d(TAG, "Previous login failed. Skipping Update."); DBLog.insertMessage(context, "i", TAG, "Previous login failed. Skipping Update."); return FetchResult.LOGINFAILED; } if (sp.getBoolean("autoupdates", true) == false) { Log.d(TAG, "Automatic updates not enabled. Skipping Update."); DBLog.insertMessage(context, "i", TAG, "Automatic updates not enabled. Skipping Update."); return FetchResult.NOTALLOWED; } if (!isBackgroundDataEnabled(context) && sp.getBoolean("obeyBackgroundData", true)) { Log.d(TAG, "Background data not enabled. Skipping Update."); DBLog.insertMessage(context, "i", TAG, "Background data not enabled. Skipping Update."); return FetchResult.NOTALLOWED; } if (!isAutoSyncEnabled() && sp.getBoolean("obeyAutoSync", true) && sp.getBoolean("obeyBackgroundData", true)) { Log.d(TAG, "Auto sync not enabled. Skipping Update."); DBLog.insertMessage(context, "i", TAG, "Auto sync not enabled. Skipping Update."); return FetchResult.NOTALLOWED; } if (isWifi(context) && !sp.getBoolean("wifiUpdates", true)) { Log.d(TAG, "On wifi, and wifi auto updates not allowed. Skipping Update"); DBLog.insertMessage(context, "i", TAG, "On wifi, and wifi auto updates not allowed. 
Skipping Update"); return FetchResult.NOTALLOWED; } else if (!isWifi(context)) { Log.d(TAG, "We are not on wifi."); if (!isRoaming(context) && !sp.getBoolean("2DData", true)) { Log.d(TAG, "Automatic updates on 2Degrees data not enabled. Skipping Update."); DBLog.insertMessage(context, "i", TAG, "Automatic updates on 2Degrees data not enabled. Skipping Update."); return FetchResult.NOTALLOWED; } else if (isRoaming(context) && !sp.getBoolean("roamingData", false)) { Log.d(TAG, "Automatic updates on roaming mobile data not enabled. Skipping Update."); DBLog.insertMessage(context, "i", TAG, "Automatic updates on roaming mobile data not enabled. Skipping Update."); return FetchResult.NOTALLOWED; } } } catch (Exception e) { exceptionReporter.reportException(Thread.currentThread(), e, "Exception while finding if to update."); } } else { Log.d(TAG, "Update Forced"); } try { String username = sp.getString("username", null); String password = sp.getString("password", null); if (username == null || password == null) { DBLog.insertMessage(context, "i", TAG, "Username or password not set."); return FetchResult.USERNAMEPASSWORDNOTSET; } // Find the URL of the page to send login data to. Log.d(TAG, "Finding Action. 
"); HttpGetter loginPageGet = new HttpGetter("https://secure.2degreesmobile.co.nz/web/ip/login"); String loginPageString = loginPageGet.execute(); if (loginPageString != null) { Document loginPage = Jsoup.parse(loginPageString, "https://secure.2degreesmobile.co.nz/web/ip/login"); Element loginForm = loginPage.getElementsByAttributeValue("name", "loginFrm").first(); String loginAction = loginForm.attr("action"); // Send login form List<NameValuePair> loginValues = new ArrayList<NameValuePair>(); loginValues.add(new BasicNameValuePair("externalURLRedirect", "")); loginValues.add(new BasicNameValuePair("hdnAction", "login_userlogin")); loginValues.add(new BasicNameValuePair("hdnAuthenticationType", "M")); loginValues.add(new BasicNameValuePair("hdnlocale", "")); loginValues.add(new BasicNameValuePair("userid", username)); loginValues.add(new BasicNameValuePair("password", password)); Log.d(TAG, "Sending Login "); HttpPoster sendLoginPoster = new HttpPoster(loginAction, loginValues); // Parse result String loginResponse = sendLoginPoster.execute(); Document loginResponseParsed = Jsoup.parse(loginResponse); // Determine if this is a pre-pay or post-paid account. 
boolean postPaid; if (loginResponseParsed .getElementById("p_CustomerPortalPostPaidHomePage_WAR_customerportalhomepage") == null) { Log.d(TAG, "Pre-pay account or no account."); postPaid = false; } else { Log.d(TAG, "Post-paid account."); postPaid = true; } String homepageUrl = "https://secure.2degreesmobile.co.nz/group/ip/home"; if (postPaid) { homepageUrl = "https://secure.2degreesmobile.co.nz/group/ip/postpaid"; } HttpGetter homepageGetter = new HttpGetter(homepageUrl); String homepageHTML = homepageGetter.execute(); Document homePage = Jsoup.parse(homepageHTML); Element accountSummary = homePage.getElementById("accountSummary"); if (accountSummary == null) { Log.d(TAG, "Login failed."); return FetchResult.LOGINFAILED; } db.delete("cache", "", null); /* This code fetched some extra details for postpaid users, but on reflection they aren't that useful. * Might reconsider this. * if (postPaid) { Element accountBalanceSummaryTable = accountSummary.getElementsByClass("tableBillSummary").first(); Elements rows = accountBalanceSummaryTable.getElementsByTag("tr"); int rowno = 0; for (Element row : rows) { if (rowno > 1) { break; } //Log.d(TAG, "Starting row"); //Log.d(TAG, row.html()); Double value; try { Element amount = row.getElementsByClass("tableBillamount").first(); String amountHTML = amount.html(); Log.d(TAG, amountHTML.substring(1)); value = Double.parseDouble(amountHTML.substring(1)); } catch (Exception e) { Log.d(TAG, "Failed to parse amount from row."); value = null; } String expiresDetails = ""; String expiresDate = null; String name = null; try { Element details = row.getElementsByClass("tableBilldetail").first(); name = details.ownText(); Element expires = details.getElementsByTag("em").first(); if (expires != null) { expiresDetails = expires.text(); } Log.d(TAG, expiresDetails); Pattern pattern; pattern = Pattern.compile("\\(payment is due (.*)\\)"); Matcher matcher = pattern.matcher(expiresDetails); if (matcher.find()) { /*Log.d(TAG, "matched 
expires"); Log.d(TAG, "group 0:" + matcher.group(0)); Log.d(TAG, "group 1:" + matcher.group(1)); Log.d(TAG, "group 2:" + matcher.group(2)); * String expiresDateString = matcher.group(1); Date expiresDateObj; if (expiresDateString != null) { if (expiresDateString.length() > 0) { try { expiresDateObj = DateFormatters.EXPIRESDATE.parse(expiresDateString); expiresDate = DateFormatters.ISO8601DATEONLYFORMAT.format(expiresDateObj); } catch (java.text.ParseException e) { Log.d(TAG, "Could not parse date: " + expiresDateString); } } } } } catch (Exception e) { Log.d(TAG, "Failed to parse details from row."); } String expirev = null; ContentValues values = new ContentValues(); values.put("name", name); values.put("value", value); values.put("units", "$NZ"); values.put("expires_value", expirev ); values.put("expires_date", expiresDate); db.insert("cache", "value", values ); rowno++; } } */ Element accountSummaryTable = accountSummary.getElementsByClass("tableAccountSummary").first(); Elements rows = accountSummaryTable.getElementsByTag("tr"); for (Element row : rows) { // We are now looking at each of the rows in the data table. 
//Log.d(TAG, "Starting row"); //Log.d(TAG, row.html()); Double value; String units; try { Element amount = row.getElementsByClass("tableBillamount").first(); String amountHTML = amount.html(); //Log.d(TAG, amountHTML); String[] amountParts = amountHTML.split(" ", 2); //Log.d(TAG, amountParts[0]); //Log.d(TAG, amountParts[1]); if (amountParts[0].contains("Included") || amountParts[0].equals("All You Need") || amountParts[0].equals("Unlimited Text*")) { value = Values.INCLUDED; } else { try { value = Double.parseDouble(amountParts[0]); } catch (NumberFormatException e) { exceptionReporter.reportException(Thread.currentThread(), e, "Decoding value."); value = 0.0; } } units = amountParts[1]; } catch (NullPointerException e) { //Log.d(TAG, "Failed to parse amount from row."); value = null; units = null; } Element details = row.getElementsByClass("tableBilldetail").first(); String name = details.getElementsByTag("strong").first().text(); Element expires = details.getElementsByTag("em").first(); String expiresDetails = ""; if (expires != null) { expiresDetails = expires.text(); } Log.d(TAG, expiresDetails); Pattern pattern; if (postPaid == false) { pattern = Pattern.compile("\\(([\\d\\.]*) ?\\w*? ?expiring on (.*)\\)"); } else { pattern = Pattern.compile("\\(([\\d\\.]*) ?\\w*? 
?will expire on (.*)\\)"); } Matcher matcher = pattern.matcher(expiresDetails); Double expiresValue = null; String expiresDate = null; if (matcher.find()) { /*Log.d(TAG, "matched expires"); Log.d(TAG, "group 0:" + matcher.group(0)); Log.d(TAG, "group 1:" + matcher.group(1)); Log.d(TAG, "group 2:" + matcher.group(2)); */ try { expiresValue = Double.parseDouble(matcher.group(1)); } catch (NumberFormatException e) { expiresValue = null; } String expiresDateString = matcher.group(2); Date expiresDateObj; if (expiresDateString != null) { if (expiresDateString.length() > 0) { try { expiresDateObj = DateFormatters.EXPIRESDATE.parse(expiresDateString); expiresDate = DateFormatters.ISO8601DATEONLYFORMAT.format(expiresDateObj); } catch (java.text.ParseException e) { Log.d(TAG, "Could not parse date: " + expiresDateString); } } } } ContentValues values = new ContentValues(); values.put("name", name); values.put("value", value); values.put("units", units); values.put("expires_value", expiresValue); values.put("expires_date", expiresDate); db.insert("cache", "value", values); } if (postPaid == false) { Log.d(TAG, "Getting Value packs..."); // Find value packs HttpGetter valuePacksPageGet = new HttpGetter( "https://secure.2degreesmobile.co.nz/group/ip/prevaluepack"); String valuePacksPageString = valuePacksPageGet.execute(); //DBLog.insertMessage(context, "d", "", valuePacksPageString); if (valuePacksPageString != null) { Document valuePacksPage = Jsoup.parse(valuePacksPageString); Elements enabledPacks = valuePacksPage.getElementsByClass("yellow"); for (Element enabledPack : enabledPacks) { Element offerNameElemt = enabledPack .getElementsByAttributeValueStarting("name", "offername").first(); if (offerNameElemt != null) { String offerName = offerNameElemt.val(); DBLog.insertMessage(context, "d", "", "Got element: " + offerName); ValuePack[] packs = Values.valuePacks.get(offerName); if (packs == null) { DBLog.insertMessage(context, "d", "", "Offer name: " + offerName + " not 
matched."); } else { for (ValuePack pack : packs) { ContentValues values = new ContentValues(); values.put("plan_startamount", pack.value); values.put("plan_name", offerName); DBLog.insertMessage(context, "d", "", "Pack " + pack.type.id + " start value set to " + pack.value); db.update("cache", values, "name = '" + pack.type.id + "'", null); } } } } } } SharedPreferences.Editor prefedit = sp.edit(); Date now = new Date(); prefedit.putString("updateDate", DateFormatters.ISO8601FORMAT.format(now)); prefedit.putBoolean("loginFailed", false); prefedit.putBoolean("networkError", false); prefedit.commit(); DBLog.insertMessage(context, "i", TAG, "Update Successful"); return FetchResult.SUCCESS; } } catch (ClientProtocolException e) { DBLog.insertMessage(context, "w", TAG, "Network error: " + e.getMessage()); return FetchResult.NETWORKERROR; } catch (IOException e) { DBLog.insertMessage(context, "w", TAG, "Network error: " + e.getMessage()); return FetchResult.NETWORKERROR; } finally { db.close(); } return null; }
From source file:de.geeksfactory.opacclient.apis.Zones22.java
/**
 * Performs one step of the reservation dialogue for {@code item}.
 *
 * <p>The reservation page is fetched from {@code opac_url + "/" + item.getReservation_info()}.
 * If it asks for a PIN (the page contains "Geheimnummer"), the form is resubmitted with the
 * account's name and password. What happens next depends on {@code useraction}:
 * <ul>
 *   <li>{@code ReservationResult.ACTION_BRANCH}: the form is submitted with {@code selection}
 *       as the recipient location and "Confirm" set to "1"; the result is {@code OK}.</li>
 *   <li>{@code 0}: when a text node directly under {@code #MainForm} mentions "Entgelt", a
 *       {@code CONFIRMATION_NEEDED} result carrying that text is returned (the last such
 *       node wins).</li>
 *   <li>Otherwise, when the form contains a {@code select}, a {@code SELECTION_NEEDED} result
 *       listing its options is returned.</li>
 * </ul>
 * If none of these apply the result has status {@code ERROR}.
 *
 * @param item       item to reserve; supplies the reservation URL fragment
 * @param acc        account whose name and password are sent when a PIN is requested
 * @param useraction step identifier from a previous result, or {@code 0} for the first call
 * @param selection  branch chosen by the user; only read for {@code ACTION_BRANCH}
 * @return the outcome of this step
 * @throws IOException if one of the HTTP requests fails
 */
@Override
public ReservationResult reservation(DetailledItem item, Account acc, int useraction, String selection)
        throws IOException {
    String reservation_info = item.getReservation_info();
    String html = httpGet(opac_url + "/" + reservation_info, getDefaultEncoding());
    Document doc = Jsoup.parse(html);

    if (html.contains("Geheimnummer")) {
        // The page is a login form: resend every field except the credentials, then add ours.
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        for (Element input : doc.select("#MainForm input")) {
            if (!input.attr("name").equals("BRWR") && !input.attr("name").equals("PIN")) {
                params.add(new BasicNameValuePair(input.attr("name"), input.attr("value")));
            }
        }
        params.add(new BasicNameValuePair("BRWR", acc.getName()));
        params.add(new BasicNameValuePair("PIN", acc.getPassword()));
        html = httpGet(opac_url + "/" + doc.select("#MainForm").attr("action") + "?"
                + URLEncodedUtils.format(params, getDefaultEncoding()), getDefaultEncoding());
        doc = Jsoup.parse(html);
    }

    if (useraction == ReservationResult.ACTION_BRANCH) {
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        for (Element input : doc.select("#MainForm input")) {
            if (!input.attr("name").equals("Confirm")) {
                params.add(new BasicNameValuePair(input.attr("name"), input.attr("value")));
            }
        }
        params.add(new BasicNameValuePair("MakeResTypeDef.Reservation.RecipientLocn", selection));
        params.add(new BasicNameValuePair("Confirm", "1"));
        // The response body is not inspected; the request itself places the reservation.
        httpGet(opac_url + "/" + doc.select("#MainForm").attr("action") + "?"
                + URLEncodedUtils.format(params, getDefaultEncoding()), getDefaultEncoding());
        return new ReservationResult(MultiStepResult.Status.OK);
    }

    if (useraction == 0) {
        // first() returns null when the page has no #MainForm; skip the fee check in that
        // case so the method falls through to the ERROR result and does not throw.
        Element mainForm = doc.select("#MainForm").first();
        if (mainForm != null) {
            ReservationResult res = null;
            for (Node n : mainForm.childNodes()) {
                if (!(n instanceof TextNode)) {
                    continue;
                }
                String text = ((TextNode) n).text();
                if (text.contains("Entgelt")) {
                    res = new ReservationResult(ReservationResult.Status.CONFIRMATION_NEEDED);
                    List<String[]> details = new ArrayList<String[]>();
                    details.add(new String[] { text.trim() });
                    res.setDetails(details);
                    res.setMessage(text.trim());
                    res.setActionIdentifier(MultiStepResult.ACTION_CONFIRMATION);
                }
            }
            if (res != null) {
                return res;
            }
        }
    }

    if (doc.select("#MainForm select").size() > 0) {
        ReservationResult res = new ReservationResult(ReservationResult.Status.SELECTION_NEEDED);
        Map<String, String> sel = new HashMap<String, String>();
        for (Element opt : doc.select("#MainForm select option")) {
            sel.put(opt.attr("value"), opt.text().trim());
        }
        res.setSelection(sel);
        // The umlaut was missing from this user-facing prompt ("auswhlen").
        res.setMessage("Bitte Zweigstelle ausw\u00e4hlen");
        res.setActionIdentifier(ReservationResult.ACTION_BRANCH);
        return res;
    }
    return new ReservationResult(ReservationResult.Status.ERROR);
}
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
/**
 * Copies director, release year/date, running length and alias from the {@code .fm-minfo}
 * definition list of a film page onto {@code film}.
 *
 * <p>Every {@code dt} is treated as a label and the {@code dd} at the same index as its value.
 * All rows are visited; rows with a blank label are ignored.
 *
 * <p>NOTE(review): most label literals are empty strings in this copy, and the alias label is
 * "??". They look like non-ASCII labels lost in an encoding step. As written, the empty-label
 * branches can never match because blank labels are skipped first. Restore the intended
 * label texts and confirm.
 *
 * @param doc  parsed film page
 * @param film film to populate
 */
private void setFilmRelated(Document doc, Film film) {
    Elements keyElements = doc.select(".fm-minfo dt");
    Elements valueElements = doc.select(".fm-minfo dd");
    if (!CollectionUtils.isNotEmpty(keyElements) || !CollectionUtils.isNotEmpty(valueElements)) {
        return;
    }

    // Labels and values are paired by index. Stop at the shorter list so a dt without a
    // matching dd cannot cause an IndexOutOfBoundsException.
    int pairCount = Math.min(keyElements.size(), valueElements.size());
    for (int i = 0; i < pairCount; i++) {
        Element keyElement = keyElements.get(i);
        Element valueElement = valueElements.get(i);
        if (null == keyElement || null == valueElement) {
            continue;
        }

        String key = StringUtils.trimToEmpty(keyElement.text());
        if (!StringUtils.isNotBlank(key)) {
            continue;
        }
        String value = StringUtils.trimToEmpty(valueElement.text());

        // Director: link to a Director record when one exists, always keep the raw text.
        if (StringUtils.equalsIgnoreCase(key, "")) {
            Director director = createOrQueryDirector(value);
            if (null != director) {
                film.setDirectorId(director.getId());
            }
            film.setDirector(value);
        }

        // Release information: year id derived via getFilmUrlYear, date via getFilmReleaseDate.
        if (StringUtils.equalsIgnoreCase(key, "")) {
            String urlYear = getFilmUrlYear(doc, value);
            if (StringUtils.isNotBlank(urlYear)) {
                EnumYear enumYear = queryEnumYear(urlYear);
                if (null != enumYear) {
                    film.setYearId(enumYear.getId());
                }
            }
            Date releaseDate = getFilmReleaseDate(value);
            if (null != releaseDate) {
                film.setReleaseDate(releaseDate);
            }
        }

        // Running length.
        if (StringUtils.equalsIgnoreCase(key, "")) {
            int length = getFilmLength(value);
            film.setLength(length);
        }

        // Alias: only stored when a value is present.
        if (StringUtils.equalsIgnoreCase(key, "??") && StringUtils.isNotBlank(value)) {
            film.setAlias(value);
        }
    }
}
From source file:de.geeksfactory.opacclient.apis.Heidi.java
@Override public List<SearchField> getSearchFields() throws IOException, OpacErrorException, JSONException { String html = httpGet(opac_url + "/search.cgi?art=f", ENCODING, false, cookieStore); Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);//from w w w. j av a 2s. c om List<SearchField> fields = new ArrayList<>(); Elements options = doc.select("select[name=kat1] option"); for (Element option : options) { TextSearchField field = new TextSearchField(); field.setDisplayName(option.text()); field.setId(option.attr("value")); field.setHint(""); fields.add(field); } DropdownSearchField field = new DropdownSearchField(); Elements zst_opts = doc.select("#teilk2 option"); for (int i = 0; i < zst_opts.size(); i++) { Element opt = zst_opts.get(i); if (!opt.val().equals("")) { field.addDropdownValue(opt.val(), opt.text()); } } field.setDisplayName("Einrichtung"); field.setId("f[teil2]"); field.setVisible(true); field.setMeaning(SearchField.Meaning.BRANCH); fields.add(field); try { field = new DropdownSearchField(); Document doc2 = Jsoup .parse(httpGet(opac_url + "/zweigstelle.cgi?sess=" + sessid, ENCODING, false, cookieStore)); Elements home_opts = doc2.select("#zweig option"); for (int i = 0; i < home_opts.size(); i++) { Element opt = home_opts.get(i); if (!opt.val().equals("")) { Map<String, String> option = new HashMap<>(); option.put("key", opt.val()); option.put("value", opt.text()); field.addDropdownValue(opt.val(), opt.text()); } } field.setDisplayName("Leihstelle"); field.setId("_heidi_branch"); field.setVisible(true); field.setMeaning(SearchField.Meaning.HOME_BRANCH); fields.add(field); } catch (IOException e) { e.printStackTrace(); } TextSearchField pagefield = new TextSearchField(); pagefield.setId("_heidi_page"); pagefield.setVisible(false); pagefield.setDisplayName("Seite"); pagefield.setHint(""); fields.add(pagefield); return fields; }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
void parseDay(SubstitutionScheduleDay day, Element next, SubstitutionSchedule v, String klasse) throws JSONException, CredentialInvalidException { if (next.className().equals("subst") || next.select(".list").size() > 0 || next.text().contains("Vertretungen sind nicht freigegeben") || next.text().contains("Keine Vertretungen")) { //Vertretungstabelle if (next.text().contains("Vertretungen sind nicht freigegeben")) { return; }/* ww w. j a v a 2 s. c om*/ parseSubstitutionScheduleTable(next, scheduleData.getData(), day, klasse); } else { //Nachrichten parseMessages(next, day); next = next.nextElementSibling().nextElementSibling(); parseSubstitutionScheduleTable(next, scheduleData.getData(), day, klasse); } v.addDay(day); }
From source file:de.geeksfactory.opacclient.apis.Pica.java
/**
 * Builds the list of search fields from the catalogue's advanced search page.
 *
 * <p>The result contains, in order:
 * <ol>
 *   <li>one text field per option of {@code select[name=IKT0]}; a bracketed or parenthesised
 *       2-3 letter code in the option label (optionally prefixed with "X" and/or followed
 *       by ":") is removed from the display name and stored, upper-cased and without the
 *       colon, under "meaning" in the field data,</li>
 *   <li>a "SRT" dropdown when {@code select[name=SRT]} exists,</li>
 *   <li>one text field per {@code input[type=text]} whose name starts with "ADI",</li>
 *   <li>one dropdown per {@code select} whose name starts with "ADI",</li>
 *   <li>a "FUZZY" checkbox when {@code input[name=FUZZY]} exists,</li>
 *   <li>an "ADI_MAT" dropdown ("Materialart") when {@code input[name=ADI_MAT]} inputs exist.</li>
 * </ol>
 *
 * @return the search fields described above
 * @throws IOException   if the search page cannot be fetched
 * @throws JSONException if the per-field JSON data cannot be built
 */
@Override
public List<SearchField> getSearchFields() throws IOException, JSONException {
    if (!initialised) {
        start();
    }

    String html = httpGet(opac_url + "/LNG=" + getLang() + "/DB=" + db + "/ADVANCED_SEARCHFILTER",
            getDefaultEncoding());
    Document doc = Jsoup.parse(html);
    List<SearchField> fields = new ArrayList<>();

    // Compiled once, not once per option.
    Pattern meaningPattern = Pattern.compile("\\[X?[A-Za-z]{2,3}:?\\]|\\(X?[A-Za-z]{2,3}:?\\)");

    for (Element option : doc.select("select[name=IKT0] option")) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(option.text());
        field.setId(option.attr("value"));
        field.setHint("");
        field.setData(new JSONObject("{\"ADI\": false}"));

        Matcher matcher = meaningPattern.matcher(field.getDisplayName());
        if (matcher.find()) {
            // Locale.ROOT keeps the code stable regardless of the device locale (e.g. the
            // Turkish dotted/dotless i would otherwise change "tit" into a non-ASCII key).
            field.getData().put("meaning",
                    matcher.group().replace(":", "").toUpperCase(java.util.Locale.ROOT));
            field.setDisplayName(matcher.replaceFirst("").trim());
        }
        fields.add(field);
    }

    Elements sort = doc.select("select[name=SRT]");
    if (sort.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName(sort.first().parent().parent().select(".longval").first().text());
        field.setId("SRT");
        for (Element option : sort.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    for (Element input : doc.select("input[type=text][name^=ADI]")) {
        TextSearchField field = new TextSearchField();
        field.setDisplayName(input.parent().parent().select(".longkey").text());
        field.setId(input.attr("name"));
        field.setHint(input.parent().select("span").text());
        field.setData(new JSONObject("{\"ADI\": true}"));
        fields.add(field);
    }

    for (Element dropdown : doc.select("select[name^=ADI]")) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName(dropdown.parent().parent().select(".longkey").text());
        field.setId(dropdown.attr("name"));
        for (Element option : dropdown.select("option")) {
            field.addDropdownValue(option.attr("value"), option.text());
        }
        fields.add(field);
    }

    Elements fuzzy = doc.select("input[name=FUZZY]");
    if (fuzzy.size() > 0) {
        CheckboxSearchField field = new CheckboxSearchField();
        field.setDisplayName(fuzzy.first().parent().parent().select(".longkey").first().text());
        field.setId("FUZZY");
        fields.add(field);
    }

    Elements mediatypes = doc.select("input[name=ADI_MAT]");
    if (mediatypes.size() > 0) {
        DropdownSearchField field = new DropdownSearchField();
        field.setDisplayName("Materialart");
        field.setId("ADI_MAT");
        field.addDropdownValue("", "Alle");
        for (Element mt : mediatypes) {
            // The label sits in the element following the input's parent; non-breaking
            // spaces are stripped from it.
            field.addDropdownValue(mt.attr("value"),
                    mt.parent().nextElementSibling().text().replace("\u00a0", ""));
        }
        fields.add(field);
    }
    return fields;
}
From source file:eu.sisob.uma.extractors.adhoc.cvfilesinside.InternalCVFilesExtractor.java
/** * * @param input_file/* w ww .ja v a 2s.c om*/ * @param data_dir * @param output_file * @param error_sw */ public static void extract_cv_files(File input_file, File data_dir, File output_file/*, File output_file_2, File results_dir,*/, StringWriter error_sw) { CSVReader reader = null; try { reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); } int idStaffIdentifier = -1; int idName = -1; int idFirstName = -1; int idLastName = -1; int idInitials = -1; int idUnitOfAssessment_Description = -1; int idInstitutionName = -1; int idWebAddress = -1; int idResearchGroupDescription = -1; int idResearcherWebAddress = -1; int idResearcherWebAddressType = -1; int idResearcherWebAddressExt = -1; int idScoreUrl = -1; int idEmail = -1; int idScoreEmail = -1; String[] nextLine; try { if ((nextLine = reader.readNext()) != null) { //Locate indexes //Locate indexes for (int i = 0; i < nextLine.length; i++) { String column_name = nextLine[i]; if (column_name.equals(FileFormatConversor.CSV_COL_ID)) idStaffIdentifier = i; else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) idName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) idFirstName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) idLastName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) idInitials = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) idUnitOfAssessment_Description = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) idInstitutionName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) idWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) idResearcherWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) idResearcherWebAddressType = i; 
else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) idResearcherWebAddressExt = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) idScoreUrl = i; else if (column_name.equals(FileFormatConversor.CSV_COL_EMAIL)) idEmail = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_EMAIL)) idScoreEmail = i; } } } catch (Exception ex) { String error_msg = "Error reading headers of " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } if (idResearcherWebAddress != -1 && idResearcherWebAddressType != -1 && idResearcherWebAddressExt != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) { if (true) { try { String header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_FIRSTNAME + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; if (idEmail != -1) header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"" + CSV_SEPARATOR; 
header += "\r\n"; FileUtils.write(output_file, header, "UTF-8", false); // DOWNLOAD HERE THE HOME PAGE //FileUtils.write(output_file_2, header, "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); error_sw.append("Error creating output files\r\n"); } } try { // DOWNLOAD HERE THE HOME PAGE // if(!results_dir.exists()) // results_dir.mkdirs(); // File homepage_results_dirs = new File(results_dir, "HOMEPAGE"); // if(!homepage_results_dirs.exists()) // homepage_results_dirs.mkdirs(); //if(!test_only_output) { Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+"); while ((nextLine = reader.readNext()) != null) { nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase(); if (idFirstName != -1) nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ") .toLowerCase(); if (idName != -1) nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); Document content = null; String researcher_page_url = nextLine[idResearcherWebAddress]; File temp_file = null; if (p1.matcher(researcher_page_url).matches()) { } else { try { Logger.getRootLogger().info("Reading " + researcher_page_url); temp_file = File.createTempFile("internal-cv-files-", ".tmp"); URL fetched_url = Downloader.fetchURL(researcher_page_url); FileUtils.copyURLToFile(fetched_url, temp_file); long sizeInBytes = temp_file.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { content = null; } else { String text_content = FileUtils.readFileToString(temp_file); String check_string = ""; if (text_content.length() <= 100) { check_string = text_content.substring(0, text_content.length()); } else { check_string = text_content.substring(0, 100); } if (check_string.toLowerCase().contains("html")) { content = Jsoup.parse(text_content); content.setBaseUri(researcher_page_url); // DOWNLOAD HERE THE HOME PAGE // String 
filename = nextLine[idStaffIdentifier] + "_HOMEPAGE_" + MD5(researcher_page_url) + ".html"; // FileUtils.copyFile(temp_file, new File(homepage_results_dirs, filename)); // // String result = ""; // result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; // if(idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; // if(idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; // if(idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; // if(idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; // if(idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; // result += "\"" + filename + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; // result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; // if(idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; // result += "\r\n"; // // try { // FileUtils.write(output_file_2, result, "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } else { throw new Exception(researcher_page_url + " is not html document"); } } } catch (Exception ex) { Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex); error_sw.append("" + researcher_page_url + " could not loaded"); content = null; } catch (java.lang.OutOfMemoryError ex2) { Logger.getLogger("root") .error("" + researcher_page_url + " could not loaded (out of memory)", ex2); error_sw.append("" + researcher_page_url + " could not loaded (out of memory)"); content = null; } finally { if (temp_file != null) temp_file.delete(); } } //Add sources to output { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; result += "\"HOMEPAGE\"" + CSV_SEPARATOR; result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } if (content != null) { Elements links = content.select("a[href]"); Elements links_worepeat = new Elements(); for (Element link : links) { boolean b = false; for (Element link_worepeat : links_worepeat) { if (link.absUrl("href").equals(link_worepeat.absUrl("href"))) { b = true; break; } } if (!b) links_worepeat.add(link); } for (Element link : links_worepeat) { boolean b = false; link.setBaseUri(researcher_page_url); String clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : cv_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("CV found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "CV"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "B"; } else { score = "A"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + 
CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } b = false; link.setBaseUri(researcher_page_url); clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : pub_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("PUB found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "PUB"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "-"; } else { score = "-"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + 
CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } } } reader.close(); } // reader = null; // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // reader.readNext(); // // int newIdResearcherWebpage = 3; // if(idFirstName != -1) newIdResearcherWebpage++; // if(idName != -1) newIdResearcherWebpage++; // if(idEmail != -1) newIdResearcherWebpage++; // if(idInstitutionName != -1) newIdResearcherWebpage++; // if(idWebAddress != -1) newIdResearcherWebpage++; // // List<Object[]> urls_times = new ArrayList<Object[]>(); // while ((nextLine = reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // // Object[] url_time = new Object[2]; // url_time[0] = url; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url_time[0])){ // u[1] = (Integer)u[1] + 1; // b = true; // break; // } // } // // if(!b){ // url_time[1] = new Integer(1); // urls_times.add(url_time); // } // } // // reader.close(); // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // nextLine = reader.readNext(); // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", false); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // // while ((nextLine = 
reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url) && ((Integer)u[1] == 1)){ // b = true; // break; // } // } // // if(b){ // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // } // } // // reader.close(); } catch (Exception ex) { String error_msg = "Error extracting cv files from extractor " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } } }
From source file:de.geeksfactory.opacclient.apis.Open.java
@Override public List<SearchField> getSearchFields() throws IOException, OpacErrorException, JSONException { String url = opac_url + "/" + data.getJSONObject("urls").getString("advanced_search") + NO_MOBILE; Document doc = Jsoup.parse(httpGet(url, getDefaultEncoding())); Element table = doc.select(".ModOPENExtendedSearchModuleC table").first(); List<SearchField> fields = new ArrayList<>(); JSONObject selectable = new JSONObject(); selectable.put("selectable", true); JSONObject notSelectable = new JSONObject(); notSelectable.put("selectable", false); // Selectable search criteria Elements options = table.select("select[id$=FirstSearchField] option"); for (Element option : options) { TextSearchField field = new TextSearchField(); field.setId(option.val()); field.setDisplayName(option.text()); field.setData(selectable);/*from w ww . ja v a2s . c o m*/ fields.add(field); } // More criteria Element moreHeader = table.select("span[id$=LblMoreCriterias]").parents().select("tr").first(); if (moreHeader != null) { Elements siblings = moreHeader.siblingElements(); int startIndex = moreHeader.elementSiblingIndex(); for (int i = startIndex; i < siblings.size(); i++) { Element tr = siblings.get(i); if (tr.select("input, select").size() == 0) continue; if (tr.select("input[type=text]").size() == 1) { Element input = tr.select("input[type=text]").first(); TextSearchField field = new TextSearchField(); field.setId(input.attr("name")); field.setDisplayName(tr.select("span[id*=Lbl]").first().text()); field.setData(notSelectable); if (tr.text().contains("nur Ziffern")) field.setNumber(true); fields.add(field); } else if (tr.select("input[type=text]").size() == 2) { Element input1 = tr.select("input[type=text]").get(0); Element input2 = tr.select("input[type=text]").get(1); TextSearchField field1 = new TextSearchField(); field1.setId(input1.attr("name")); field1.setDisplayName(tr.select("span[id*=Lbl]").first().text()); field1.setData(notSelectable); if (tr.text().contains("nur 
Ziffern")) field1.setNumber(true); fields.add(field1); TextSearchField field2 = new TextSearchField(); field2.setId(input2.attr("name")); field2.setDisplayName(tr.select("span[id*=Lbl]").first().text()); field2.setData(notSelectable); field2.setHalfWidth(true); if (tr.text().contains("nur Ziffern")) field2.setNumber(true); fields.add(field2); } else if (tr.select("select").size() == 1) { Element select = tr.select("select").first(); DropdownSearchField dropdown = new DropdownSearchField(); dropdown.setId(select.attr("name")); dropdown.setDisplayName(tr.select("span[id*=Lbl]").first().text()); List<DropdownSearchField.Option> values = new ArrayList<>(); for (Element option : select.select("option")) { DropdownSearchField.Option opt = new DropdownSearchField.Option(option.val(), option.text()); values.add(opt); } dropdown.setDropdownValues(values); fields.add(dropdown); } else if (tr.select("input[type=checkbox]").size() == 1) { Element checkbox = tr.select("input[type=checkbox]").first(); CheckboxSearchField field = new CheckboxSearchField(); field.setId(checkbox.attr("name")); field.setDisplayName(tr.select("span[id*=Lbl]").first().text()); fields.add(field); } } } return fields; }