List of usage examples for org.jsoup.nodes Element absUrl
public String absUrl(String attributeKey)
From source file:org.apache.marmotta.ldclient.provider.phpbb.mapping.PHPBBTopicHrefMapper.java
/**
 * Maps a phpBB topic link to the canonical URI of that topic.
 *
 * <p>The site base is everything in {@code resourceUri} before its last {@code '/'}. The topic id
 * is the value of the {@code t} query parameter of the link's absolute {@code href}; when the
 * parameter occurs several times the last occurrence is used.
 *
 * @param resourceUri   URI of the resource currently being mapped
 * @param selectedValue the anchor element whose {@code href} points to the topic
 * @param factory       factory used to create the resulting Sesame value
 * @return a single-element list holding the URI {@code <site>/viewtopic.php?t=<id>}
 * @throws RuntimeException if the link's absolute URL is not a syntactically valid URI
 */
@Override
public List<Value> map(String resourceUri, Element selectedValue, ValueFactory factory) {
    final int lastSlash = resourceUri.lastIndexOf('/');
    final String topicPrefix = resourceUri.substring(0, lastSlash) + "/viewtopic.php?";

    final URI linkUri;
    try {
        linkUri = new URI(selectedValue.absUrl("href"));
    } catch (URISyntaxException ex) {
        throw new RuntimeException("invalid syntax for URI", ex);
    }

    // Later occurrences of "t" override earlier ones.
    String topicId = null;
    for (NameValuePair pair : URLEncodedUtils.parse(linkUri, "UTF-8")) {
        if ("t".equals(pair.getName())) {
            topicId = pair.getValue();
        }
    }

    Value topic = factory.createURI(topicPrefix + "t=" + topicId);
    return Collections.singletonList(topic);
}
From source file:eu.sisob.uma.extractors.adhoc.cvfilesinside.InternalCVFilesExtractor.java
/** * * @param input_file//from ww w .j a va2 s .co m * @param data_dir * @param output_file * @param error_sw */ public static void extract_cv_files(File input_file, File data_dir, File output_file/*, File output_file_2, File results_dir,*/, StringWriter error_sw) { CSVReader reader = null; try { reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); } int idStaffIdentifier = -1; int idName = -1; int idFirstName = -1; int idLastName = -1; int idInitials = -1; int idUnitOfAssessment_Description = -1; int idInstitutionName = -1; int idWebAddress = -1; int idResearchGroupDescription = -1; int idResearcherWebAddress = -1; int idResearcherWebAddressType = -1; int idResearcherWebAddressExt = -1; int idScoreUrl = -1; int idEmail = -1; int idScoreEmail = -1; String[] nextLine; try { if ((nextLine = reader.readNext()) != null) { //Locate indexes //Locate indexes for (int i = 0; i < nextLine.length; i++) { String column_name = nextLine[i]; if (column_name.equals(FileFormatConversor.CSV_COL_ID)) idStaffIdentifier = i; else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) idName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) idFirstName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) idLastName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) idInitials = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) idUnitOfAssessment_Description = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) idInstitutionName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) idWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) idResearcherWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) idResearcherWebAddressType = 
i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) idResearcherWebAddressExt = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) idScoreUrl = i; else if (column_name.equals(FileFormatConversor.CSV_COL_EMAIL)) idEmail = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_EMAIL)) idScoreEmail = i; } } } catch (Exception ex) { String error_msg = "Error reading headers of " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } if (idResearcherWebAddress != -1 && idResearcherWebAddressType != -1 && idResearcherWebAddressExt != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) { if (true) { try { String header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_FIRSTNAME + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; if (idEmail != -1) header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"" + CSV_SEPARATOR; 
header += "\r\n"; FileUtils.write(output_file, header, "UTF-8", false); // DOWNLOAD HERE THE HOME PAGE //FileUtils.write(output_file_2, header, "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); error_sw.append("Error creating output files\r\n"); } } try { // DOWNLOAD HERE THE HOME PAGE // if(!results_dir.exists()) // results_dir.mkdirs(); // File homepage_results_dirs = new File(results_dir, "HOMEPAGE"); // if(!homepage_results_dirs.exists()) // homepage_results_dirs.mkdirs(); //if(!test_only_output) { Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+"); while ((nextLine = reader.readNext()) != null) { nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase(); if (idFirstName != -1) nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ") .toLowerCase(); if (idName != -1) nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); Document content = null; String researcher_page_url = nextLine[idResearcherWebAddress]; File temp_file = null; if (p1.matcher(researcher_page_url).matches()) { } else { try { Logger.getRootLogger().info("Reading " + researcher_page_url); temp_file = File.createTempFile("internal-cv-files-", ".tmp"); URL fetched_url = Downloader.fetchURL(researcher_page_url); FileUtils.copyURLToFile(fetched_url, temp_file); long sizeInBytes = temp_file.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { content = null; } else { String text_content = FileUtils.readFileToString(temp_file); String check_string = ""; if (text_content.length() <= 100) { check_string = text_content.substring(0, text_content.length()); } else { check_string = text_content.substring(0, 100); } if (check_string.toLowerCase().contains("html")) { content = Jsoup.parse(text_content); content.setBaseUri(researcher_page_url); // DOWNLOAD HERE THE HOME PAGE // String 
filename = nextLine[idStaffIdentifier] + "_HOMEPAGE_" + MD5(researcher_page_url) + ".html"; // FileUtils.copyFile(temp_file, new File(homepage_results_dirs, filename)); // // String result = ""; // result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; // if(idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; // if(idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; // if(idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; // if(idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; // if(idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; // result += "\"" + filename + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; // result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; // if(idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; // result += "\r\n"; // // try { // FileUtils.write(output_file_2, result, "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } else { throw new Exception(researcher_page_url + " is not html document"); } } } catch (Exception ex) { Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex); error_sw.append("" + researcher_page_url + " could not loaded"); content = null; } catch (java.lang.OutOfMemoryError ex2) { Logger.getLogger("root") .error("" + researcher_page_url + " could not loaded (out of memory)", ex2); error_sw.append("" + researcher_page_url + " could not loaded (out of memory)"); content = null; } finally { if (temp_file != null) temp_file.delete(); } } //Add sources to output { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; result += "\"HOMEPAGE\"" + CSV_SEPARATOR; result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } if (content != null) { Elements links = content.select("a[href]"); Elements links_worepeat = new Elements(); for (Element link : links) { boolean b = false; for (Element link_worepeat : links_worepeat) { if (link.absUrl("href").equals(link_worepeat.absUrl("href"))) { b = true; break; } } if (!b) links_worepeat.add(link); } for (Element link : links_worepeat) { boolean b = false; link.setBaseUri(researcher_page_url); String clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : cv_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("CV found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "CV"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "B"; } else { score = "A"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + 
CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } b = false; link.setBaseUri(researcher_page_url); clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : pub_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("PUB found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "PUB"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "-"; } else { score = "-"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + 
CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } } } reader.close(); } // reader = null; // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // reader.readNext(); // // int newIdResearcherWebpage = 3; // if(idFirstName != -1) newIdResearcherWebpage++; // if(idName != -1) newIdResearcherWebpage++; // if(idEmail != -1) newIdResearcherWebpage++; // if(idInstitutionName != -1) newIdResearcherWebpage++; // if(idWebAddress != -1) newIdResearcherWebpage++; // // List<Object[]> urls_times = new ArrayList<Object[]>(); // while ((nextLine = reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // // Object[] url_time = new Object[2]; // url_time[0] = url; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url_time[0])){ // u[1] = (Integer)u[1] + 1; // b = true; // break; // } // } // // if(!b){ // url_time[1] = new Integer(1); // urls_times.add(url_time); // } // } // // reader.close(); // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // nextLine = reader.readNext(); // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", false); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // // while ((nextLine = 
reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url) && ((Integer)u[1] == 1)){ // b = true; // break; // } // } // // if(b){ // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // } // } // // reader.close(); } catch (Exception ex) { String error_msg = "Error extracting cv files from extractor " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } } }
From source file:org.sbs.goodcrawler.fetcher.FetchWorker.java
/** * @param url/*from w ww .j a va 2 s .c o m*/ * @desc */ public void fetchPage(WebURL url) { PageFetchResult result = null; try { if (null != url && StringUtils.isNotBlank(url.getURL())) { // ?? if (fetchFilter(url.getURL())) { result = fetcher.fetchHeader(url); // ?? int statusCode = result.getStatusCode(); if (statusCode == CustomFetchStatus.PageTooBig) { onIgnored(url); return; } if (statusCode != HttpStatus.SC_OK) { onFailed(url); } else { Page page = new Page(url); pendingUrls.processedSuccess(); if (!result.fetchContent(page)) { onFailed(url); return; } if (!parser.parse(page, url.getURL())) { onFailed(url); return; } // ?? if (extractFilter(url.getURL())) { pendingPages.addElement(page); } // depth if (url.getDepth() > conf.getMaxDepthOfCrawling() && conf.getMaxDepthOfCrawling() != -1) { return; } // ???Url?Url Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); // ?? if (fetchFilter(linkHref) && !bloomfilterHelper.exist(linkHref)) { WebURL purl = new WebURL(); purl.setURL(linkHref); purl.setJobName(conf.jobName); purl.setDepth((short) (url.getDepth() + 1)); if (purl.getDepth() > conf.getMaxDepthOfCrawling() && conf.getMaxDepthOfCrawling() != -1) return; try { if (!pendingUrls.addElement(purl, 1000)) { FileUtils.writeStringToFile(new File("status/_urls.good"), url.getURL() + "\n", true); } } catch (QueueException e) { log.error(e.getMessage()); } } } } } } else { onIgnored(url); } } } catch (Exception e) { onFailed(url); } catch (QueueException e) { onFailed(url); } finally { if (null != result) result.discardContentIfNotConsumed(); } }
From source file:org.aliuge.crawler.fetcher.FetchWorker.java
/** * @param url/*from ww w . j av a 2s . c o m*/ * @desc */ public void fetchPage(WebURL url) { PageFetchResult result = null; try { if (null != url && StringUtils.isNotBlank(url.getUrl())) { result = fetcher.fetch(url, true); // ?? int statusCode = result.getStatusCode(); if (statusCode == CustomFetchStatus.PageTooBig) { onIgnored(url); return; } if (statusCode != HttpStatus.SC_OK) { onFailed(url); } else { Page page = new Page(url); onSuccessed(); if (!result.fetchContent(page)) { onFailed(url); return; } if (!parser.parse(page, url.getUrl())) { onFailed(url); return; } // ?? String e_url = extractFilterAndChangeUrl(url.getUrl()); if (StringUtils.isNoneBlank(e_url)) { url.setUrl(e_url); page.setWebURL(url); pendingPages.addElement(page); return; } // depth if (url.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) { return; } // ???Url?Url Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getUrl())); Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); // ???url if ((fetchFilter(linkHref) || extractFilter(linkHref)) && !bloomfilterHelper.exist(linkHref)) { WebURL purl = new WebURL(); purl.setName(link.text()); purl.setUrl(linkHref); purl.setDepth((short) (url.getDepth() + 1)); if (purl.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) return; try { if (!pendingUrls.addElement(purl)) { FileUtils.writeStringToFile(new File("status/_urls.good"), url.getUrl() + "\n", true); } } catch (QueueException e) { log.error(e.getMessage()); } } } } } } } catch (QueueException e) { onFailed(url); } catch (Exception e) { e.printStackTrace(); onFailed(url); } finally { if (null != result) result.discardContentIfNotConsumed(); } }
From source file:me.vertretungsplan.parser.UntisMonitorParser.java
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl, int recursionDepth) throws IOException, CredentialInvalidException { String html;/*w w w . j av a 2s. co m*/ if (url.equals(VALUE_URL_LOGIN_RESPONSE)) { html = loginResponse; } else { try { html = httpGet(url, encoding).replace(" ", ""); } catch (HttpResponseException e) { if (docs.size() == 0) { throw e; } else { return; // ignore if first page was loaded and redirect didn't work } } } Document doc = Jsoup.parse(html); doc.setBaseUri(url); if (doc.select(".mon_title").size() == 0) { // We have a problem - there seems to be no substitution schedule. Maybe it is hiding // inside a frame? if (doc.select("frameset frame[name").size() > 0) { for (Element frame : doc.select("frameset frame")) { if (frame.attr("src").matches(".*subst_\\d\\d\\d.html?") && recursionDepth < MAX_RECURSION_DEPTH) { String frameUrl = frame.absUrl("src"); loadUrl(frame.absUrl("src"), encoding, following, docs, frameUrl, recursionDepth + 1); } } } else if (doc.text().contains("registriert")) { throw new CredentialInvalidException(); } else { if (docs.size() == 0) { // ignore if first page was loaded and redirect didn't work throw new IOException( "Could not find .mon-title, seems like there is no Untis " + "schedule here"); } } } else { findSubDocs(docs, html, doc); if (following && doc.select("meta[http-equiv=refresh]").size() > 0) { Element meta = doc.select("meta[http-equiv=refresh]").first(); String attr = meta.attr("content").toLowerCase(); String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1) + attr.substring(attr.indexOf("url=") + 4); if (!redirectUrl.equals(startUrl) && recursionDepth < MAX_RECURSION_DEPTH) { loadUrl(redirectUrl, encoding, true, docs, startUrl, recursionDepth + 1); } } } }
From source file:ac.simons.oembed.Oembed.java
/** * Parses the given html document into a document and processes * all anchor elements. If a valid anchor is found, it tries to * get an oembed response for it's url and than render the result * into the document replacing the given anchor.<br> * It returns the html representation of the new document.<br> * If there's an error or no oembed result for an url, the anchor tag * will be left as it was. /*from w ww .j av a 2 s . co m*/ * @param document The document that should be checked for links to transform * @return the transformed document */ public Document transformDocument(final Document document) { boolean changedBaseUri = false; if (document.baseUri() == null && this.getBaseUri() != null) { document.setBaseUri(this.getBaseUri()); changedBaseUri = true; } for (Element a : document.getElementsByTag("a")) { final String href = a.absUrl("href"); try { String renderedRespose = null; final OembedResponse oembedResponse = this.transformUrl(href); // There was no response or an exception happened if (oembedResponse == null) continue; // There is a handler for this response else if (this.getHandler().containsKey(oembedResponse.getSource())) this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse); // Try to render the response itself and replace the current anchor else if ((renderedRespose = oembedResponse.render()) != null) { a.before(renderedRespose); a.remove(); } } catch (OembedException e) { logger.warn(String.format("Skipping '%s': %s", href, e.getMessage())); } } if (changedBaseUri) document.setBaseUri(null); return document; }
From source file:eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java
/** * * @param nextLine/*from w ww . j ava2 s. com*/ * @param idStaffIdentifier * @param idName * @param idFirstName * @param idLastName * @param idInitials * @param idSubject * @param idInstitutionName * @param idWebAddress * @param expression * @param params * @return */ @Override protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName, int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress, String expression, Object[] params) { String keywords = " (PROFILE OR PHD OR RESEARCHER OR FACULTY OR PROFESSOR OR RESEARCH) AND "; keywords = ""; String domain = clean_site(nextLine[idWebAddress]); String subject = nextLine[idSubject]; String and_institution_name = (idInstitutionName != -1 ? " AND " + nextLine[idInstitutionName] : ""); String expression_subject = expression + " AND " + subject; String expression_site = expression + " site: " + domain; String expression_inst_name = expression + and_institution_name; String expression_inst_name_and_subject = expression + and_institution_name + " AND " + subject; String url = ""; switch (search_patterns) { case P1: url = "https://duckduckgo.com/html/?q=" + keywords + expression; break; case P2: url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject; break; case P3: url = "https://duckduckgo.com/html/?q=" + keywords + expression_site; break; case P4: url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name; break; case P5: url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name_and_subject; break; default: url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject; break; } Logger.getRootLogger().info("Go with " + url); boolean again = false; Document doc = null; do { doc = getDocumentFromPage(url, 10, 1000, 5000); if (doc != null && doc.text().contains("If this error persists, please let us know")) { try { Thread.sleep(30000); } catch (InterruptedException ex) { } again = true; } else { 
again = false; } } while (again); String final_result = ""; if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) { /* Write resercher founded */ Elements elements = doc.select("div[class*=links_main] > a"); /* We will take the first html page and the first pdf */ HashMap<String, String> results = new HashMap<String, String>(); int max_results = 2; int i_result = 0; for (Element e : elements) { if ((e.text().startsWith("[") //&& !e.text().startsWith("[PDF]") ) || e.absUrl("href").contains("duckduckgo.com/y.js") || e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("facebook.com") || e.absUrl("href").contains("microsoft.com") || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin") || e.absUrl("href").contains("www.biography.com") || e.absUrl("href").contains("biomedexperts.com") || e.absUrl("href").contains("www.experts.scival.com") || e.absUrl("href").contains("ratemyprofessors.com") || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt") || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml") || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx") || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs") || e.absUrl("href").contains("www.amazon")) { max_results++; continue; } boolean add = false; String score = ""; String ext = ""; if (!results.containsKey("HTML") && !e.text().startsWith("[")) { //results.put("html", ) File temp; try { temp = File.createTempFile("temp-file-name", ".tmp"); URL fetched_url = Downloader.fetchURL(e.absUrl("href")); FileUtils.copyURLToFile(fetched_url, temp); long sizeInBytes = temp.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { score = "B"; } else { String content = FileUtils.readFileToString(temp); if (content.contains(nextLine[idLastName])) { score = "A"; } else { score = "B"; } } } catch (IOException ex) { score = "B"; } ext = "HTML"; add = true; } 
//if(!results.containsKey("PDF") && e.text().startsWith("[PDF]")){ // score = "A"; // ext = "PDF"; // add = true; //} if (add) { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\";"; result += "\"" + nextLine[idLastName] + "\";"; result += "\"" + nextLine[idInitials] + "\";"; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\";"; if (idName != -1) result += "\"" + nextLine[idName] + "\";"; result += "\"" + e.absUrl("href") + "\";"; result += "\"" + ext + "\";"; result += "\"" + "CV" + "\";"; result += "\"" + score + "\""; result += "\r\n"; results.put(ext, result); Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + e.text()); } // if(results.containsKey("PDF") && results.containsKey("HTML")){ // break; // } i_result++; if (max_results <= i_result) { break; } } // if(results.containsKey("PDF")) // final_result = results.get("PDF"); // else if (results.containsKey("HTML")) final_result = results.get("HTML"); else final_result = ""; } return final_result; }
From source file:eu.sisob.uma.extractors.adhoc.websearchers_cv.WebSearchersCVExtractor.java
/** * * @param nextLine/*from w ww. j a v a2 s.c om*/ * @param idStaffIdentifier * @param idName * @param idFirstName * @param idLastName * @param idInitials * @param idSubject * @param idInstitutionName * @param idWebAddress * @param expression * @param params * @return */ @Override protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName, int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress, String expression, Object[] params) { String domain = clean_site(nextLine[idWebAddress]); String subject = nextLine[idSubject]; String expression_subject = expression + " " + subject + " " + files + " " + cv_keywords_in_query; expression_subject = expression_subject.replaceAll("\t", " "); expression_subject = expression_subject.replaceAll(" ", " "); String url = "https://duckduckgo.com/html/?q=" + expression_subject; Logger.getRootLogger().info("Go with " + url); boolean again = false; Document doc = null; do { doc = getDocumentFromPage(url, 10, 2000, 5000); if (doc != null && doc.text().contains("If this error persists, please let us know")) { try { Thread.sleep(30000); } catch (InterruptedException ex) { } again = true; } else { again = false; } } while (again); //if(doc.select("div[class*=links_main] > a[href*=" + domain + "]").size() > 0){ String final_result = ""; if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) { /* Write resercher founded */ Elements elements = doc.select("div[class*=links_main] > a"); /* We will take the first html page and the first pdf */ List<String[]> results = new ArrayList<String[]>(); final int EXT_I = 0; final int SCORE_INT_I = 1; final int SCORE_LETTER_I = 2; final int RESULT_I = 3; final int WORST_SCORE = 67; //int max_results = elements.size(); //int i_result = 0; for (Element e : elements) { if ((e.text().startsWith("[") && !e.text().startsWith("[PDF]")) || e.absUrl("href").contains("duckduckgo.com/y.js") || 
e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("microsoft.com") || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin") || e.absUrl("href").contains("www.biography.com") || e.absUrl("href").contains("biomedexperts.com") || e.absUrl("href").contains("www.experts.scival.com") || e.absUrl("href").contains("ratemyprofessors.com") || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt") || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml") || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx") || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs") || e.absUrl("href").contains("www.amazon")) { continue; } boolean add = false; int score_int = WORST_SCORE; String score = ""; String ext = ""; if (e.text().startsWith("[PDF]") || e.text().startsWith("[DOCX]") || e.text().startsWith("[DOC]") || e.text().startsWith("[RTF]")) { String clean_name_1 = e.text().replaceAll("[^\\w\\s]", "").toLowerCase(); int i = e.absUrl("href").lastIndexOf("/"); int f = e.absUrl("href").lastIndexOf("."); String clean_name_2 = ""; if (i != -1 && f != -1) clean_name_2 = e.absUrl("href").substring(i, f).toLowerCase(); boolean b = false; for (String k : cv_keywords_in_name_list) { if (clean_name_1.contains(k) || clean_name_2.contains(k)) { b = true; break; } } if (b) { score_int--; } if (clean_name_1.contains(nextLine[idLastName]) || clean_name_2.contains(nextLine[idLastName])) { score_int--; } score = Character.toChars(score_int)[0] + ""; add = true; ext = "PDF"; } //if(!results.containsKey("HTML") && !e.text().startsWith("[")){ //} if (add) { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\";"; result += "\"" + nextLine[idLastName] + "\";"; result += "\"" + nextLine[idInitials] + "\";"; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\";"; if (idName != -1) result += "\"" + nextLine[idName] + "\";"; result += "\"" + e.absUrl("href") 
+ "\";"; result += "\"" + ext + "\";"; result += "\"" + "CV" + "\";"; result += "\"" + score + "\""; result += "\r\n"; results.add(new String[] { ext, score_int + "", score, result }); Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + score + " - " + e.text()); } } final_result = ""; int best_score = WORST_SCORE; for (String[] result : results) { if (result[EXT_I].equals("PDF")) { int act_score = Integer.parseInt(result[SCORE_INT_I]); if (act_score < best_score) { best_score = act_score; final_result = result[RESULT_I]; } } } } return final_result; }
From source file:com.aquest.emailmarketing.web.controllers.BroadcastTemplateController.java
/** * Define content./* w ww . j av a 2 s. c o m*/ * * @param model the model * @param broadcastTemplate1 the broadcast template1 * @param result the result * @param principal the principal * @return the string * @throws IOException */ @RequestMapping(value = "/defineBcastTemplateContent", method = RequestMethod.POST) public String defineContent(Model model, @Valid @ModelAttribute("broadcastTemplate") BroadcastTemplate broadcastTemplate1, @RequestParam(value = "fromUrl", required = false) String fromUrl, @RequestParam(value = "optimize", required = false) boolean optimize, @RequestParam(value = "baseurl", required = false) String baseUrl, @RequestParam(value = "rel2abs", required = false) boolean rel2abs, BindingResult result, Principal principal) throws IOException { String htmlBodyPrep = ""; BroadcastTemplate broadcastTemplate = broadcastTemplateService .getBroadcastTemplateById(broadcastTemplate1.getId()); broadcastTemplate.setB_template_subject(broadcastTemplate1.getB_template_subject()); if (fromUrl != "") { Document doc = Jsoup.connect(fromUrl).get(); htmlBodyPrep = doc.outerHtml(); broadcastTemplate.setHtmlbody(htmlBodyPrep); System.out.println(htmlBodyPrep); } if (broadcastTemplate1.getHtmlbody() != null) { htmlBodyPrep = broadcastTemplate1.getHtmlbody(); broadcastTemplate.setHtmlbody(htmlBodyPrep); } if (rel2abs == true) { if (baseUrl != null) { System.out.println(baseUrl); Document doc = Jsoup.parse(broadcastTemplate.getHtmlbody(), baseUrl); System.out.println(doc.toString()); Elements images = doc.select("img"); for (Element e : images) { e.attr("src", e.absUrl("src")); System.out.println(e.absUrl("src")); } broadcastTemplate.setHtmlbody(doc.outerHtml()); htmlBodyPrep = doc.outerHtml(); } else { // ovde staviti error handling } } if (optimize == true) { // /* PREMAILER API OPTIONS // * line_length - Line length used by to_plain_text. Boolean, default is 65. // warn_level - What level of CSS compatibility warnings to show (see Warnings). 
// NONE = 0 // SAFE = 1 // POOR = 2 // RISKY = 3 // link_query_string - A string to append to every a href="" link. Do not include the initial ?. // base_url - Used to calculate absolute URLs for local files. // css - Manually specify CSS stylesheets. // css_to_attributes - Copy related CSS attributes into HTML attributes (e.g. background-color to bgcolor) // css_string - Pass CSS as a string // remove_ids - Remove ID attributes whenever possible and convert IDs used as anchors to hashed to avoid collisions in webmail programs. Default is false. // remove_classes - Remove class attributes. Default is false. // remove_comments - Remove html comments. Default is false. // preserve_styles - Whether to preserve any link rel=stylesheet and style elements. Default is false. // preserve_reset - Whether to preserve styles associated with the MailChimp reset code // with_html_string - Whether the html param should be treated as a raw string. // verbose - Whether to print errors and warnings to $stderr. Default is false. // adapter - Which HTML parser to use, either :nokogiri or :hpricot. Default is :hpricot. 
// */ Premailer premailer = new Premailer(); PremailerInterface premailerInterface = premailer.getPremailerInstance(); Map<String, Object> options = new HashMap<String, Object>(); options.put("with_html_string", true); options.put("base_url", fromUrl); premailerInterface.init(broadcastTemplate.getHtmlbody(), options); //premailerInterface.init(htmlBodyPrep, options); broadcastTemplate.setHtmlbody(premailerInterface.inline_css()); System.out.println(premailerInterface.inline_css()); premailer.destroyInstance(); } broadcastTemplate.setPlaintext(broadcastTemplate1.getPlaintext()); System.out.println(broadcastTemplate.toString()); String bcast_id = broadcastTemplateService.SaveOrUpdate(broadcastTemplate); // Find URLs in html body and add tracking code Urls urls = new Urls(); String html = broadcastTemplate.getHtmlbody(); List<String> urlList = new ArrayList<String>(); Document doc = Jsoup.parse(html); Elements links = doc.select("a[href]"); for (Element link : links) { if (link.attr("abs:href").length() > 5) { urlList.add(link.attr("abs:href")); } } model.addAttribute("urlList", urlList); model.addAttribute("urls", urls); // Google Analytics - utmCampaign List List<String> utmCampaignList = new ArrayList<String>(); utmCampaignList.add("[BROADAST_NAME]"); model.addAttribute("utmCampaignList", utmCampaignList); // Google Analytics - utmSource List List<String> utmSourceList = new ArrayList<String>(); utmSourceList.add("[CAMPAIGN_NAME]"); model.addAttribute("utmSourceList", utmSourceList); // Google Analytics - utmContent List List<String> utmContentList = new ArrayList<String>(); utmContentList.add("[EMAIL]"); // ovde dodati sve varijabilne podatke iz CM_EMAIL_BROADCAST_LIST model.addAttribute("utmContentList", utmContentList); model.addAttribute("broadcastTemplate", broadcastTemplate); return "bcasttemptracking"; }
From source file:ac.simons.oembed.Oembed.java
/**
 * Fetches {@code url} and looks for an oEmbed endpoint advertised by an element with
 * {@code rel="alternate"} and a type of {@code application/json+oembed} or
 * {@code text/xml+oembed}.
 *
 * @param url the page to inspect
 * @return a provider for the first usable endpoint found, or {@code null} when the request does
 *         not return HTTP 200, no endpoint is advertised, or any exception occurs (it is logged)
 */
private OembedProvider autodiscoverOembedURIForUrl(final String url) {
    OembedProvider rv = null;
    try {
        final HttpGet request = new HttpGet(url);
        final HttpResponse httpResponse = this.httpClient.execute(request);
        if (httpResponse.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            logger.warn(String.format("Autodiscovery for %s failed, server returned error %d: %s", url,
                    httpResponse.getStatusLine().getStatusCode(),
                    EntityUtils.toString(httpResponse.getEntity())));
        } else {
            // Use the requested URI itself as the base for resolving relative hrefs. Building
            // "scheme://host:port" by hand breaks when the URL has no explicit port, because
            // URI.getPort() then returns -1.
            final String baseUri = request.getURI().toString();
            final Document document = Jsoup.parse(
                    EntityUtils.toString(httpResponse.getEntity(), "UTF-8"), baseUri);
            for (Element alternate : document.getElementsByAttributeValue("rel", "alternate")) {
                final String type = alternate.attr("type");
                final String format;
                if (type.equalsIgnoreCase("application/json+oembed")) {
                    format = "json";
                } else if (type.equalsIgnoreCase("text/xml+oembed")) {
                    format = "xml";
                } else {
                    continue;
                }
                // absUrl yields "" when the href is missing or cannot be resolved; an empty
                // endpoint is useless, so keep looking at the remaining candidates.
                final String href = alternate.absUrl("href");
                if (href.isEmpty()) {
                    continue;
                }
                rv = new AutodiscoveredOembedProvider(url, new URI(href), format);
                break;
            }
        }
    } catch (Exception e) {
        // Best effort: discovery failures are logged and reported to the caller as null.
        logger.warn(String.format("Autodiscovery for %s failedd: %s", url, e.getMessage()), e);
    }
    return rv;
}