List of usage examples for java.io.FileNotFoundException.toString()
public String toString()
From source file:eu.sisob.uma.extractors.adhoc.cvfilesinside.InternalCVFilesExtractor.java
/** * * @param input_file/*ww w . j a v a2 s . c o m*/ * @param data_dir * @param output_file * @param error_sw */ public static void extract_cv_files(File input_file, File data_dir, File output_file/*, File output_file_2, File results_dir,*/, StringWriter error_sw) { CSVReader reader = null; try { reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); } int idStaffIdentifier = -1; int idName = -1; int idFirstName = -1; int idLastName = -1; int idInitials = -1; int idUnitOfAssessment_Description = -1; int idInstitutionName = -1; int idWebAddress = -1; int idResearchGroupDescription = -1; int idResearcherWebAddress = -1; int idResearcherWebAddressType = -1; int idResearcherWebAddressExt = -1; int idScoreUrl = -1; int idEmail = -1; int idScoreEmail = -1; String[] nextLine; try { if ((nextLine = reader.readNext()) != null) { //Locate indexes //Locate indexes for (int i = 0; i < nextLine.length; i++) { String column_name = nextLine[i]; if (column_name.equals(FileFormatConversor.CSV_COL_ID)) idStaffIdentifier = i; else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) idName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) idFirstName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) idLastName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) idInitials = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) idUnitOfAssessment_Description = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) idInstitutionName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) idWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) idResearcherWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) idResearcherWebAddressType 
= i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) idResearcherWebAddressExt = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) idScoreUrl = i; else if (column_name.equals(FileFormatConversor.CSV_COL_EMAIL)) idEmail = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_EMAIL)) idScoreEmail = i; } } } catch (Exception ex) { String error_msg = "Error reading headers of " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } if (idResearcherWebAddress != -1 && idResearcherWebAddressType != -1 && idResearcherWebAddressExt != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) { if (true) { try { String header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_FIRSTNAME + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; if (idEmail != -1) header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"" + 
CSV_SEPARATOR; header += "\r\n"; FileUtils.write(output_file, header, "UTF-8", false); // DOWNLOAD HERE THE HOME PAGE //FileUtils.write(output_file_2, header, "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); error_sw.append("Error creating output files\r\n"); } } try { // DOWNLOAD HERE THE HOME PAGE // if(!results_dir.exists()) // results_dir.mkdirs(); // File homepage_results_dirs = new File(results_dir, "HOMEPAGE"); // if(!homepage_results_dirs.exists()) // homepage_results_dirs.mkdirs(); //if(!test_only_output) { Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+"); while ((nextLine = reader.readNext()) != null) { nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase(); if (idFirstName != -1) nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ") .toLowerCase(); if (idName != -1) nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); Document content = null; String researcher_page_url = nextLine[idResearcherWebAddress]; File temp_file = null; if (p1.matcher(researcher_page_url).matches()) { } else { try { Logger.getRootLogger().info("Reading " + researcher_page_url); temp_file = File.createTempFile("internal-cv-files-", ".tmp"); URL fetched_url = Downloader.fetchURL(researcher_page_url); FileUtils.copyURLToFile(fetched_url, temp_file); long sizeInBytes = temp_file.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { content = null; } else { String text_content = FileUtils.readFileToString(temp_file); String check_string = ""; if (text_content.length() <= 100) { check_string = text_content.substring(0, text_content.length()); } else { check_string = text_content.substring(0, 100); } if (check_string.toLowerCase().contains("html")) { content = Jsoup.parse(text_content); content.setBaseUri(researcher_page_url); // DOWNLOAD HERE THE HOME 
PAGE // String filename = nextLine[idStaffIdentifier] + "_HOMEPAGE_" + MD5(researcher_page_url) + ".html"; // FileUtils.copyFile(temp_file, new File(homepage_results_dirs, filename)); // // String result = ""; // result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; // if(idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; // if(idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; // if(idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; // if(idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; // if(idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; // result += "\"" + filename + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; // result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; // result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; // if(idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; // result += "\r\n"; // // try { // FileUtils.write(output_file_2, result, "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } else { throw new Exception(researcher_page_url + " is not html document"); } } } catch (Exception ex) { Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex); error_sw.append("" + researcher_page_url + " could not loaded"); content = null; } catch (java.lang.OutOfMemoryError ex2) { Logger.getLogger("root") .error("" + researcher_page_url + " could not loaded (out of memory)", ex2); error_sw.append("" + researcher_page_url + " could not loaded (out of memory)"); content = null; } finally { if (temp_file != null) temp_file.delete(); } } //Add sources to output { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; result += "\"HOMEPAGE\"" + CSV_SEPARATOR; result += "\"" + (idScoreUrl != -1 ? 
nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } if (content != null) { Elements links = content.select("a[href]"); Elements links_worepeat = new Elements(); for (Element link : links) { boolean b = false; for (Element link_worepeat : links_worepeat) { if (link.absUrl("href").equals(link_worepeat.absUrl("href"))) { b = true; break; } } if (!b) links_worepeat.add(link); } for (Element link : links_worepeat) { boolean b = false; link.setBaseUri(researcher_page_url); String clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : cv_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("CV found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "CV"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "B"; } else { score = "A"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + 
CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } b = false; link.setBaseUri(researcher_page_url); clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase(); for (String k : pub_keywords_in_name_list) { if (clean_name_1.contains(k)) { b = true; break; } } if (b) { Logger.getRootLogger() .info("PUB found " + link.absUrl("href") + " (" + link.text() + ")"); String href = link.absUrl("href"); String ext = ""; String score = ""; String type = "PUB"; if (link.absUrl("href").endsWith(".pdf")) ext = "PDF"; else if (link.absUrl("href").endsWith(".doc")) ext = "DOC"; else if (link.absUrl("href").endsWith(".docx")) ext = "DOCX"; else if (link.absUrl("href").endsWith(".rtf")) ext = "RTF"; else if (link.absUrl("href").endsWith(".txt")) ext = "TXT"; else ext = "HTML"; if (ext.equals("HTML")) { score = "-"; } else { score = "-"; } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + href + "\"" + CSV_SEPARATOR; result += "\"" + ext + "\"" + 
CSV_SEPARATOR; result += "\"" + type + "\"" + CSV_SEPARATOR; result += "\"" + score + "\"" + CSV_SEPARATOR; if (idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } } } reader.close(); } // reader = null; // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // reader.readNext(); // // int newIdResearcherWebpage = 3; // if(idFirstName != -1) newIdResearcherWebpage++; // if(idName != -1) newIdResearcherWebpage++; // if(idEmail != -1) newIdResearcherWebpage++; // if(idInstitutionName != -1) newIdResearcherWebpage++; // if(idWebAddress != -1) newIdResearcherWebpage++; // // List<Object[]> urls_times = new ArrayList<Object[]>(); // while ((nextLine = reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // // Object[] url_time = new Object[2]; // url_time[0] = url; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url_time[0])){ // u[1] = (Integer)u[1] + 1; // b = true; // break; // } // } // // if(!b){ // url_time[1] = new Integer(1); // urls_times.add(url_time); // } // } // // reader.close(); // try { // reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR); // } catch (FileNotFoundException ex) { // Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); // } // // nextLine = reader.readNext(); // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", false); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // // while ((nextLine = 
reader.readNext()) != null) // { // String url = nextLine[newIdResearcherWebpage]; // boolean b = false; // for(Object[] u : urls_times){ // if(u[0].equals(url) && ((Integer)u[1] == 1)){ // b = true; // break; // } // } // // if(b){ // try { // for(int i = 0; i < nextLine.length; i++) // nextLine[i] = "\"" + nextLine[i] + "\""; // FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } // } // } // // reader.close(); } catch (Exception ex) { String error_msg = "Error extracting cv files from extractor " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } } }
From source file:org.iavante.sling.s3backend.S3BackendTestIT.java
/**
 * Prepares the fixture: appends the host to {@code SLING_URL} (taken from the environment
 * variable named by {@code HOSTVAR}, otherwise {@code HOSTPREDEF}), configures an HTTP client
 * with preemptive admin credentials, and creates and uploads the test content.
 */
protected void setUp() {
    boolean hostFound = false;
    for (Map.Entry<String, String> variable : System.getenv().entrySet()) {
        if (variable.getKey().compareTo(HOSTVAR) == 0) {
            SLING_URL = SLING_URL + variable.getValue();
            hostFound = true;
        }
    }
    if (!hostFound) {
        // No host configured in the environment: fall back to the predefined one.
        SLING_URL = SLING_URL + HOSTPREDEF;
    }

    title = "Test case content";
    schema = "default";
    slug = "test_case_contents3";

    client = new HttpClient();
    authPrefs.add(AuthPolicy.DIGEST);
    authPrefs.add(AuthPolicy.BASIC);
    defaultcreds = new UsernamePasswordCredentials("admin", "admin");
    client.getParams().setAuthenticationPreemptive(true);
    client.getState().setCredentials(AuthScope.ANY, defaultcreds);
    client.getParams().setParameter(AuthPolicy.AUTH_SCHEME_PRIORITY, authPrefs);

    createTest3();
    try {
        uploadContent();
    } catch (FileNotFoundException e) {
        // A missing upload file is reported, and the remaining set-up still runs.
        log.info("ERROR uploading File: " + e.toString());
        e.printStackTrace();
    }
    createContent();
}
From source file:eu.sisob.uma.extractors.adhoc.email.EmailExtractor.java
/** * * @param input_file// www .j a v a 2s .com * @param data_dir * @param output_file * @param norepeat_output_file * @param notfound_output_file * @param notfound_norepeat_output_file * @param filters * @param error_sw */ public static void extract_emails(File input_file, File data_dir, File output_file, File norepeat_output_file, File notfound_output_file, File notfound_norepeat_output_file, List<String> filters, StringWriter error_sw) { CSVReader reader = null; try { reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString()); } int idStaffIdentifier = -1; int idName = -1; int idFirstName = -1; int idLastName = -1; int idInitials = -1; int idUnitOfAssessment_Description = -1; int idInstitutionName = -1; int idWebAddress = -1; int idResearchGroupDescription = -1; int idResearcherWebAddress = -1; int idResearcherWebAddressType = -1; int idResearcherWebAddressExt = -1; int idScoreUrl = -1; String filter_literal = "("; for (String filter : filters) { filter_literal += filter + ","; } filter_literal += ")"; String[] nextLine; try { if ((nextLine = reader.readNext()) != null) { //Locate indexes //Locate indexes for (int i = 0; i < nextLine.length; i++) { String column_name = nextLine[i]; if (column_name.equals(FileFormatConversor.CSV_COL_ID)) idStaffIdentifier = i; else if (column_name.equals(FileFormatConversor.CSV_COL_NAME)) idName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME)) idFirstName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME)) idLastName = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS)) idInitials = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT)) idUnitOfAssessment_Description = i; else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME)) idInstitutionName = i; else if 
(column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL)) idWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL)) idResearcherWebAddress = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE)) idResearcherWebAddressType = i; else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT)) idResearcherWebAddressExt = i; else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL)) idScoreUrl = i; } } } catch (Exception ex) { String error_msg = "Error reading headers of " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } if (idResearcherWebAddress != -1 && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) { //if(!test_only_output) { try { String header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) header += "\"" + 
FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\""; header += "\r\n"; FileUtils.write(output_file, header, "UTF-8", false); header = ""; header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idFirstName != -1) header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR; if (idName != -1) header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR; header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\""; header += "\r\n"; FileUtils.write(notfound_output_file, header, "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); error_sw.append("Error creating output files\r\n"); } } try { //if(!test_only_output) { Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+"); while ((nextLine = reader.readNext()) != null) { nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase(); nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase(); if (idFirstName != -1) nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ") .toLowerCase(); if (idName != -1) nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " 
").toLowerCase(); String content = ""; String researcher_page_url = nextLine[idResearcherWebAddress]; Logger.getLogger("root").info("Go with " + researcher_page_url); if (p1.matcher(researcher_page_url).matches()) { File f = new File(data_dir, researcher_page_url); if (researcher_page_url.endsWith(".doc") || researcher_page_url.endsWith(".docx")) { Logger.getLogger("root") .error("The document " + researcher_page_url + " could not loaded"); error_sw.append("The document " + researcher_page_url + " could not loaded"); } else if (researcher_page_url.endsWith(".pdf")) { PDFParser parser = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; try { parser = new PDFParser(new FileInputStream(f)); } catch (IOException e) { Logger.getLogger("root").error(e.toString()); error_sw.append("Unable to open PDF called " + researcher_page_url); } if (parser != null) { try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(2); content = pdfStripper.getText(pdDoc); } catch (Exception e) { Logger.getLogger("root").error(e.toString()); error_sw.append("An exception occured in parsing the PDF Document."); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { Logger.getLogger("root").error(e.toString()); } } } } } else { try { Logger.getRootLogger().info("Reading " + researcher_page_url); File temp; temp = File.createTempFile("temp-file-name", ".tmp"); URL fetched_url = Downloader.fetchURL(researcher_page_url); FileUtils.copyURLToFile(fetched_url, temp); long sizeInBytes = temp.length(); long sizeInMb = sizeInBytes / (1024 * 1024); if (sizeInMb > 100) { content = ""; } else { content = FileUtils.readFileToString(temp); temp.delete(); } } catch (Exception ex) { Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex); error_sw.append("" + 
researcher_page_url + " could not loaded"); content = ""; } catch (java.lang.OutOfMemoryError ex2) { Logger.getLogger("root").error( researcher_page_url + " could not loaded (Jsoup OutOfMemoryError)", ex2); error_sw.append("" + researcher_page_url + " could not loaded"); content = ""; } } if (!content.equals("")) { //final String RE_MAIL = "([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})"; final String RE_MAIL = "([\\w\\-]([\\.\\w]){1,16}[\\w]{1,16}@([\\w\\-]{1,16}\\.){1,16}[A-Za-z]{2,4})"; Pattern p = Pattern.compile(RE_MAIL); Matcher m = p.matcher(content); List<String> emails = new ArrayList<String>(); while (m.find()) { String email = m.group(1); if (!emails.contains(email)) { // Apply filter boolean pass = true; if (filters.size() > 0) { pass = false; for (String filter : filters) { String filter2 = filter.replace("*", ".*?"); Pattern pattern = Pattern.compile(filter2); if (pattern.matcher(email).matches()) { pass = true; break; } else { } } } if (pass) { Logger.getRootLogger().info(researcher_page_url + " => " + email + " PASS FILTER! " + filter_literal); emails.add(email); } else { Logger.getRootLogger().info(researcher_page_url + " => " + email + " REFUSE BY FILTER! 
" + filter_literal); } } } if (emails.size() < MAX_MAIL_PER_PAGE) { for (String email : emails) { String score_email = ""; String lastname = nextLine[idLastName]; if (lastname.length() > 5) lastname = lastname.substring(0, 6); if (email.toLowerCase().contains(lastname)) { score_email = "A"; } else { int temp_id = idFirstName; if (temp_id == -1) temp_id = idInitials; if (!nextLine[idInitials].trim().equals("")) { String firstname = nextLine[temp_id].split(" ")[0]; if (firstname.length() > 5) firstname = firstname.substring(0, 5); if (firstname.length() > 1) { if (email.toLowerCase().contains(firstname)) { score_email = "A"; } } } if (score_email.equals("")) { String initials = ""; String[] arr = nextLine[temp_id].split(" "); for (int i = 0; i < arr.length; i++) { if (arr[i].length() > 0) initials += arr[i].charAt(0); } initials += nextLine[idLastName].charAt(0); if (email.toLowerCase().contains(initials)) { score_email = "B"; } else { score_email = "Z"; } } } String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; result += "\"" + email + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) result += "\"" + nextLine[idScoreUrl] + "\"" + CSV_SEPARATOR; result += "\"" + score_email + 
"\""; result += "\r\n"; try { FileUtils.write(output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } else { content = ""; } if (emails.size() == 0) content = ""; } if (content == "") { String result = ""; result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR; if (idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR; if (idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR; if (idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR; if (idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR; result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressExt != -1) result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR; if (idResearcherWebAddressType != -1) result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR; if (idScoreUrl != -1) result += "\"" + nextLine[idScoreUrl] + "\""; result += "\r\n"; try { FileUtils.write(notfound_output_file, result, "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } reader.close(); } Logger.getLogger("root").info("Applying deduplication algoritm - Counting duplications"); boolean finish = false; String alternate_filename_1 = "file1"; String alternate_filename_2 = "file2"; File alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1); File alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2); FileUtils.copyFile(output_file, alternate_file_s); //FileUtils.write(output_file_wor_notfound, "", "UTF-8", false); FileUtils.write(norepeat_output_file, "", "UTF-8", false); while (!finish) { reader = null; try { reader = new CSVReader(new FileReader(alternate_file_s), 
CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger() .error("Error reading " + input_file.getName() + " - " + ex.toString()); } HashMap<String, Integer> count_dictionary = new HashMap<String, Integer>(); int idEmail = 3; if (idFirstName != -1) idEmail++; if (idName != -1) idEmail++; try { FileUtils.write(alternate_file_d, "", "UTF-8", false); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } finish = true; while ((nextLine = reader.readNext()) != null) { Integer count = 1; if (count_dictionary.containsKey(nextLine[idEmail].toString())) count = count_dictionary.get(nextLine[idEmail].toString()); else { if (count_dictionary.size() < max_in_mem) { count_dictionary.put(nextLine[idEmail].toString(), count + 1); } else { try { for (int i = 0; i < nextLine.length; i++) nextLine[i] = "\"" + nextLine[i] + "\""; FileUtils.write(alternate_file_d, StringUtil.join(Arrays.asList(nextLine), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); finish = false; } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } } } reader.close(); Logger.getLogger("root").info("Applying deduplication algoritm - Removing duplications"); reader = null; try { reader = new CSVReader(new FileReader(alternate_file_s), CSV_SEPARATOR); } catch (FileNotFoundException ex) { Logger.getRootLogger() .error("Error reading " + input_file.getName() + " - " + ex.toString()); } String previous_id = "%previous%"; String previous_email = "%previous_email%"; List<String[]> cache = new ArrayList<String[]>(); while ((nextLine = reader.readNext()) != null) { String id = nextLine[idStaffIdentifier].toString(); if (previous_id.equals(id)) { cache.add(nextLine); previous_id = id; } else { //Process String[] winner_line = null; String max_score = "Z"; for (String[] act_line : cache) { String act_score = "Z"; try { act_score = act_line[act_line.length - 1]; } catch (Exception ex) { } String email = act_line[idEmail].toString(); if 
(count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) { if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) { winner_line = act_line; max_score = act_score; } count_dictionary.put(email, 0); } } if (winner_line != null) { try { for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\""; FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } else { // try { // FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } cache.clear(); cache.add(nextLine); previous_id = id; } } //Process if (cache.size() > 0) { String[] winner_line = null; String max_score = "Z"; for (String[] act_line : cache) { String act_score = "Z"; try { act_score = (act_line[act_line.length - 1]); } catch (Exception ex) { } String email = act_line[idEmail]; if (count_dictionary.containsKey(email) && count_dictionary.get(email) > 0) { if (max_score.compareTo(act_score) > 0 && !act_score.equals("")) { winner_line = act_line; max_score = act_score; } count_dictionary.put(email, 0); } } if (winner_line != null) { try { for (int i = 0; i < winner_line.length; i++) winner_line[i] = "\"" + winner_line[i] + "\""; FileUtils.write(norepeat_output_file, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); } catch (IOException ex) { Logger.getLogger("root").error(ex.toString()); } } else { // try { // FileUtils.write(output_file_wor_notfound, StringUtil.join(Arrays.asList(winner_line), String.valueOf(CSV_SEPARATOR)) + "\r\n", "UTF-8", true); // } catch (IOException ex) { // Logger.getLogger("root").error(ex.toString()); // } } } reader.close(); // if (!finish) { 
FileUtils.copyFile(alternate_file_d, alternate_file_s); alternate_file_s = new File(output_file.getParentFile(), alternate_filename_1); alternate_file_d = new File(output_file.getParentFile(), alternate_filename_2); } } FileUtils.forceDelete(alternate_file_s); FileUtils.forceDelete(alternate_file_d); Logger.getLogger("root").info("Applying deduplication algoritm - Finish"); } catch (Exception ex) { String error_msg = "Error extracting emails from extractor " + input_file.getName(); Logger.getRootLogger().error(error_msg + " - " + ex.toString()); if (error_sw != null) error_sw.append(error_msg + "\r\n"); return; } } }
From source file:fr.free.coup2lapan.HistoricActivity.java
@Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_historic_main); // Show the Up button in the action bar. setupActionBar();//from w w w . j ava 2s. c om logHistoric = (TextView) findViewById(R.id.historic_log); final Button refreshbutton = (Button) findViewById(R.id.historic_refresh); refreshbutton.setOnClickListener(new View.OnClickListener() { public void onClick(View v) { // TODO read the log file and print XX last entries /*FileReader logFR = null; File batteryLogFile = new File(getApplicationContext().getFilesDir(), getString(R.string.fileName)); try { logFR = new FileReader(batteryLogFile); @SuppressWarnings("resource") BufferedReader logBR = new BufferedReader(logFR); String line = logBR.readLine(); while (null != line) { logHistoric.append(line); logHistoric.append("\n"); line = logBR.readLine(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { if (null != logFR) { try { logFR.close(); } catch (IOException e) { // ignore } } }*/ try { InputStream in = openFileInput(getString(R.string.fileName)); if (in != null) { InputStreamReader tmp = new InputStreamReader(in); BufferedReader reader = new BufferedReader(tmp); String str; StringBuilder buf = new StringBuilder(); while ((str = reader.readLine()) != null) { buf.append(str + "\n"); } in.close(); logHistoric.setText(buf.toString()); } } catch (java.io.FileNotFoundException e) { Toast.makeText(getApplicationContext(), "Exception: " + e.toString(), Toast.LENGTH_LONG).show(); } catch (Throwable t) { Toast.makeText(getApplicationContext(), "Exception: " + t.toString(), Toast.LENGTH_LONG).show(); } //logHistoric.setText("TODO : Lire fichier de mesures"); } }); final Button cleanbutton = (Button) findViewById(R.id.historic_clean); cleanbutton.setOnClickListener(new View.OnClickListener() { public void onClick(View v) { logHistoric.setText(""); } }); }
From source file:org.montanafoodhub.base.get.BuyerHub.java
protected HashMap<String, Buyer> readFromFile(Context context) { HashMap<String, Buyer> myBuyerMap = new HashMap<String, Buyer>(); try {//from w w w . ja v a 2s . c o m // getItem the time the file was last changed here File myFile = new File(context.getFilesDir() + "/" + fileName); SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss"); String lastRefreshTSStr = sdf.format(myFile.lastModified()); Log.w(HubInit.logTag, "Using file (" + fileName + ") last modified on : " + lastRefreshTSStr); lastRefreshTS = sdf.getCalendar(); // create products from the file here InputStream inputStream = context.openFileInput(fileName); if (inputStream != null) { parseCSV(myBuyerMap, inputStream); } inputStream.close(); } catch (FileNotFoundException e) { Log.e(HubInit.logTag, "File (" + fileName + ") not found: " + e.toString()); } catch (IOException e) { Log.e(HubInit.logTag, "Can not read file (" + fileName + ") : " + e.toString()); } Log.w(HubInit.logTag, "Number of buyers loaded: " + myBuyerMap.size()); return myBuyerMap; }
From source file:org.montanafoodhub.base.get.ProducerHub.java
protected HashMap<String, Producer> readFromFile(Context context) { HashMap<String, Producer> myProducerMap = new HashMap<String, Producer>(); try {/*from w w w . j av a2 s.co m*/ // getItem the time the file was last changed here File myFile = new File(context.getFilesDir() + "/" + fileName); SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss"); String lastRefreshTSStr = sdf.format(myFile.lastModified()); Log.w(HubInit.logTag, "Using file (" + fileName + ") last modified on : " + lastRefreshTSStr); lastRefreshTS = sdf.getCalendar(); // create products from the file here InputStream inputStream = context.openFileInput(fileName); if (inputStream != null) { parseCSV(myProducerMap, inputStream); } inputStream.close(); } catch (FileNotFoundException e) { Log.e(HubInit.logTag, "File (" + fileName + ") not found: " + e.toString()); } catch (IOException e) { Log.e(HubInit.logTag, "Can not read file (" + fileName + ") : " + e.toString()); } Log.w(HubInit.logTag, "Number of producers loaded: " + myProducerMap.size()); return myProducerMap; }
From source file:org.montanafoodhub.base.get.InitHub.java
protected void readFromFile(Context context) { try {//from w w w . j a v a2 s.c o m // getItem the time the file was last changed here File myFile = new File(context.getFilesDir() + "/" + fileName); SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss"); String lastRefreshTSStr = sdf.format(myFile.lastModified()); Log.w(HubInit.logTag, "Using file (" + fileName + ") last modified on : " + lastRefreshTSStr); lastRefreshTS = sdf.getCalendar(); // *** last line in the init wins!!! InputStream inputStream = context.openFileInput(fileName); if (inputStream != null) { parseCSV(inputStream); } inputStream.close(); } catch (FileNotFoundException e) { Log.e(HubInit.logTag, "File (" + fileName + ") not found: " + e.toString()); } catch (IOException e) { Log.e(HubInit.logTag, "Can not read file (" + fileName + ") : " + e.toString()); } Log.w(HubInit.logTag, "Hub name initialized to: " + HubInit.getHubName()); }
From source file:org.ounl.lifelonglearninghub.nfcecology.fcube.MainCubeActivity.java
/**
 * Loads the jukebox settings from the file
 * {@code Constants.JUKEBOX_PROPERTIES_FILE}.
 *
 * <p>A missing or unreadable file is logged and does not propagate an
 * exception.
 *
 * @return the loaded properties; never {@code null}, and empty when the file
 *         could not be opened
 */
private Properties getJukeboxFromFile() {
    Properties props = new Properties();
    try {
        InputStream inputStream = openFileInput(Constants.JUKEBOX_PROPERTIES_FILE);
        if (inputStream != null) {
            try {
                props.load(inputStream);
            } finally {
                // Properties.load() leaves the stream open, so close it here
                // whether or not loading succeeded.
                inputStream.close();
            }
        }
    } catch (FileNotFoundException e) {
        Log.e(CLASSNAME, "File not found: " + e.toString());
    } catch (IOException e) {
        Log.e(CLASSNAME, "Can not read file: " + e.toString());
    }
    return props;
}
From source file:stargate.drivers.transport.http.HTTPTransportServlet.java
/**
 * REST endpoint returning the metadata of the data object at {@code path}.
 *
 * @param path value of the "path" query parameter; the empty string when the
 *             parameter is absent
 * @return 200 with the metadata wrapped in a {@code RestfulResponse};
 *         404 carrying the exception's string form when the lookup throws
 *         {@link FileNotFoundException}; 500 with the exception wrapped in a
 *         {@code RestfulResponse} for any other exception
 */
@GET
@Path(HTTPTransportRestfulConstants.RESTFUL_METADATA_PATH)
@Produces(MediaType.APPLICATION_JSON)
public Response getDataObjectMetadataRestful(@DefaultValue("") @QueryParam("path") String path) {
    try {
        RestfulResponse<DataObjectMetadata> payload =
                new RestfulResponse<DataObjectMetadata>(getDataObjectMetadata(path));
        return Response.status(Response.Status.OK)
                .entity(payload)
                .build();
    } catch (FileNotFoundException notFound) {
        // Missing object: plain string body, not wrapped in a RestfulResponse.
        String description = notFound.toString();
        return Response.status(Response.Status.NOT_FOUND)
                .entity(description)
                .build();
    } catch (Exception failure) {
        RestfulResponse<DataObjectMetadata> errorPayload =
                new RestfulResponse<DataObjectMetadata>(failure);
        return Response.status(Response.Status.INTERNAL_SERVER_ERROR)
                .entity(errorPayload)
                .build();
    }
}
From source file:org.kuali.kfs.module.tem.batch.service.impl.TemProfileExportServiceImpl.java
public void exportProfile() { List<TemProfile> profiles = temProfileService.getAllActiveTemProfile(); //Accessing EXPORT_FILE_FORMAT sys param for export file extension LOG.info("Accessing EXPORT_FILE_FORMAT system parameter for file extension"); String extension = parameterService.getParameterValueAsString(TemProfileExportStep.class, TemConstants.TemProfileParameters.EXPORT_FILE_FORMAT); //Creating export file name String exportFile = fileDirectoryName + File.separator + fileName + "." + extension; //Initializing the output stream PrintStream OUTPUT_GLE_FILE_ps = null; try {//from w ww .ja v a 2 s .c om OUTPUT_GLE_FILE_ps = new PrintStream(exportFile); } catch (FileNotFoundException ex) { throw new RuntimeException(ex.toString(), ex); } //Create file based on extension if (extension.equalsIgnoreCase("xml")) { try { OUTPUT_GLE_FILE_ps.printf("%s\n", generateXMLDoc(profiles)); } catch (ParserConfigurationException ex) { throw new RuntimeException(ex.toString(), ex); } catch (TransformerException ex) { throw new RuntimeException(ex.toString(), ex); } } else { OUTPUT_GLE_FILE_ps.printf("%s\n", dateTimeService.toDateTimeString(dateTimeService.getCurrentDate()) + "," + parameterService.getParameterValueAsString( KfsParameterConstants.FINANCIAL_SYSTEM_ALL.class, KfsParameterConstants.INSTITUTION_NAME)); for (TemProfile profile : profiles) { try { OUTPUT_GLE_FILE_ps.printf("%s\n", generateCSVEntry(profile)); } catch (Exception e) { throw new RuntimeException(e.toString(), e); } } } OUTPUT_GLE_FILE_ps.close(); }