List of usage examples for org.apache.commons.csv CSVFormat newFormat
public static CSVFormat newFormat(final char delimiter)
From source file:com.datascience.cascading.CsvSchemeTest.java
/** * Tests the CSV scheme source with detected headers. */// w w w . j a v a 2 s.c o m private void testCsvSourceDetectHeaders(String inputPath) throws Exception { String sinkPath = "src/test/resources/output/source-detect-headers"; String expectedPath = "src/test/resources/expected/with-headers.txt"; CSVFormat sourceFormat = CSVFormat.newFormat(',').withQuote('"').withSkipHeaderRecord().withEscape('\\') .withRecordSeparator('\n'); CSVFormat sinkFormat = CSVFormat.newFormat('\t').withEscape('\\').withRecordSeparator('\n'); testScheme(inputPath, sourceFormat, sinkPath, sinkFormat, expectedPath, true); }
From source file:ai.grakn.migration.csv.CSVMigrator.java
/** * Each String in the stream is a CSV file * @return stream of parsed insert queries *///from w w w. ja va 2 s. c om public Stream<Map<String, Object>> convert() { try { CSVParser csvParser = CSVFormat.newFormat(separator).withIgnoreEmptyLines().withEscape('\\') .withFirstRecordAsHeader().withQuote(quote).withNullString(nullString).parse(reader); return stream(csvParser.iterator()).map(this::parse); } catch (IOException e) { throw new RuntimeException(e); } }
From source file:com.datascience.hadoop.CsvInputFormat.java
/**
 * Builds a CSV reader format from Hadoop configuration settings, falling
 * back to the compiled-in defaults for any option not set.
 *
 * @param conf the Hadoop configuration supplying CSV reader options
 * @return the fully configured CSV format
 */
private static CSVFormat createFormat(Configuration conf) {
    char delimiter = conf.get(CSV_READER_DELIMITER, DEFAULT_CSV_READER_DELIMITER).charAt(0);
    CSVFormat result = CSVFormat.newFormat(delimiter)
            .withSkipHeaderRecord(conf.getBoolean(CSV_READER_SKIP_HEADER, DEFAULT_CSV_READER_SKIP_HEADER))
            .withRecordSeparator(conf.get(CSV_READER_RECORD_SEPARATOR, DEFAULT_CSV_READER_RECORD_SEPARATOR))
            .withIgnoreEmptyLines(
                    conf.getBoolean(CSV_READER_IGNORE_EMPTY_LINES, DEFAULT_CSV_READER_IGNORE_EMPTY_LINES))
            .withIgnoreSurroundingSpaces(conf.getBoolean(CSV_READER_IGNORE_SURROUNDING_SPACES,
                    DEFAULT_CSV_READER_IGNORE_SURROUNDING_SPACES))
            .withNullString(conf.get(CSV_READER_NULL_STRING, DEFAULT_CSV_READER_NULL_STRING));

    // Explicit column names, if configured, take the place of a header row.
    String[] columns = conf.getStrings(CSV_READER_COLUMNS);
    if (columns != null && columns.length > 0) {
        result = result.withHeader(columns);
    }

    String escapeChar = conf.get(CSV_READER_ESCAPE_CHARACTER, DEFAULT_CSV_READER_ESCAPE_CHARACTER);
    if (escapeChar != null) {
        result = result.withEscape(escapeChar.charAt(0));
    }

    String quoteChar = conf.get(CSV_READER_QUOTE_CHARACTER, DEFAULT_CSV_READER_QUOTE_CHARACTER);
    if (quoteChar != null) {
        result = result.withQuote(quoteChar.charAt(0));
    }

    String quoteModeName = conf.get(CSV_READER_QUOTE_MODE, DEFAULT_CSV_READER_QUOTE_MODE);
    if (quoteModeName != null) {
        result = result.withQuoteMode(QuoteMode.valueOf(quoteModeName));
    }

    return result;
}
From source file:com.datascience.cascading.CsvSchemeTest.java
/** * Tests the CSV scheme source with generated headers. */// w w w. j a v a 2 s . co m @Test public void testCsvSourceGenerateHeaders() throws Exception { String sourcePath = "src/test/resources/input/without-headers.txt"; String sinkPath = "src/test/resources/output/source-generate-headers"; String expectedPath = "src/test/resources/expected/with-generated-headers.txt"; CSVFormat sourceFormat = CSVFormat.newFormat(',').withQuote('"').withEscape('\\').withRecordSeparator('\n'); CSVFormat sinkFormat = CSVFormat.newFormat('\t').withEscape('\\').withRecordSeparator('\n'); testScheme(sourcePath, sourceFormat, sinkPath, sinkFormat, expectedPath, true); }
From source file:com.datascience.cascading.CsvSchemeTest.java
/**
 * Verifies that strict parsing of a malformed CSV source raises a
 * RuntimeException rather than producing output.
 *
 * @throws Exception the expected failure for strict parsing of bad input
 */
@Test(expected = RuntimeException.class)
public void testBadCsvSourceStrict() throws Exception {
    String inputPath = "src/test/resources/input/bad-without-headers.txt";
    String outputPath = "src/test/resources/output/bad-fail-headers";
    String expectedOutput = "src/test/resources/expected/bad-generated-headers.txt";
    CSVFormat readerFormat = CSVFormat.newFormat('\t')
            .withQuote('"')
            .withEscape('\\')
            .withRecordSeparator('\n');
    CSVFormat writerFormat = CSVFormat.newFormat('\t')
            .withEscape('\\')
            .withRecordSeparator('\n');
    testScheme(inputPath, readerFormat, outputPath, writerFormat, expectedOutput, true);
}
From source file:edu.isi.misd.scanner.network.modules.worker.processors.ptr.PrepToResearchProcessor.java
private PrepToResearchResponse analyzeFile(PrepToResearchRequest request, File analysisFile) throws Exception { PrepToResearchResponse response = new PrepToResearchResponse(); Integer requestedOmopConceptID = request.getOmopConceptID(); CSVFormat csvFormat = CSVFormat.newFormat(',').withHeader().withCommentMarker('#').withQuote('"'); CSVParser parser = CSVParser.parse(analysisFile, Charset.defaultCharset(), csvFormat); for (CSVRecord csvRecord : parser) { try {/*w w w .j a v a 2 s . c o m*/ this.validateCSVRecord(csvRecord); // check the ID first, if no match continue Integer omopConceptID = Integer .parseInt(csvRecord.get(ExpectedColumnName.OMOP_CONCEPT_ID.toString())); if (!requestedOmopConceptID.equals(omopConceptID)) { continue; } // match found, create response output record if (log.isDebugEnabled()) { log.debug(String.format("Found a match for requested ID %s, record: %s", requestedOmopConceptID, csvRecord.toString())); } PrepToResearchRecord ptrRecord = new PrepToResearchRecord(); ptrRecord.setOmopConceptID(omopConceptID); ptrRecord.setOmopConceptName(csvRecord.get(ExpectedColumnName.OMOP_CONCEPT_NAME)); ptrRecord.setCategory(csvRecord.get(ExpectedColumnName.CATEGORY)); ptrRecord.setCategoryValue(csvRecord.get(ExpectedColumnName.CATEGORY_VALUE)); ptrRecord.setCountFemales(Integer.parseInt(csvRecord.get(ExpectedColumnName.COUNT_FEMALES))); ptrRecord.setCountMales(Integer.parseInt(csvRecord.get(ExpectedColumnName.COUNT_MALES))); ptrRecord.setCountTotal(Integer.parseInt(csvRecord.get(ExpectedColumnName.COUNT_TOTAL))); response.getPrepToResearchRecord().add(ptrRecord); } catch (Exception e) { String error = String.format( "An exception occured while processing row number %s with the following values %s: %s", csvRecord.getRecordNumber(), csvRecord.toString(), e.toString()); parser.close(); throw new RuntimeException(error); } } parser.close(); return response; }
From source file:co.cask.hydrator.transforms.CSVFormatter.java
@Override public void transform(StructuredRecord record, Emitter<StructuredRecord> emitter) throws Exception { String csvRecord = ""; List<Object> values = Lists.newArrayList(); for (Schema.Field field : record.getSchema().getFields()) { values.add(record.get(field.getName())); }// w w w .j ava 2 s .com StringWriter writer = new StringWriter(); CSVPrinter printer = null; CSVFormat csvFileFormat = null; switch (config.format.toLowerCase()) { case "delimited": csvFileFormat = CSVFormat.newFormat(delim).withQuote('"').withRecordSeparator("\r\n") .withIgnoreEmptyLines(); printer = new CSVPrinter(writer, csvFileFormat); break; case "excel": csvFileFormat = CSVFormat.Predefined.Excel.getFormat(); printer = new CSVPrinter(writer, csvFileFormat); break; case "mysql": csvFileFormat = CSVFormat.Predefined.MySQL.getFormat(); printer = new CSVPrinter(writer, csvFileFormat); break; case "tdf": csvFileFormat = CSVFormat.Predefined.TDF.getFormat(); printer = new CSVPrinter(writer, csvFileFormat); break; case "rfc4180": csvFileFormat = CSVFormat.Predefined.TDF.getFormat(); printer = new CSVPrinter(writer, csvFileFormat); break; } if (printer != null) { printer.printRecord(values); csvRecord = writer.toString(); emitter.emit(StructuredRecord.builder(outSchema).set(outSchema.getFields().get(0).getName(), csvRecord) .build()); } }
From source file:com.datascience.cascading.CsvSchemeTest.java
/**
 * Verifies that non-strict parsing tolerates a malformed CSV source and
 * still produces the expected (best-effort) output.
 *
 * @throws Exception if the scheme round-trip fails
 */
@Test
public void testBadCsvSourceNotStrict() throws Exception {
    String inputPath = "src/test/resources/input/bad-without-headers.txt";
    String outputPath = "src/test/resources/output/bad-generate-headers";
    String expectedOutput = "src/test/resources/expected/bad-generated-headers.txt";
    CSVFormat readerFormat = CSVFormat.newFormat('\t')
            .withQuote('"')
            .withEscape('\\')
            .withRecordSeparator('\n');
    CSVFormat writerFormat = CSVFormat.newFormat('\t')
            .withEscape('\\')
            .withRecordSeparator('\n');
    testScheme(inputPath, readerFormat, outputPath, writerFormat, expectedOutput, false);
}
From source file:co.cask.hydrator.plugin.CSVFormatter.java
@Override public void initialize(TransformContext context) throws Exception { super.initialize(context); try {/* ww w.j a va 2 s . c om*/ outSchema = Schema.parseJson(config.schema); fields = outSchema.getFields(); } catch (IOException e) { throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format."); } // Based on the delimiter name specified pick the delimiter to be used for the record. // This is only applicable when the format type is choosen as DELIMITER char delim = ','; if (delimMap.containsKey(config.delimiter)) { delim = delimMap.get(config.delimiter).charAt(0); } else { throw new IllegalArgumentException("Unknown delimiter '" + config.delimiter + "' specified. "); } // Create CSVFileFormat based on the format specified. switch (config.format.toLowerCase()) { case "delimited": csvFileFormat = CSVFormat.newFormat(delim).withQuote('"').withRecordSeparator("\r\n") .withIgnoreEmptyLines(); break; case "excel": csvFileFormat = CSVFormat.Predefined.Excel.getFormat(); break; case "mysql": csvFileFormat = CSVFormat.Predefined.MySQL.getFormat(); break; case "tdf": csvFileFormat = CSVFormat.Predefined.TDF.getFormat(); break; case "rfc4180": csvFileFormat = CSVFormat.Predefined.TDF.getFormat(); break; default: throw new RuntimeException("Unknown format specified for CSV. Please check the format."); } }
From source file:edu.harvard.mcz.imagecapture.loader.JobVerbatimFieldLoad.java
/**
 * Runs the verbatim field load job end to end: prompts the user for an
 * input file (unless one was preset), attempts to read it as CSV and falls
 * back to tab-delimited on IOException, then dispatches on the header set:
 * case 1a/1b loads barcode + verbatimUnclassifiedText (optionally with
 * verbatimClusterIdentifier), case 2 loads the full fixed set of verbatim
 * fields, and case 3 loads an arbitrary header set after validating it via
 * FieldLoader.checkHeaderList. Progress, per-row errors, and user
 * cancellations are accumulated in `counter`/`errors` and reported at the end.
 *
 * NOTE(review): this listing was extracted from a web page; several string
 * literals and expressions are split across physical lines. The code below
 * is preserved byte-for-byte, with comments added only at safe boundaries.
 */
@Override public void start() { startDateTime = new Date(); Singleton.getSingletonInstance().getJobList().addJob((RunnableJob) this); runStatus = RunStatus.STATUS_RUNNING; String selectedFilename = ""; if (file == null) { final JFileChooser fileChooser = new JFileChooser(); fileChooser.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES); if (Singleton.getSingletonInstance().getProperties().getProperties() .getProperty(ImageCaptureProperties.KEY_LASTLOADPATH) != null) { fileChooser.setCurrentDirectory(new File(Singleton.getSingletonInstance().getProperties() .getProperties().getProperty(ImageCaptureProperties.KEY_LASTLOADPATH))); }//from w w w. j ava 2 s. c om int returnValue = fileChooser.showOpenDialog(Singleton.getSingletonInstance().getMainFrame()); if (returnValue == JFileChooser.APPROVE_OPTION) { file = fileChooser.getSelectedFile(); } } if (file != null) { log.debug("Selected file to load: " + file.getName() + "."); if (file.exists() && file.isFile() && file.canRead()) { // Save location Singleton.getSingletonInstance().getProperties().getProperties() .setProperty(ImageCaptureProperties.KEY_LASTLOADPATH, file.getPath()); selectedFilename = file.getName(); String[] headers = new String[] {}; CSVFormat csvFormat = CSVFormat.DEFAULT.withHeader(headers); int rows = 0; try { rows = readRows(file, csvFormat); } catch (FileNotFoundException e) { JOptionPane.showMessageDialog(Singleton.getSingletonInstance().getMainFrame(), "Unable to load data, file not found: " + e.getMessage(), "Error: File Not Found", JOptionPane.OK_OPTION); errors.append("File not found ").append(e.getMessage()).append("\n"); log.error(e.getMessage(), e); } catch (IOException e) { errors.append("Error loading csv format, trying tab delimited: ").append(e.getMessage()) .append("\n"); log.debug(e.getMessage()); try { // try reading as tab delimited format, if successful, use that format. 
// Fallback: default CSV parsing failed above, so the same file is retried as
// tab-delimited (ignoring surrounding spaces); on success that format is kept.
CSVFormat tabFormat = CSVFormat.newFormat('\t').withIgnoreSurroundingSpaces(true) .withHeader(headers).withQuote('"'); rows = readRows(file, tabFormat); csvFormat = tabFormat; } catch (IOException e1) { errors.append("Error Loading data: ").append(e1.getMessage()).append("\n"); log.error(e.getMessage(), e1); } } try { Reader reader = new FileReader(file); CSVParser csvParser = new CSVParser(reader, csvFormat); Map<String, Integer> csvHeader = csvParser.getHeaderMap(); headers = new String[csvHeader.size()]; int i = 0; for (String header : csvHeader.keySet()) { headers[i++] = header; log.debug(header); } boolean okToRun = true; //TODO: Work picking/checking responsibility into a FieldLoaderWizard List<String> headerList = Arrays.asList(headers); if (!headerList.contains("barcode")) { log.error("Input file " + file.getName() + " header does not contain required field 'barcode'."); // no barcode field, we can't match the input to specimen records. errors.append("Field \"barcode\" not found in csv file headers. 
Unable to load data.") .append("\n"); okToRun = false; } if (okToRun) { Iterator<CSVRecord> iterator = csvParser.iterator(); FieldLoader fl = new FieldLoader(); if (headerList.size() == 3 && headerList.contains("verbatimUnclassifiedText") && headerList.contains("questions") && headerList.contains("barcode")) { log.debug("Input file matches case 1: Unclassified text only."); // Allowed case 1a: unclassified text only int confirm = JOptionPane.showConfirmDialog( Singleton.getSingletonInstance().getMainFrame(), "Confirm load from file " + selectedFilename + " (" + rows + " rows) with just barcode and verbatimUnclassifiedText", "Verbatim unclassified Field found for load", JOptionPane.OK_CANCEL_OPTION); if (confirm == JOptionPane.OK_OPTION) { String barcode = ""; int lineNumber = 0; while (iterator.hasNext()) { lineNumber++; counter.incrementSpecimens(); CSVRecord record = iterator.next(); try { String verbatimUnclassifiedText = record.get("verbatimUnclassifiedText"); barcode = record.get("barcode"); String questions = record.get("questions"); fl.load(barcode, verbatimUnclassifiedText, questions, true); counter.incrementSpecimensUpdated(); } catch (IllegalArgumentException e) { RunnableJobError error = new RunnableJobError(file.getName(), barcode, Integer.toString(lineNumber), e.getClass().getSimpleName(), e, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(error); log.error(e.getMessage(), e); } catch (LoadException e) { RunnableJobError error = new RunnableJobError(file.getName(), barcode, Integer.toString(lineNumber), e.getClass().getSimpleName(), e, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(error); log.error(e.getMessage(), e); } percentComplete = (int) ((lineNumber * 100f) / rows); this.setPercentComplete(percentComplete); } } else { errors.append("Load canceled by user.").append("\n"); } } else if (headerList.size() == 4 && headerList.contains("verbatimUnclassifiedText") && headerList.contains("questions") && headerList.contains("barcode") && 
headerList.contains("verbatimClusterIdentifier")) { log.debug( "Input file matches case 1: Unclassified text only (with cluster identifier)."); // Allowed case 1b: unclassified text only (including cluster identifier) int confirm = JOptionPane.showConfirmDialog( Singleton.getSingletonInstance().getMainFrame(), "Confirm load from file " + selectedFilename + " (" + rows + " rows) with just barcode and verbatimUnclassifiedText", "Verbatim unclassified Field found for load", JOptionPane.OK_CANCEL_OPTION); if (confirm == JOptionPane.OK_OPTION) { String barcode = ""; int lineNumber = 0; while (iterator.hasNext()) { lineNumber++; counter.incrementSpecimens(); CSVRecord record = iterator.next(); try { String verbatimUnclassifiedText = record.get("verbatimUnclassifiedText"); String verbatimClusterIdentifier = record.get("verbatimClusterIdentifier"); barcode = record.get("barcode"); String questions = record.get("questions"); fl.load(barcode, verbatimUnclassifiedText, verbatimClusterIdentifier, questions, true); counter.incrementSpecimensUpdated(); } catch (IllegalArgumentException e) { RunnableJobError error = new RunnableJobError(file.getName(), barcode, Integer.toString(lineNumber), e.getClass().getSimpleName(), e, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(error); log.error(e.getMessage(), e); } catch (LoadException e) { RunnableJobError error = new RunnableJobError(file.getName(), barcode, Integer.toString(lineNumber), e.getClass().getSimpleName(), e, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(error); log.error(e.getMessage(), e); } percentComplete = (int) ((lineNumber * 100f) / rows); this.setPercentComplete(percentComplete); } } else { errors.append("Load canceled by user.").append("\n"); } } else if (headerList.size() == 8 && headerList.contains("verbatimUnclassifiedText") && headerList.contains("questions") && headerList.contains("barcode") && headerList.contains("verbatimLocality") && headerList.contains("verbatimDate") && 
headerList.contains("verbatimNumbers") && headerList.contains("verbatimCollector") && headerList.contains("verbatimCollection")) { // Allowed case two, transcription into verbatim fields, must be exact list of all // verbatim fields, not including cluster identifier or other metadata. log.debug("Input file matches case 2: Full list of verbatim fields."); int confirm = JOptionPane.showConfirmDialog( Singleton.getSingletonInstance().getMainFrame(), "Confirm load from file " + selectedFilename + " (" + rows + " rows) with just barcode and verbatim fields.", "Verbatim Fields found for load", JOptionPane.OK_CANCEL_OPTION); if (confirm == JOptionPane.OK_OPTION) { String barcode = ""; int lineNumber = 0; while (iterator.hasNext()) { lineNumber++; counter.incrementSpecimens(); CSVRecord record = iterator.next(); try { String verbatimLocality = record.get("verbatimLocality"); String verbatimDate = record.get("verbatimDate"); String verbatimCollector = record.get("verbatimCollector"); String verbatimCollection = record.get("verbatimCollection"); String verbatimNumbers = record.get("verbatimNumbers"); String verbatimUnclasifiedText = record.get("verbatimUnclassifiedText"); barcode = record.get("barcode"); String questions = record.get("questions"); fl.load(barcode, verbatimLocality, verbatimDate, verbatimCollector, verbatimCollection, verbatimNumbers, verbatimUnclasifiedText, questions); counter.incrementSpecimensUpdated(); } catch (IllegalArgumentException e) { RunnableJobError error = new RunnableJobError(file.getName(), barcode, Integer.toString(lineNumber), e.getClass().getSimpleName(), e, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(error); log.error(e.getMessage(), e); } catch (LoadException e) { RunnableJobError error = new RunnableJobError(file.getName(), barcode, Integer.toString(lineNumber), e.getClass().getSimpleName(), e, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(error); log.error(e.getMessage(), e); } percentComplete = (int) ((lineNumber 
* 100f) / rows); this.setPercentComplete(percentComplete); } } else { errors.append("Load canceled by user.").append("\n"); } } else { // allowed case three, transcription into arbitrary sets verbatim or other fields log.debug("Input file case 3: Arbitrary set of fields."); // Check column headers before starting run. boolean headersOK = false; try { HeaderCheckResult headerCheck = fl.checkHeaderList(headerList); if (headerCheck.isResult()) { int confirm = JOptionPane.showConfirmDialog( Singleton.getSingletonInstance().getMainFrame(), "Confirm load from file " + selectedFilename + " (" + rows + " rows) with headers: \n" + headerCheck.getMessage().replaceAll(":", ":\n"), "Fields found for load", JOptionPane.OK_CANCEL_OPTION); if (confirm == JOptionPane.OK_OPTION) { headersOK = true; } else { errors.append("Load canceled by user.").append("\n"); } } else { int confirm = JOptionPane.showConfirmDialog( Singleton.getSingletonInstance().getMainFrame(), "Problem found with headers in file, try to load anyway?\nHeaders: \n" + headerCheck.getMessage().replaceAll(":", ":\n"), "Problem in fields for load", JOptionPane.OK_CANCEL_OPTION); if (confirm == JOptionPane.OK_OPTION) { headersOK = true; } else { errors.append("Load canceled by user.").append("\n"); } } } catch (LoadException e) { errors.append("Error loading data: \n").append(e.getMessage()).append("\n"); JOptionPane.showMessageDialog(Singleton.getSingletonInstance().getMainFrame(), e.getMessage().replaceAll(":", ":\n"), "Error Loading Data: Problem Fields", JOptionPane.ERROR_MESSAGE); log.error(e.getMessage(), e); } if (headersOK) { int lineNumber = 0; while (iterator.hasNext()) { lineNumber++; Map<String, String> data = new HashMap<String, String>(); CSVRecord record = iterator.next(); String barcode = record.get("barcode"); Iterator<String> hi = headerList.iterator(); boolean containsNonVerbatim = false; while (hi.hasNext()) { String header = hi.next(); // Skip any fields prefixed by the underscore character _ if 
(!header.equals("barcode") && !header.startsWith("_")) { data.put(header, record.get(header)); if (!header.equals("questions") && MetadataRetriever.isFieldExternallyUpdatable(Specimen.class, header) && MetadataRetriever.isFieldVerbatim(Specimen.class, header)) { containsNonVerbatim = true; } } } if (data.size() > 0) { try { boolean updated = false; if (containsNonVerbatim) { updated = fl.loadFromMap(barcode, data, WorkFlowStatus.STAGE_CLASSIFIED, true); } else { updated = fl.loadFromMap(barcode, data, WorkFlowStatus.STAGE_VERBATIM, true); } counter.incrementSpecimens(); if (updated) { counter.incrementSpecimensUpdated(); } } catch (HibernateException e1) { // Catch (should just be development) problems with the underlying query StringBuilder message = new StringBuilder(); message.append("Query Error loading row (").append(lineNumber) .append(")[").append(barcode).append("]") .append(e1.getMessage()); RunnableJobError err = new RunnableJobError(selectedFilename, barcode, Integer.toString(lineNumber), e1.getMessage(), e1, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(err); log.error(e1.getMessage(), e1); } catch (LoadException e) { StringBuilder message = new StringBuilder(); message.append("Error loading row (").append(lineNumber).append(")[") .append(barcode).append("]").append(e.getMessage()); RunnableJobError err = new RunnableJobError(selectedFilename, barcode, Integer.toString(lineNumber), e.getMessage(), e, RunnableJobError.TYPE_LOAD_FAILED); counter.appendError(err); // errors.append(message.append("\n").toString()); log.error(e.getMessage(), e); } } percentComplete = (int) ((lineNumber * 100f) / rows); this.setPercentComplete(percentComplete); } } else { String message = "Can't load data, problem with headers."; errors.append(message).append("\n"); log.error(message); } } } csvParser.close(); reader.close(); } catch (FileNotFoundException e) { JOptionPane.showMessageDialog(Singleton.getSingletonInstance().getMainFrame(), "Unable to load data, file 
not found: " + e.getMessage(), "Error: File Not Found", JOptionPane.OK_OPTION); errors.append("File not found ").append(e.getMessage()).append("\n"); log.error(e.getMessage(), e); } catch (IOException e) { errors.append("Error Loading data: ").append(e.getMessage()).append("\n"); log.error(e.getMessage(), e); } } } else { //TODO: handle error condition log.error("File selection cancelled by user."); } report(selectedFilename); done(); }