Usage examples for the org.apache.commons.csv CSVFormat.parse(Reader) method.
public CSVParser parse(final Reader in) throws IOException
From source file: org.transitime.custom.sfmta.delayTimes.Loc.java
public static List<Loc> readLocs(String fileName) { List<Loc> locs = new ArrayList<Loc>(); try {// www. jav a 2 s . c om Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8")); CSVFormat formatter = CSVFormat.DEFAULT.withHeader().withCommentMarker('-'); // Parse the file Iterable<CSVRecord> records = formatter.parse(in); Iterator<CSVRecord> iterator = records.iterator(); while (iterator.hasNext()) { // Determine the record to process CSVRecord record = iterator.next(); Loc loc = getLoc(record); if (loc.accuracy < MAX_ALLOWED_ACCURACY) locs.add(loc); } } catch (Exception e) { e.printStackTrace(); } return locs; }
From source file: org.transitime.utils.csv.CsvBaseReader.java
/** * Parse the CSV file. Reads in the header info and then each line. Calls * the abstract handleRecord() method for each record. Adds each resulting * CSV object to the gtfsObjecgts array. */// ww w . ja va 2 s .c om private void parse() { CSVRecord record = null; try { IntervalTimer timer = new IntervalTimer(); logger.debug("Parsing CSV file {} ...", fileName); // Open the file for reading. Use UTF-8 format since that will work // for both regular ASCII format and UTF-8 extended format files // since UTF-8 was designed to be backwards compatible with ASCII. // This way will work for Chinese and other character sets. Use // InputStreamReader so can specify that using UTF-8 format. Use // BufferedReader so that can determine if first character is an // optional BOM (Byte Order Mark) character used to indicate that // file is in UTF-8 format. BufferedReader allows us to read in // first character and then discard if it is a BOM character or // reset the reader to back to the beginning if it is not. This // way the CSV parser will process the file starting with the first // true character. Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8")); // Deal with the possible BOM character at the beginning of the file in.mark(1); int firstRead = in.read(); final int BOM_CHARACTER = 0xFEFF; if (firstRead != BOM_CHARACTER) in.reset(); // Get ready to parse the CSV file. // Allow lines to be comments if they start with "-" so that can // easily comment out problems and also test what happens when // certain data is missing. Using the '-' character so can // comment out line that starts with "--", which is what is // used for SQL. CSVFormat formatter = CSVFormat.DEFAULT.withHeader().withCommentMarker('-'); // Parse the file Iterable<CSVRecord> records = formatter.parse(in); logger.debug("Finished CSV parsing of file {}. 
Took {} msec.", fileName, timer.elapsedMsec()); int lineNumberWhenLogged = 0; timer = new IntervalTimer(); IntervalTimer loggingTimer = new IntervalTimer(); Iterator<CSVRecord> iterator = records.iterator(); while (iterator.hasNext()) { // Determine the record to process record = iterator.next(); // If blank line then skip it. This way avoid error messages since // expected data column won't exist if (record.size() == 0) continue; // Process the record using appropriate handler // and create the corresponding CSV object T gtfsObject; try { gtfsObject = handleRecord(record, supplemental); } catch (ParseException e) { logger.error("ParseException occurred for record {} " + "(comment lines not included when determing record #) for " + "filename {} . {}", record.getRecordNumber(), fileName, e.getMessage()); // Continue even though there was an error so that all errors // logged at once. continue; } catch (NumberFormatException e) { logger.error("NumberFormatException occurred for record {} " + "(comment lines not included when determing record #) " + "for filename {} . {}", record.getRecordNumber(), fileName, e.getMessage()); // Continue even though there was an error so that all errors // logged at once. continue; } // Add the newly created CSV object to the object list if (gtfsObject != null) gtfsObjects.add(gtfsObject); // Log info if it has been a while. Check only every 20,000 // lines to see if the 10 seconds has gone by. If so, then log // number of lines. By only looking at timer every 20,000 lines // not slowing things down by for every line doing system call // for to get current time. 
final int LINES_TO_PROCESS_BEFORE_CHECKING_IF_SHOULD_LOG = 20000; final long SECONDS_ELSAPSED_UNTIL_SHOULD_LOG = 5; if (record.getRecordNumber() >= lineNumberWhenLogged + LINES_TO_PROCESS_BEFORE_CHECKING_IF_SHOULD_LOG) { lineNumberWhenLogged = (int) record.getRecordNumber(); if (loggingTimer.elapsedMsec() > SECONDS_ELSAPSED_UNTIL_SHOULD_LOG * Time.MS_PER_SEC) { logger.info(" Processed {} lines. Took {} msec...", lineNumberWhenLogged, timer.elapsedMsec()); loggingTimer = new IntervalTimer(); } } } // End of while iterating over records // Close up the file reader in.close(); // Determine number of records for logging message long numberRecords = 0; if (record != null) numberRecords = record.getRecordNumber(); logger.info("Finished parsing {} records from file {} . Took {} msec.", numberRecords, fileName, timer.elapsedMsec()); } catch (FileNotFoundException e) { if (required) logger.error("Required CSV file {} not found.", fileName); else logger.info("CSV file {} not found but OK because this file " + "not required.", fileName); } catch (IOException e) { logger.error("IOException occurred when reading in filename {}.", fileName, e); } }
From source file: permafrost.tundra.data.IDataCSVParser.java
/**
 * Returns an IData representation of the CSV data in the given input stream.
 *
 * @param inputStream The input stream to be decoded.
 * @param charset The character set to use.
 * @return An IData representation of the given input stream data.
 * @throws IOException If there is a problem reading from the stream.
 */
@Override
public IData decode(InputStream inputStream, Charset charset) throws IOException {
    if (inputStream == null)
        return null;

    Reader reader = new InputStreamReader(inputStream, CharsetHelper.normalize(charset));
    // Header row gives named column access; empty fields decode to null.
    CSVFormat format = CSVFormat.DEFAULT.withHeader().withDelimiter(delimiter).withNullString("");

    List<IData> list = new ArrayList<IData>();

    // try-with-resources closes the parser (and its underlying reader) once
    // all records are consumed; previously neither was ever closed.
    try (CSVParser parser = format.parse(reader)) {
        Set<String> keys = parser.getHeaderMap().keySet();

        for (CSVRecord record : parser) {
            IData document = IDataFactory.create();
            IDataCursor cursor = document.getCursor();

            for (String key : keys) {
                // isSet guards against rows shorter than the header; the
                // null check skips fields that were empty in the source.
                if (record.isSet(key)) {
                    String value = record.get(key);
                    if (value != null)
                        IDataUtil.put(cursor, key, value);
                }
            }

            cursor.destroy();
            list.add(document);
        }
    }

    IData output = IDataFactory.create();
    IDataCursor cursor = output.getCursor();
    IDataUtil.put(cursor, "recordWithNoID", list.toArray(new IData[list.size()]));

    return output;
}
From source file: umich.ms.batmass.filesupport.files.types.mzrt.model.MzrtFile.java
/**
 * Loads the mz/rt feature file: auto-detects the record separator and the
 * field delimiter, then parses the CSV and resolves the required header
 * columns into {@code indexesMzRtColorOpacity}.
 *
 * @throws DataLoadingException if the file cannot be read, the separator or
 *         delimiter cannot be detected, or a required header column is missing
 */
public void load() throws DataLoadingException {
    // Counts of each candidate line terminator seen in the file:
    // [0] - \r\n, [1] - \n, [2] - \r
    int[] counts = new int[3];
    final String[] separators = { "\r\n", "\n", "\r" };

    // Pass 1: detect the line separator by counting terminator occurrences
    // character-by-character (stops early once any count exceeds the cutoff).
    try (InputStreamReader isr = new InputStreamReader(
            new BufferedInputStream(new FileInputStream(file.toFile())), Charsets.UTF_8)) {
        int c;
        int encountered = 0; // NOTE(review): unused; kept to preserve code verbatim
        boolean isPrevR = false; // whether the previous char was '\r'
        int cutoff = 50; // enough samples to decide; no need to scan the whole file
        readLoop: while ((c = isr.read()) != -1) {
            char ch = (char) c;
            switch (ch) {
            case '\r':
                // Tentatively count a bare '\r'; corrected below if '\n' follows.
                if (++counts[2] > cutoff) {
                    break readLoop;
                }
                isPrevR = true;
                break;
            case '\n':
                if (isPrevR) {
                    // '\r' immediately before this '\n': it was really a "\r\n"
                    // pair, so undo the bare-'\r' count and count "\r\n" instead.
                    counts[2]--;
                    if (++counts[0] > cutoff) {
                        break readLoop;
                    }
                } else {
                    if (++counts[1] > cutoff) {
                        break readLoop;
                    }
                }
                isPrevR = false;
                break;
            default:
                isPrevR = false;
            }
        }
    } catch (IOException ex) {
        throw new DataLoadingException("Could not detect line separator", ex);
    }

    // Collect the index (or indexes, on a tie) of the highest count.
    List<Integer> idxMax = new ArrayList<>();
    for (int i = 0; i < counts.length; i++) {
        if (idxMax.isEmpty()) {
            idxMax.add(i);
        } else if (counts[i] > counts[idxMax.get(0)]) {
            idxMax.clear();
            idxMax.add(i);
        } else if (counts[i] == counts[idxMax.get(0)]) {
            idxMax.add(i);
        }
    }
    // Tie-break preference order: "\r\n", then "\n", then "\r".
    String recordSeparator;
    if (idxMax.size() > 1) {
        if (idxMax.contains(0)) {
            recordSeparator = separators[0];
        } else if (idxMax.contains(1)) {
            recordSeparator = separators[1];
        } else {
            recordSeparator = separators[idxMax.get(0)];
        }
    } else {
        recordSeparator = separators[idxMax.get(0)];
    }

    // Pass 2: detect the delimiter from a sample of the first non-empty lines.
    char delimiter;
    try (BufferedReader br = new BufferedReader(new FileReader(file.toFile()))) {
        List<String> lines = new ArrayList<>();
        String line;
        int numTestLines = 10; // sample size for delimiter guessing
        while ((line = br.readLine()) != null) {
            if (!line.isEmpty()) {
                lines.add(line);
                if (lines.size() >= numTestLines)
                    break;
            }
        }
        delimiter = guessDelimiter(lines);
    } catch (IOException ex) {
        throw new DataLoadingException("Could not detect delimiter character", ex);
    }

    // Pass 3: parse the whole file with the detected format and resolve the
    // required (and optional) header columns.
    try (BufferedReader br = new BufferedReader(new FileReader(file.toFile()))) {
        CSVFormat fmt = CSVFormat.newFormat(delimiter);
        fmt = fmt.withHeader().withIgnoreEmptyLines(true).withTrim(true).withIgnoreHeaderCase(true)
                .withQuoteMode(QuoteMode.NON_NUMERIC).withRecordSeparator(recordSeparator).withQuote('"');
        CSVParser parser = fmt.parse(br);
        records = parser.getRecords();
        header = parser.getHeaderMap();

        // Required columns: mz low/high and rt low/high bounds.
        String[] colNames = { HEAD_MZLO, HEAD_MZHI, HEAD_RTLO, HEAD_RTHI };
        for (int i = 0; i < colNames.length; i++) {
            Integer index = header.get(colNames[i]);
            if (index == null)
                throw new DataLoadingException(String.format("Missing header column [%s]", colNames[i]));
            indexesMzRtColorOpacity[i] = index;
        }
        // Optional columns: color and opacity.
        Integer indexColor = header.get(HEAD_COLOR);
        if (indexColor != null && indexColor >= 0)
            indexesMzRtColorOpacity[4] = indexColor;
        Integer indexOpacity = header.get(HEAD_OPACITY);
        if (indexOpacity != null && indexOpacity >= 0)
            indexesMzRtColorOpacity[5] = indexOpacity;
    } catch (IOException ex) {
        throw new DataLoadingException(ex);
    }
}