List of usage examples for org.apache.commons.csv CSVParser iterator
@Override
public Iterator<CSVRecord> iterator()
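Before the collected examples, a minimal self-contained sketch of the basic pattern (the file name "data.csv" is a placeholder): CSVFormat.parse(Reader) returns a CSVParser, and parser.iterator() yields one CSVRecord per data row.

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class CsvIteratorBasics {
    public static void main(String[] args) throws IOException {
        // "data.csv" is a placeholder; withFirstRecordAsHeader() treats row 1 as column names
        try (Reader reader = new FileReader("data.csv");
                CSVParser parser = CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(reader)) {
            Iterator<CSVRecord> it = parser.iterator();
            while (it.hasNext()) {
                CSVRecord record = it.next();
                // access by index always works; access by name requires a header
                System.out.println(record.get(0));
            }
        }
    }
}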
From source file:ai.grakn.migration.csv.CSVMigrator.java
/**
 * Each String in the stream is a CSV file
 * @return stream of parsed insert queries
 */
public Stream<Map<String, Object>> convert() {
    try {
        CSVParser csvParser = CSVFormat.newFormat(separator)
                .withIgnoreEmptyLines()
                .withEscape('\\')
                .withFirstRecordAsHeader()
                .withQuote(quote)
                .withNullString(nullString)
                .parse(reader);
        return stream(csvParser.iterator()).map(this::parse);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
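The stream(...) call above is a Grakn utility; with only the JDK, an Iterator<CSVRecord> can be adapted to a Stream via Spliterators. A minimal sketch of that standard-library equivalent:

import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

final class CsvStreams {
    // Wraps a parser's iterator in a sequential Stream; the parser must stay
    // open while the stream is being consumed.
    static Stream<CSVRecord> records(CSVParser csvParser) {
        return StreamSupport.stream(
                Spliterators.spliteratorUnknownSize(csvParser.iterator(), Spliterator.ORDERED), false);
    }
}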
From source file:com.xceptance.xlt.common.tests.AbstractURLTestCase.java
/**
 * Loading of the data. There is a state variable used to indicate that we already did that.
 *
 * @throws IOException
 */
@Before
public void loadData() throws IOException {
    login = getProperty("login", getProperty("com.xceptance.xlt.auth.userName"));
    password = getProperty("password", getProperty("com.xceptance.xlt.auth.password"));

    // load the data. Ideally we would offload the file searching to
    // XltProperties.getDataFile(String name)
    // or XltProperties.getDataFile(String name, String locale)
    // or XltProperties.getDataFile(String name, Locale locale)
    final String dataDirectory = XltProperties.getInstance().getProperty(
            XltConstants.XLT_PACKAGE_PATH + ".data.directory", "config" + File.separatorChar + "data");
    final File file = new File(dataDirectory,
            getProperty("filename", Session.getCurrent().getUserName() + ".csv"));

    BufferedReader br = null;
    boolean incorrectLines = false;

    try {
        br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));

        // permit # as comment, empty lines, set comma as separator, and activate the header
        final CSVFormat csvFormat = CSVFormat.RFC4180.toBuilder().withIgnoreEmptyLines(true)
                .withCommentStart('#').withHeader().withIgnoreSurroundingSpaces(true).build();
        final CSVParser parser = new CSVParser(br, csvFormat);
        final Iterator<CSVRecord> csvRecords = parser.iterator();

        // verify header fields to avoid problems with incorrect spelling or spaces
        final Map<String, Integer> headerMap = parser.getHeaderMap();
        for (final String headerField : headerMap.keySet()) {
            if (!CSVBasedURLAction.isPermittedHeaderField(headerField)) {
                Assert.fail(MessageFormat.format("Unsupported or misspelled header field: {0}", headerField));
            }
        }

        // go over all lines; this is a little odd, because we have to catch the iterator exception
        while (true) {
            try {
                final boolean hasNext = csvRecords.hasNext();
                if (!hasNext) {
                    break;
                }
            } catch (final Exception e) {
                // the plus 1 is meant to correct the increment missing because of the exception
                throw new RuntimeException(
                        MessageFormat.format("Line at {0} is invalid, because of <{1}>. Line is ignored.",
                                parser.getLineNumber() + 1, e.getMessage()));
            }

            final CSVRecord csvRecord = csvRecords.next();

            // only take ok lines
            if (csvRecord.isConsistent()) {
                // guard against data exceptions
                try {
                    // do we have an url?
                    if (csvRecord.get(CSVBasedURLAction.URL) != null) {
                        // take it
                        csvBasedActions.add(new CSVBasedURLAction(csvRecord, interpreter));
                    } else {
                        XltLogger.runTimeLogger.error(MessageFormat.format(
                                "Line at {0} does not contain any URL. Line is ignored: {1}",
                                parser.getLineNumber(), csvRecord));
                    }
                } catch (final Exception e) {
                    throw new RuntimeException(MessageFormat.format(
                            "Line at {0} is invalid, because of <{2}>. Line is ignored: {1}",
                            parser.getLineNumber(), csvRecord, e.getMessage()));
                }
            } else {
                XltLogger.runTimeLogger.error(MessageFormat.format(
                        "Line at {0} has not been correctly formatted. Line is ignored: {1}",
                        parser.getLineNumber(), csvRecord));
                incorrectLines = true;
            }
        }
    } finally {
        IOUtils.closeQuietly(br);
    }

    // stop if we have anything that is incorrect, to avoid half-running test cases
    if (incorrectLines) {
        throw new RuntimeException("Found incorrectly formatted lines. Stopping here.");
    }
}
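The odd-looking while(true) loop above exists because Commons CSV parses lazily: a malformed line surfaces as a runtime exception thrown from hasNext() or next(), not as a checked IOException at parse time. A stripped-down sketch of that defensive pattern (the abort policy mirrors the test above; collecting into a list is illustrative):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

final class DefensiveCsvIteration {
    // Collects all records, translating lazy parse failures into a message
    // that names the offending line.
    static List<CSVRecord> readAll(CSVParser parser) {
        List<CSVRecord> result = new ArrayList<>();
        Iterator<CSVRecord> records = parser.iterator();
        while (true) {
            boolean hasNext;
            try {
                hasNext = records.hasNext(); // parsing is lazy: a bad line throws here
            } catch (RuntimeException e) {
                throw new RuntimeException(
                        "Invalid line at " + (parser.getLineNumber() + 1) + ": " + e.getMessage(), e);
            }
            if (!hasNext) {
                break;
            }
            result.add(records.next());
        }
        return result;
    }
}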
From source file:com.danidemi.templategeneratormavenplugin.generation.impl.CsvRowSource.java
@Override
public Iterator<IRowModel> iterator() {
    try {
        // get the reader from the resource
        CSVParser parser = CSVFormat.RFC4180.withFirstRecordAsHeader().parse(reader);

        // get the headers
        List<String> headersAsList = new ArrayList<>(parser.getHeaderMap().keySet());

        return new TransformIteratorAdapter<CSVRecord, IRowModel>(parser.iterator(),
                r -> new CsvRowModel(r, headersAsList));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
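TransformIteratorAdapter is a project-specific helper; a generic equivalent only needs to delegate hasNext() and apply a mapping function in next(). A minimal sketch, assuming nothing beyond the JDK:

import java.util.Iterator;
import java.util.function.Function;

// Lazily applies a mapping function to each element of a wrapped iterator.
final class TransformIterator<S, T> implements Iterator<T> {
    private final Iterator<S> source;
    private final Function<S, T> mapper;

    TransformIterator(Iterator<S> source, Function<S, T> mapper) {
        this.source = source;
        this.mapper = mapper;
    }

    @Override
    public boolean hasNext() {
        return source.hasNext();
    }

    @Override
    public T next() {
        return mapper.apply(source.next());
    }
}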
From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    // if delimSplit is true, size of each split is determined by
    // Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // if #splits > 1 million, there is enough parallelism,
        // therefore no point to split
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:" + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }

    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();

    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // parse the inSplit, get the header
            FSDataInputStream fileIn = fs.open(path);
            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimiter: " + delimStr + ". Expects single character.");
            }
            String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);
            CSVParser parser = new CSVParser(instream,
                    CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true));
            Iterator<CSVRecord> it = parser.iterator();
            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = (CSVRecord) it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = (String) recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }
                EncodingUtil.handleBOMUTF8(header, 0);
                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }

        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }

    return populatedSplits;
}
From source file:data.io.csv.CSVDataReader.java
/**
 * {@inheritDoc}
 * Note: multiple iterators on the same instance are not supported.
 */
@Override
public Iterator<MVDataEntry> iterator() {
    // When a new iterator is requested, everything should be reset
    CSVParser parser;
    try {
        dataSourceStream.reset();
        parser = new CSVParser(dataSourceStream, format);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    csvIt = parser.iterator();
    nextCSVRecord = null;
    nextEntry = null;
    return this;
}
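Returning this makes the reader its own, single iterator: every call to iterator() resets the underlying stream and rebuilds the parser, so two live iterators would share and corrupt one another's position. That is the design reason behind the Javadoc's warning that multiple iterators on the same instance are not supported.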
From source file:com.publictransitanalytics.scoregenerator.datalayer.directories.GTFSReadingStopTimesDirectory.java
private void parseStopTimesFile(final ImmutableSetMultimap<String, FrequencyRecord> frequencyRecordMap,
        final Reader stopTimesReader) throws FileNotFoundException, IOException, InterruptedException {

    final CSVParser parser = new CSVParser(stopTimesReader, CSVFormat.DEFAULT.withHeader());
    final SortedSetMultimap<String, RawTripStop> rawTripMap = TreeMultimap.create(Comparator.naturalOrder(),
            (stop1, stop2) -> Integer.compare(stop1.getSequence(), stop2.getSequence()));

    final Iterator<CSVRecord> stopTimesIter = parser.iterator();
    while (stopTimesIter.hasNext()) {
        final CSVRecord record = stopTimesIter.next();
        final String rawTripId = record.get("trip_id");
        final int stopSequence = Integer.valueOf(record.get("stop_sequence"));
        final String stopId = record.get("stop_id");
        final String arrivalTimeString = record.get("arrival_time");
        final TransitTime arrivalTime = (arrivalTimeString == null) ? null
                : TransitTime.parse(arrivalTimeString);
        final String departureTimeString = record.get("departure_time");
        final TransitTime departureTime = (departureTimeString == null) ? null
                : TransitTime.parse(departureTimeString);

        if (frequencyRecordMap.containsKey(rawTripId)) {
            final RawTripStop rawTripStop = new RawTripStop(arrivalTime, departureTime, stopId, rawTripId,
                    stopSequence);
            rawTripMap.put(rawTripId, rawTripStop);
        } else {
            final TripId tripId = new TripId(rawTripId);
            final TripStop tripStop = new TripStop(arrivalTime, stopId, tripId, stopSequence);
            try {
                final TripIdKey tripIdKey = new TripIdKey(rawTripId);
                tripsStore.put(tripIdKey, tripId);
                tripSequenceStore.put(new TripSequenceKey(tripIdKey, arrivalTime, stopSequence), tripStop);
                stopTimesStore.put(StopTimeKey.getWriteKey(stopId, arrivalTime), tripStop);
            } catch (final BitvantageStoreException e) {
                throw new ScoreGeneratorFatalException(e);
            }
        }
    }

    for (final String rawTripId : rawTripMap.keySet()) {
        final ImmutableSet<FrequencyRecord> frequencyRecords = frequencyRecordMap.get(rawTripId);
        for (final FrequencyRecord frequencyRecord : frequencyRecords) {
            TransitTime recurringTime = frequencyRecord.getStartTime();
            while (recurringTime.isBefore(frequencyRecord.getEndTime())) {
                final TransitTime baseArrivalTime = rawTripMap.get(rawTripId).first().getArrivalTime();
                final TripId tripId = new TripId(rawTripId, recurringTime.toString());

                for (final RawTripStop rawTripStop : rawTripMap.get(rawTripId)) {
                    final TransitTime arrivalTime = recurringTime.plus(
                            TransitTime.durationBetween(baseArrivalTime, rawTripStop.getArrivalTime()));
                    final int stopSequence = rawTripStop.getSequence();
                    final String stopId = rawTripStop.getStopId();
                    final TripStop tripStop = new TripStop(arrivalTime, stopId, tripId, stopSequence);
                    final TripIdKey tripIdKey = new TripIdKey(tripId.getRawTripId(), tripId.getQualifier());
                    try {
                        tripsStore.put(tripIdKey, tripId);
                        tripSequenceStore.put(new TripSequenceKey(tripIdKey, arrivalTime, stopSequence),
                                tripStop);
                        stopTimesStore.put(StopTimeKey.getWriteKey(stopId, arrivalTime), tripStop);
                    } catch (final BitvantageStoreException e) {
                        throw new ScoreGeneratorFatalException(e);
                    }
                }
                recurringTime = recurringTime.plus(frequencyRecord.getInterval());
            }
        }
    }
}
From source file:com.itemanalysis.jmetrik.data.JmetrikFileImporterTest.java
@Test
public void readJmetrikFileTest() {
    System.out.println("JmetrikFileImporterTest: Reading *.jmetrik file");
    CSVParser parser = null;
    Reader reader = null;

    try {
        File dataFile = FileUtils.toFile(this.getClass().getResource("/data/example-import-file.jmetrik"));
        reader = new InputStreamReader(new BOMInputStream(new FileInputStream(dataFile)), "UTF-8");
        parser = new CSVParser(reader, CSVFormat.DEFAULT.withCommentMarker('#'));

        Iterator<CSVRecord> iter = parser.iterator();
        CSVRecord temp = null;
        boolean readAttributes = false;
        boolean readData = false;
        int attCount = 0;

        while (iter.hasNext()) {
            temp = iter.next();
            if ("VERSION".equals(temp.getComment())) {
                System.out.println("VERSION: " + temp.get(0));
            } else if ("METADATA".equals(temp.getComment())) {
                System.out.println("CASES: " + temp.get(0));
            } else if ("ATTRIBUTES".equals(temp.getComment())) {
                readAttributes = true;
            } else if ("DATA".equals(temp.getComment())) {
                readAttributes = false;
                readData = true;
            }

            if (readAttributes) {
                System.out.print("ATTRIBUTE-" + attCount + ": ");
                Iterator<String> innerIter = temp.iterator();
                while (innerIter.hasNext()) {
                    System.out.print(innerIter.next());
                    if (innerIter.hasNext()) {
                        System.out.print(",");
                    }
                }
                System.out.println();
                attCount++;
            }

            if (readData) {
                Iterator<String> innerIter = temp.iterator();
                while (innerIter.hasNext()) {
                    System.out.print(innerIter.next());
                    if (innerIter.hasNext()) {
                        System.out.print(",");
                    }
                }
                System.out.println();
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        try {
            if (parser != null) parser.close();
            if (reader != null) reader.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}
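Worth noting for this pattern: with withCommentMarker('#') set, Commons CSV does not emit comment lines as records of their own; the comment text is exposed through CSVRecord.getComment() on the record that follows it, which is what lets the loop above detect the VERSION, METADATA, ATTRIBUTES, and DATA section markers.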
From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.QuestionSetManager.java
/**
 * This function is responsible for parsing a duplicate Stack Exchange thread TSV file produced by
 * {@link StackExchangeThreadSerializer}, and partitioning each such thread into the training set,
 * test set, or validation set. In addition, the corresponding row of the TSV file will be written
 * out to a training-, test-, or validation-set-specific TSV file in the same directory as the
 * input TSV file.
 *
 * @param dupQuestionFile - A TSV file containing duplicate {@link StackExchangeThread} records
 * @param trainTestValidateCumulativeProbs - A CDF of the desired proportion of training, test,
 *        and validation set records
 * @throws PipelineException
 */
private void parseTsvAndPartitionRecords(File dupQuestionFile, double[] trainTestValidateCumulativeProbs)
        throws PipelineException {
    // Open the TSV file for parsing, and CSVPrinters for outputting train,
    // test, and validation set TSV files
    String baseName = FilenameUtils.removeExtension(dupQuestionFile.getAbsolutePath());
    String extension = FilenameUtils.getExtension(dupQuestionFile.getAbsolutePath());

    try (FileReader reader = new FileReader(dupQuestionFile);
            CSVPrinter trainSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_TRAIN_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()));
            CSVPrinter testSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_TEST_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()));
            CSVPrinter validationSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_VALIDATE_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()))) {

        // Parse the duplicate thread TSV file
        CSVParser parser = CSVFormat.TDF.withHeader().parse(reader);

        // Iterate over each CSV record, and place into a desired partition
        // (train, test, or validation)
        Iterator<CSVRecord> recordIterator = parser.iterator();
        while (recordIterator.hasNext()) {
            CSVRecord record = recordIterator.next();

            // Get the StackExchangeThread associated with this record, and
            // create a question from it
            StackExchangeThread duplicateThread = StackExchangeThreadSerializer.deserializeThreadFromBinFile(
                    record.get(CorpusBuilder.TSV_COL_HEADER_SERIALIZED_FILE_PATH));
            StackExchangeQuestion duplicateQuestion = new StackExchangeQuestion(duplicateThread);
            String parentId = record.get(CorpusBuilder.TSV_COL_HEADER_PARENT_ID);

            // Now drop this question into a partition, and write it to a
            // corresponding TSV file
            double p = rng.nextDouble(); // Random number determines partition for this record
            if (p <= trainTestValidateCumulativeProbs[0]) {
                // This record goes in the training set
                if (!addQuestionToSet(duplicateQuestion, parentId, this.trainingSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.TRAINING_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                trainSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            } else if (p <= trainTestValidateCumulativeProbs[1]) {
                // This record goes in the test set
                if (!addQuestionToSet(duplicateQuestion, parentId, this.testSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.TEST_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                testSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            } else {
                // This record goes in the validation set
                assert (p <= trainTestValidateCumulativeProbs[2]);
                if (!addQuestionToSet(duplicateQuestion, parentId, this.validationSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.VALIDATION_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                validationSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            }
        }

        // Flush all the printers prior to closing
        trainSetPrinter.flush();
        testSetPrinter.flush();
        validationSetPrinter.flush();
    } catch (IOException | IngestionException e) {
        throw new PipelineException(e);
    }
}
From source file:edu.harvard.mcz.imagecapture.loader.JobVerbatimFieldLoad.java
/**
 * Attempt to read a file with a given CSV format and, if successful, return
 * the number of rows in the file.
 *
 * @param file to check for csv rows.
 * @param formatToTry the CSV format to try to read the file with.
 * @return number of rows in the file.
 * @throws IOException on a problem reading the header.
 * @throws FileNotFoundException on not finding the file.
 */
protected int readRows(File file, CSVFormat formatToTry) throws IOException, FileNotFoundException {
    int rows = 0;
    Reader reader = new FileReader(file);
    CSVParser csvParser = new CSVParser(reader, formatToTry);
    Iterator<CSVRecord> iterator = csvParser.iterator();
    while (iterator.hasNext()) {
        iterator.next();
        rows++;
    }
    csvParser.close();
    reader.close();
    return rows;
}
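A hypothetical caller might use readRows to probe which format a file actually has; this sketch assumes the enclosing class above and a File variable named file, and relies on the observation that a wrong format typically fails outright while a plausible one parses through:

// Hypothetical probe: try common formats and keep the first that parses.
CSVFormat[] candidates = { CSVFormat.DEFAULT, CSVFormat.TDF, CSVFormat.EXCEL };
for (CSVFormat candidate : candidates) {
    try {
        int rows = readRows(file, candidate);
        System.out.println(candidate + " parsed " + rows + " rows");
        break; // first successful format wins
    } catch (IOException e) {
        // this format failed; try the next one
    }
}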
From source file:com.itemanalysis.jmetrik.file.JmetrikFileImporter.java
/**
 * Creates a header map for the CSV file, imposing naming conventions on the column names.
 */
private void setVariableAttributes() {
    VariableAttributes variableAttributes = null;
    int position = 0;
    Reader reader = null;
    CSVParser parser = null;
    VariableName tempName = null;

    try {
        reader = new InputStreamReader(new BOMInputStream(new FileInputStream(dataFile)), "UTF-8");
        parser = new CSVParser(reader, dataFileFormat.withHeader());

        if (hasHeader) {
            Map<String, Integer> csvMap = parser.getHeaderMap();
            for (String s : csvMap.keySet()) {
                variableAttributes = new VariableAttributes(new VariableName(s), new VariableLabel(""),
                        DataType.INTEGER, position);
                variableAttributeMap.put(variableAttributes.getName(), variableAttributes);
                position++;
            }
        } else {
            Iterator<CSVRecord> iter = parser.iterator();
            CSVRecord csvRecord = iter.next();
            for (int i = 0; i < csvRecord.size(); i++) {
                variableAttributes = new VariableAttributes(new VariableName("v" + (i + 1)),
                        new VariableLabel(""), DataType.INTEGER, position);
                variableAttributeMap.put(variableAttributes.getName(), variableAttributes);
                position++;
            }
        }
    } catch (IOException ex) {
        theException = ex;
    } finally {
        try {
            if (parser != null) parser.close();
            if (reader != null) reader.close();
        } catch (IOException ex) {
            theException = ex;
        }
    }
}