Example usage for org.apache.commons.csv CSVParser iterator

List of usage examples for org.apache.commons.csv CSVParser iterator

Introduction

On this page you can find example usage for org.apache.commons.csv CSVParser iterator.

Prototype

@Override
public Iterator<CSVRecord> iterator() 

Source Link

Document

Returns an iterator on the records.
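
Before the collected examples, here is a minimal, self-contained sketch of the typical call pattern. The file name and the format choice are placeholders, not taken from any of the sources below:

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.Iterator;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class IteratorExample {
    public static void main(String[] args) {
        // "example.csv" is a placeholder path; the format is illustrative
        try (Reader reader = new FileReader("example.csv");
                CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withFirstRecordAsHeader())) {
            Iterator<CSVRecord> it = parser.iterator();
            while (it.hasNext()) {
                CSVRecord record = it.next();
                System.out.println(record.get(0)); // print the first column of each record
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}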

Usage

From source file:ai.grakn.migration.csv.CSVMigrator.java

/**
 * Each String in the stream is a CSV file
 * @return stream of parsed insert queries
 */
public Stream<Map<String, Object>> convert() {
    try {
        CSVParser csvParser = CSVFormat.newFormat(separator).withIgnoreEmptyLines().withEscape('\\')
                .withFirstRecordAsHeader().withQuote(quote).withNullString(nullString).parse(reader);

        return stream(csvParser.iterator()).map(this::parse);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
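
The stream(...) call in this example is a helper from the surrounding codebase, not part of Commons CSV. Assuming java.util.Spliterators and java.util.stream.StreamSupport, a standard-library equivalent might look like this sketch (the helper name is ours):

// Sketch: adapt an Iterator to a sequential Stream with the standard library.
static <T> Stream<T> stream(Iterator<T> iterator) {
    return StreamSupport.stream(
            Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false);
}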

From source file:com.xceptance.xlt.common.tests.AbstractURLTestCase.java

/**
 * Loads the data. A state variable is used to indicate that the data has already been loaded.
 *
 * @throws IOException
 */
@Before
public void loadData() throws IOException {
    login = getProperty("login", getProperty("com.xceptance.xlt.auth.userName"));
    password = getProperty("password", getProperty("com.xceptance.xlt.auth.password"));

    // load the data. Ideally we would offload the file searching to
    // XltProperties.getDataFile(String name)
    // or XltProperties.getDataFile(String name, String locale)
    // or XltProperties.getDataFile(String name, Locale locale)
    final String dataDirectory = XltProperties.getInstance().getProperty(
            XltConstants.XLT_PACKAGE_PATH + ".data.directory", "config" + File.separatorChar + "data");
    final File file = new File(dataDirectory,
            getProperty("filename", Session.getCurrent().getUserName() + ".csv"));

    BufferedReader br = null;
    boolean incorrectLines = false;

    try {
        br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));

        // permit # as the comment marker, allow empty lines, set comma as separator, and activate the header
        final CSVFormat csvFormat = CSVFormat.RFC4180.toBuilder().withIgnoreEmptyLines(true)
                .withCommentStart('#').withHeader().withIgnoreSurroundingSpaces(true).build();
        final CSVParser parser = new CSVParser(br, csvFormat);
        final Iterator<CSVRecord> csvRecords = parser.iterator();

        // verify header fields to avoid problems with incorrect spelling or spaces
        final Map<String, Integer> headerMap = parser.getHeaderMap();

        for (final String headerField : headerMap.keySet()) {
            if (!CSVBasedURLAction.isPermittedHeaderField(headerField)) {
                Assert.fail(MessageFormat.format("Unsupported or misspelled header field: {0}", headerField));
            }
        }

        // go over all lines; this is a little odd because we have to catch the exception the iterator may throw
        while (true) {
            try {
                final boolean hasNext = csvRecords.hasNext();
                if (!hasNext) {
                    break;
                }
            } catch (final Exception e) {
                // the plus 1 compensates for the line increment that is skipped because of the exception
                throw new RuntimeException(
                        MessageFormat.format("Line at {0} is invalid, because of <{1}>. Line is ignored.",
                                parser.getLineNumber() + 1, e.getMessage()));
            }

            final CSVRecord csvRecord = csvRecords.next();

            // only take ok lines
            if (csvRecord.isConsistent()) {
                // guard against data exceptions
                try {
                    // do we have a URL?
                    if (csvRecord.get(CSVBasedURLAction.URL) != null) {
                        // take it
                        csvBasedActions.add(new CSVBasedURLAction(csvRecord, interpreter));
                    } else {
                        XltLogger.runTimeLogger.error(MessageFormat.format(
                                "Line at {0} does not contain any URL. Line is ignored: {1}",
                                parser.getLineNumber(), csvRecord));
                    }
                } catch (final Exception e) {
                    throw new RuntimeException(MessageFormat.format(
                            "Line at {0} is invalid, because of <{2}>. Line is ignored: {1}",
                            parser.getLineNumber(), csvRecord, e.getMessage()));
                }
            } else {
                XltLogger.runTimeLogger.error(MessageFormat.format(
                        "Line at {0} has not been correctly formatted. Line is ignored: {1}",
                        parser.getLineNumber(), csvRecord));
                incorrectLines = true;
            }
        }
    } finally {
        IOUtils.closeQuietly(br);
    }

    // stop if anything is incorrect, to avoid half-running test cases
    if (incorrectLines) {
        throw new RuntimeException("Found incorrectly formatted lines. Stopping here.");
    }
}
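
A note on the guarded loop above: CSVParser reads records lazily, so a malformed line does not fail when the parser is constructed; it surfaces later as a runtime exception thrown from the iterator's hasNext() or next(). Wrapping hasNext() in its own try/catch lets the test report the offending line number instead of failing with a bare exception.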

From source file:com.danidemi.templategeneratormavenplugin.generation.impl.CsvRowSource.java

@Override
public Iterator<IRowModel> iterator() {

    try {
        // get the reader from the resource
        CSVParser parser = CSVFormat.RFC4180.withFirstRecordAsHeader().parse(reader);

        // get the headers
        List<String> headersAsList = new ArrayList<>(parser.getHeaderMap().keySet());

        return new TransformIteratorAdapter<CSVRecord, IRowModel>(parser.iterator(),
                r -> new CsvRowModel(r, headersAsList));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

}

From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java

public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    //if delimSplit is true, size of each split is determined by 
    //Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        //if #splits > 1 million, there is enough parallelism
        //therefore no point to split
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:" + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }
    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();
    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // parse the split to get the header
            FSDataInputStream fileIn = fs.open(path);

            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimitor: " + delimiter + ". Expects single character.");
            }
            String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);
            CSVParser parser = new CSVParser(instream,
                    CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true));
            Iterator<CSVRecord> it = parser.iterator();

            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }

                EncodingUtil.handleBOMUTF8(header, 0);

                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }

        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }

    return populatedSplits;
}
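
A note on the pattern above: only the split whose start offset is 0 contains the file's header row, so the header is read once via parser.iterator() on that split, and the resulting hlist is carried over into the DelimitedSplits built for the rest of the file.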

From source file:data.io.csv.CSVDataReader.java

/**
 * {@inheritDoc}
 * Note: multiple iterators on the same instance are not supported
 */
@Override
public Iterator<MVDataEntry> iterator() {
    // When a new iterator is requested, everything should be reset
    CSVParser parser;
    try {
        dataSourceStream.reset();
        parser = new CSVParser(dataSourceStream, format);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    csvIt = parser.iterator();
    nextCSVRecord = null;
    nextEntry = null;
    return this;
}

From source file:com.publictransitanalytics.scoregenerator.datalayer.directories.GTFSReadingStopTimesDirectory.java

private void parseStopTimesFile(final ImmutableSetMultimap<String, FrequencyRecord> frequencyRecordMap,
        final Reader stopTimesReader) throws FileNotFoundException, IOException, InterruptedException {

    final CSVParser parser = new CSVParser(stopTimesReader, CSVFormat.DEFAULT.withHeader());

    final SortedSetMultimap<String, RawTripStop> rawTripMap = TreeMultimap.create(Comparator.naturalOrder(),
            (stop1, stop2) -> Integer.compare(stop1.getSequence(), stop2.getSequence()));

    final Iterator<CSVRecord> stopTimesIter = parser.iterator();
    while (stopTimesIter.hasNext()) {
        final CSVRecord record = stopTimesIter.next();
        final String rawTripId = record.get("trip_id");
        final int stopSequence = Integer.valueOf(record.get("stop_sequence"));
        final String stopId = record.get("stop_id");
        final String arrivalTimeString = record.get("arrival_time");
        final TransitTime arrivalTime = (arrivalTimeString == null) ? null
                : TransitTime.parse(arrivalTimeString);
        final String departureTimeString = record.get("departure_time");
        final TransitTime departureTime = (departureTimeString == null) ? null
                : TransitTime.parse(departureTimeString);

        if (frequencyRecordMap.containsKey(rawTripId)) {
            final RawTripStop rawTripStop = new RawTripStop(arrivalTime, departureTime, stopId, rawTripId,
                    stopSequence);
            rawTripMap.put(rawTripId, rawTripStop);
        } else {
            final TripId tripId = new TripId(rawTripId);
            final TripStop tripStop = new TripStop(arrivalTime, stopId, tripId, stopSequence);
            try {
                final TripIdKey tripIdKey = new TripIdKey(rawTripId);
                tripsStore.put(tripIdKey, tripId);
                tripSequenceStore.put(new TripSequenceKey(tripIdKey, arrivalTime, stopSequence), tripStop);
                stopTimesStore.put(StopTimeKey.getWriteKey(stopId, arrivalTime), tripStop);
            } catch (final BitvantageStoreException e) {
                throw new ScoreGeneratorFatalException(e);
            }
        }
    }
    for (final String rawTripId : rawTripMap.keySet()) {
        final ImmutableSet<FrequencyRecord> frequencyRecords = frequencyRecordMap.get(rawTripId);
        for (final FrequencyRecord frequencyRecord : frequencyRecords) {

            TransitTime recurringTime = frequencyRecord.getStartTime();
            while (recurringTime.isBefore(frequencyRecord.getEndTime())) {
                final TransitTime baseArrivalTime = rawTripMap.get(rawTripId).first().getArrivalTime();
                final TripId tripId = new TripId(rawTripId, recurringTime.toString());

                for (final RawTripStop rawTripStop : rawTripMap.get(rawTripId)) {
                    final TransitTime arrivalTime = recurringTime
                            .plus(TransitTime.durationBetween(baseArrivalTime, rawTripStop.getArrivalTime()));
                    final int stopSequence = rawTripStop.getSequence();
                    final String stopId = rawTripStop.getStopId();

                    final TripStop tripStop = new TripStop(arrivalTime, stopId, tripId, stopSequence);

                    final TripIdKey tripIdKey = new TripIdKey(tripId.getRawTripId(), tripId.getQualifier());

                    try {
                        tripsStore.put(tripIdKey, tripId);
                        tripSequenceStore.put(new TripSequenceKey(tripIdKey, arrivalTime, stopSequence),
                                tripStop);
                        stopTimesStore.put(StopTimeKey.getWriteKey(stopId, arrivalTime), tripStop);
                    } catch (final BitvantageStoreException e) {
                        throw new ScoreGeneratorFatalException(e);
                    }
                }
                recurringTime = recurringTime.plus(frequencyRecord.getInterval());
            }
        }
    }
}

From source file:com.itemanalysis.jmetrik.data.JmetrikFileImporterTest.java

@Test
public void readJmetrikFileTest() {
    System.out.println("JmetrikFileImporterTest: Reading *.jmetrik file");
    CSVParser parser = null;
    Reader reader = null;

    try {
        File dataFile = FileUtils.toFile(this.getClass().getResource("/data/example-import-file.jmetrik"));
        reader = new InputStreamReader(new BOMInputStream(new FileInputStream(dataFile)), "UTF-8");

        parser = new CSVParser(reader, CSVFormat.DEFAULT.withCommentMarker('#'));
        Iterator<CSVRecord> iter = parser.iterator();
        CSVRecord temp = null;

        boolean readAttributes = false;
        boolean readData = false;
        int attCount = 0;

        while (iter.hasNext()) {
            temp = iter.next();

            if ("VERSION".equals(temp.getComment())) {
                System.out.println("VERSION: " + temp.get(0));

            } else if ("METADATA".equals(temp.getComment())) {
                System.out.println("CASES: " + temp.get(0));
            } else if ("ATTRIBUTES".equals(temp.getComment())) {
                readAttributes = true;
            } else if ("DATA".equals(temp.getComment())) {
                readAttributes = false;
                readData = true;
            }

            if (readAttributes) {
                System.out.print("ATTRIBUTE-" + attCount + ": ");
                Iterator<String> innerIter = temp.iterator();
                while (innerIter.hasNext()) {
                    System.out.print(innerIter.next());
                    if (innerIter.hasNext()) {
                        System.out.print(",");
                    }
                }
                System.out.println();
                attCount++;
            }

            if (readData) {
                Iterator<String> innerIter = temp.iterator();
                while (innerIter.hasNext()) {
                    System.out.print(innerIter.next());
                    if (innerIter.hasNext()) {
                        System.out.print(",");
                    }
                }
                System.out.println();
            }

        }

    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        try {
            if (parser != null)
                parser.close();
            if (reader != null)
                reader.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

}

From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.QuestionSetManager.java

/**
 * This function is responsible for parsing a duplicate Stack Exchange thread TSV file produced by
 * {@link StackExchangeThreadSerializer}, and partitioning each such thread into the training set,
 * test set, or validation set. In addition, the corresponding row of the TSV file will be written
 * out to a training-, test-, or validation-set-specific TSV file in the same directory as the
 * input TSV file.
 * 
 * @param dupQuestionFile - A TSV file containing duplicate {@link StackExchangeThread} records
 * @param trainTestValidateCumulativeProbs - A CDF of the desired proportion of training, test,
 *        and validation set records
 * @throws PipelineException
 */
private void parseTsvAndPartitionRecords(File dupQuestionFile, double[] trainTestValidateCumulativeProbs)
        throws PipelineException {
    // Open the TSV file for parsing, and CSVPrinters for outputting train, test, and validation set TSV files
    String baseName = FilenameUtils.removeExtension(dupQuestionFile.getAbsolutePath());
    String extension = FilenameUtils.getExtension(dupQuestionFile.getAbsolutePath());
    try (FileReader reader = new FileReader(dupQuestionFile);
            CSVPrinter trainSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_TRAIN_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()));
            CSVPrinter testSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_TEST_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()));
            CSVPrinter validationSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_VALIDATE_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()))) {

        // Parse the duplicate thread TSV file
        CSVParser parser = CSVFormat.TDF.withHeader().parse(reader);

        // Iterate over each CSV record, and place it into the desired partition (train, test, or validation)
        Iterator<CSVRecord> recordIterator = parser.iterator();
        while (recordIterator.hasNext()) {
            CSVRecord record = recordIterator.next();

            // Get the StackExchangeThread associated with this record, and
            // create a question from it
            StackExchangeThread duplicateThread = StackExchangeThreadSerializer.deserializeThreadFromBinFile(
                    record.get(CorpusBuilder.TSV_COL_HEADER_SERIALIZED_FILE_PATH));
            StackExchangeQuestion duplicateQuestion = new StackExchangeQuestion(duplicateThread);
            String parentId = record.get(CorpusBuilder.TSV_COL_HEADER_PARENT_ID);

            // Now drop this question into a partition, and write it to a
            // corresponding TSV file
            double p = rng.nextDouble(); // Random number determines the partition for this record
            if (p <= trainTestValidateCumulativeProbs[0]) {
                // This record goes in the training set
                if (!addQuestionToSet(duplicateQuestion, parentId, this.trainingSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.TRAINING_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                trainSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            } else if (p <= trainTestValidateCumulativeProbs[1]) {
                // This record goes in the test set
                if (!addQuestionToSet(duplicateQuestion, parentId, this.testSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.TEST_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                testSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            } else {
                // This record goes in the validation set
                assert (p <= trainTestValidateCumulativeProbs[2]);
                if (!addQuestionToSet(duplicateQuestion, parentId, this.validationSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.VALIDATION_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                validationSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            }
        }

        // Flush all the printers prior to closing
        trainSetPrinter.flush();
        testSetPrinter.flush();
        validationSetPrinter.flush();
    } catch (IOException | IngestionException e) {
        throw new PipelineException(e);
    }
}

From source file:edu.harvard.mcz.imagecapture.loader.JobVerbatimFieldLoad.java

/**
 * Attempt to read a file with a given CSV format, and if successful, return
 * the number of rows in the file.
 * 
 * @param file to check for csv rows.
 * @param formatToTry the CSV format to try to read the file with.
 * @return number of rows in the file.
 * @throws IOException on a problem reading the header.
 * @throws FileNotFoundException on not finding the file.
 */
protected int readRows(File file, CSVFormat formatToTry) throws IOException, FileNotFoundException {
    int rows = 0;
    Reader reader = new FileReader(file);

    CSVParser csvParser = new CSVParser(reader, formatToTry);
    Iterator<CSVRecord> iterator = csvParser.iterator();
    while (iterator.hasNext()) {
        iterator.next();
        rows++;
    }
    csvParser.close();
    reader.close();
    return rows;
}
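
Since both CSVParser and the Reader implement Closeable, a try-with-resources variant of the same method would guarantee cleanup even when a malformed row makes the iterator throw; a sketch under that assumption:

protected int readRows(File file, CSVFormat formatToTry) throws IOException {
    int rows = 0;
    try (Reader reader = new FileReader(file);
            CSVParser csvParser = new CSVParser(reader, formatToTry)) {
        Iterator<CSVRecord> iterator = csvParser.iterator();
        while (iterator.hasNext()) {
            iterator.next();
            rows++;
        }
    }
    return rows;
}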

From source file:com.itemanalysis.jmetrik.file.JmetrikFileImporter.java

/**
 * Creates a header map for the CSV file, but imposes naming conventions on the column names.
 *
 */
private void setVariableAttributes() {
    VariableAttributes variableAttributes = null;
    int position = 0;

    Reader reader = null;
    CSVParser parser = null;
    VariableName tempName = null;

    try {
        reader = new InputStreamReader(new BOMInputStream(new FileInputStream(dataFile)), "UTF-8");
        parser = new CSVParser(reader, dataFileFormat.withHeader());

        if (hasHeader) {
            Map<String, Integer> csvMap = parser.getHeaderMap();
            for (String s : csvMap.keySet()) {
                variableAttributes = new VariableAttributes(new VariableName(s), new VariableLabel(""),
                        DataType.INTEGER, position);
                variableAttributeMap.put(variableAttributes.getName(), variableAttributes);
                position++;
            }
        } else {
            Iterator<CSVRecord> iter = parser.iterator();
            CSVRecord csvRecord = iter.next();

            for (int i = 0; i < csvRecord.size(); i++) {
                variableAttributes = new VariableAttributes(new VariableName("v" + (i + 1)),
                        new VariableLabel(""), DataType.INTEGER, position);
                variableAttributeMap.put(variableAttributes.getName(), variableAttributes);
                position++;
            }
        }

    } catch (IOException ex) {
        theException = ex;
    } finally {
        try {
            if (parser != null)
                parser.close();
            if (reader != null)
                reader.close();
        } catch (IOException ex) {
            theException = ex;
        }
    }
}