data.io.csv.CSVDataReader.java Source code

Java tutorial

Introduction

Here is the source code for data.io.csv.CSVDataReader.java

Source

/*
 * SSSync, a Simple and Stupid Synchronizer for data with multi-valued attributes
 * Copyright (C) 2014  Ludovic Pouzenc <ludovic@pouzenc.fr>
 *  
 * This file is part of SSSync.
 *
 *  SSSync is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  SSSync is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with SSSync.  If not, see <http://www.gnu.org/licenses/>
 */
package data.io.csv;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import data.MVDataEntry;
import data.io.AbstractMVDataReader;

/**
 * Stream-oriented reader for one particular CSV source.
 * <p>
 * Each CSV line carries a {@code key}, an {@code attr} and a {@code values} column
 * (values are separated by {@code ;}). Consecutive lines sharing the same key are
 * merged into a single {@link MVDataEntry}, so the input must be sorted by key:
 * either the caller guarantees it ({@code alreadySorted == true}) or the whole
 * input is loaded and sorted in memory at construction time.
 * <p>
 * The instance is its own iterator: {@link #iterator()} must be called before
 * {@link #hasNext()} / {@link #next()}, and only one iteration can be in progress
 * at a time.
 *
 * @author lpouzenc
 */
public class CSVDataReader extends AbstractMVDataReader {

    /** Small unsorted sample (no header line, last line is empty on purpose). */
    public static final String CSV_DEMO =
            //"key,attr,values\n" +
            "line3,hello,all;the;others\n" + "line1,from,csv1;csv1bis\n" + "line2,hello,all;the;world\n"
                    + "line1,attr2,csv1\n" + ",,\n";

    /** Excel-flavoured CSV with the three fixed columns this reader looks up by name. */
    public static final CSVFormat DEFAULT_CSV_FORMAT = CSVFormat.EXCEL.withHeader("key", "attr", "values")
            .withIgnoreSurroundingSpaces(true);

    /** Separator used inside the "values" column. */
    private static final String VALUES_SEPARATOR = ";";

    private final CSVFormat format;
    private final Reader dataSourceStream;

    /** Entry already built by {@link #lookAhead()} and not yet handed out by {@link #next()}. */
    private transient MVDataEntry nextEntry;
    /** Record already pulled from the parser but belonging to the entry after the current one. */
    private transient CSVRecord nextCSVRecord;
    /** Iterator of the parser created by the last call to {@link #iterator()}. */
    private transient Iterator<CSVRecord> csvIt;
    /** True once a parser has been attached to {@link #dataSourceStream} at least once. */
    private transient boolean sourceConsumed;

    /**
     * Constructs a CSVDataReader object for parsing a CSV input given via dataSourceStream.
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If true the stream is used as is; if false the whole input is read,
     *        sorted in memory and the given stream is closed (memory cost is around 3 times the CSV file size !)
     * @param format Specify the exact format used to encode the CSV file (separators, escaping...)
     * @throws IOException if the input cannot be read while sorting it
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted, CSVFormat format)
            throws IOException {
        this.dataSourceName = dataSourceName;
        this.format = format;

        if (alreadySorted) {
            this.dataSourceStream = dataSourceStream;
        } else {
            // Reuse the caller's buffering when there is one, otherwise add our own
            BufferedReader bufReader = (dataSourceStream instanceof BufferedReader)
                    ? (BufferedReader) dataSourceStream
                    : new BufferedReader(dataSourceStream);
            this.dataSourceStream = readAndSortLines(bufReader);
        }
    }

    /**
     * Constructs a CSVDataReader object with default CSV format (for CSVParser).
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If false, memory cost is around 3 times the CSV file size !
     * @throws IOException if the input cannot be read while sorting it
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted) throws IOException {
        this(dataSourceName, dataSourceStream, alreadySorted, DEFAULT_CSV_FORMAT);
    }

    /**
     * {@inheritDoc}
     * Note : multiple iterators on the same instance are not supported.
     * <p>
     * Every call restarts the iteration from the beginning of the data source. The first
     * call works with any Reader; later calls need a Reader whose {@code reset()} succeeds
     * (the in-memory sorted copy always qualifies).
     *
     * @throws RuntimeException wrapping the IOException if the source cannot be rewound
     *         for a new pass or if the CSV parser cannot be created
     */
    @Override
    public Iterator<MVDataEntry> iterator() {
        CSVParser parser;
        try {
            rewindDataSource();
            // From here on the stream may have been read from, even if the parser creation fails
            sourceConsumed = true;
            parser = new CSVParser(dataSourceStream, format);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // When a new iterator is requested, all look-ahead state is dropped
        csvIt = parser.iterator();
        nextCSVRecord = null;
        nextEntry = null;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean hasNext() {
        if (nextEntry == null) {
            lookAhead();
        }
        return nextEntry != null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MVDataEntry next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        // Pop the look-ahead entry so that the next call builds a fresh one
        MVDataEntry res = nextEntry;
        nextEntry = null;
        return res;
    }

    /**
     * Puts the data source back at its start before a parser is attached to it.
     * <p>
     * Readers such as FileReader, or a BufferedReader that was never marked, reject
     * {@code reset()}. That is harmless for the first pass because nothing has been
     * read through this object yet, so the failure is ignored in that case only.
     *
     * @throws IOException if the source was already consumed and cannot be rewound
     */
    private void rewindDataSource() throws IOException {
        try {
            dataSourceStream.reset();
        } catch (IOException e) {
            if (sourceConsumed) {
                throw e;
            }
            // First pass on a non-resettable stream: read it from its current position
        }
    }

    /**
     * In-memory sorting of all the input lines (String natural order, duplicate lines
     * are kept only once). The given reader is always closed, even on read failure.
     * @param bufReader the CSV input, read line by line until its end
     * @return a Reader over the sorted lines, each one terminated by '\n'
     * @throws IOException if reading or closing the input fails
     */
    private Reader readAndSortLines(BufferedReader bufReader) throws IOException {
        // Put all the CSV in memory, in a SortedSet
        SortedSet<String> lineSet = new TreeSet<String>();
        int totalCSVSize = 0;
        try {
            String inputLine;
            while ((inputLine = bufReader.readLine()) != null) {
                // Only lines actually kept by the set count for the final buffer size
                if (lineSet.add(inputLine)) {
                    totalCSVSize += inputLine.length() + 1;
                }
            }
        } finally {
            bufReader.close(); // Closes also dataSourceStream
        }

        // Put all sorted lines in a single buffer
        StringBuilder allLines = new StringBuilder(totalCSVSize);
        for (String line : lineSet) {
            allLines.append(line).append('\n');
        }
        lineSet = null; // Could help the GC if the input file is huge

        // Build a Java Reader from that buffer
        return new StringReader(allLines.toString());
    }

    /**
     * Builds the next MVDataEntry into {@link #nextEntry}, or leaves it null when the
     * CSV data is exhausted. Does nothing if an entry is already waiting.
     * <p>
     * A MVDataEntry could be represented on many CSV lines.
     * The key is repeated, the attr could change, the values should change (for given key/attr pair).
     * The first record carrying a different key is kept in {@link #nextCSVRecord} for the next call.
     */
    private void lookAhead() {
        if (nextEntry != null) {
            return; // Already looked ahead, keep the pending entry untouched
        }

        MVDataEntry currEntry = null;
        while (true) {
            // Try to get a valid CSVRecord, unless one is left over from the previous call
            if (nextCSVRecord == null) {
                nextCSVRecord = nextValidCSVRecord();
                if (nextCSVRecord == null) {
                    break; // No more CSV data, currEntry (if any) is complete
                }
            }

            String newKey = nextCSVRecord.get("key");
            if (currEntry == null) {
                // First record of a new entry
                currEntry = new MVDataEntry(newKey);
            } else if (!currEntry.getKey().equals(newKey)) {
                // Keys are different, we are done (and we have remaining CSV data in nextCSVRecord)
                break;
            }

            currEntry.splitAndPut(nextCSVRecord.get("attr"), nextCSVRecord.get("values"), VALUES_SEPARATOR);
            nextCSVRecord = null; // Record consumed
        }

        nextEntry = currEntry;
    }

    /**
     * Seeks the next valid record in the CSV data, skipping records whose key is
     * missing or empty.
     * @return the next valid CSVRecord, or null if the end of the data is reached
     */
    private CSVRecord nextValidCSVRecord() {
        // NOTE(review): the parser iterator never returns null; per the original author's
        // note, read failures surface as unchecked exceptions thrown by hasNext()/next().
        while (csvIt.hasNext()) {
            CSVRecord candidate = csvIt.next();
            String key = candidate.get("key");
            if (key != null && !key.isEmpty()) {
                return candidate;
            }
        }
        return null;
    }
}