Java tutorial
/*
 * SSSync, a Simple and Stupid Synchronizer for data with multi-valued attributes
 * Copyright (C) 2014  Ludovic Pouzenc <ludovic@pouzenc.fr>
 *
 * This file is part of SSSync.
 *
 *  SSSync is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  SSSync is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with SSSync.  If not, see <http://www.gnu.org/licenses/>
 */

package data.io.csv;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import data.MVDataEntry;
import data.io.AbstractMVDataReader;

/**
 * Stream-oriented reader from a particular CSV file.
 * Always returns lines/items sorted by lexicographical ascending key.
 * <p>
 * Each CSV line carries a key, an attribute name and a ';'-separated list of values.
 * Consecutive lines sharing the same key are merged into a single {@link MVDataEntry}.
 * <p>
 * The instance is its own iterator: multiple simultaneous iterators on the same
 * instance are not supported.
 *
 * @author lpouzenc
 */
public class CSVDataReader extends AbstractMVDataReader {

    /** Small unsorted sample, without header line (the header is supplied by {@link #DEFAULT_CSV_FORMAT}). */
    public static final String CSV_DEMO =
        //"key,attr,values\n" +
        "line3,hello,all;the;others\n" +
        "line1,from,csv1;csv1bis\n" +
        "line2,hello,all;the;world\n" +
        "line1,attr2,csv1\n" +
        ",,\n";

    /** Format used when none is given: Excel dialect, fixed header, surrounding spaces ignored. */
    public static final CSVFormat DEFAULT_CSV_FORMAT = CSVFormat.EXCEL
        .withHeader("key", "attr", "values")
        .withIgnoreSurroundingSpaces(true);

    private final CSVFormat format;
    private final Reader dataSourceStream;

    /** Entry already assembled by {@link #lookAhead()} and not yet handed out by {@link #next()}. */
    private transient MVDataEntry nextEntry;
    /** Record already read from the parser but not yet merged into an entry. */
    private transient CSVRecord nextCSVRecord;
    private transient Iterator<CSVRecord> csvIt;
    /** True once {@link #iterator()} has successfully been called at least once. */
    private transient boolean iterationStarted;

    /**
     * Constructs a CSVDataReader object for parsing a CSV input given via dataSourceStream.
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If false, memory cost is around 3 times the CSV file size !
     * @param format Specify the exact format used to encode the CSV file (separators, escaping...)
     * @throws IOException if the input cannot be read while sorting it in memory
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted, CSVFormat format) throws IOException {
        this.dataSourceName = dataSourceName;
        this.format = format;

        if (alreadySorted) {
            // Input is trusted to be sorted: read it directly
            this.dataSourceStream = dataSourceStream;
        } else {
            // Input must be sorted first: slurp it line by line, which needs a BufferedReader
            BufferedReader bufReader = (dataSourceStream instanceof BufferedReader)
                ? (BufferedReader) dataSourceStream
                : new BufferedReader(dataSourceStream);
            this.dataSourceStream = readAndSortLines(bufReader);
        }
    }

    /**
     * Constructs a CSVDataReader object with default CSV format (for CSVParser).
     * @param dataSourceName A short string representing this reader (for logging)
     * @param dataSourceStream A java.io.Reader from which read the actual CSV data, typically a FileReader
     * @param alreadySorted If false, memory cost is around 3 times the CSV file size !
     * @throws IOException if the input cannot be read while sorting it in memory
     */
    public CSVDataReader(String dataSourceName, Reader dataSourceStream, boolean alreadySorted) throws IOException {
        this(dataSourceName, dataSourceStream, alreadySorted, DEFAULT_CSV_FORMAT);
    }

    /**
     * {@inheritDoc}
     * Note : multiple iterators on the same instance are not supported
     *
     * @throws RuntimeException if the CSV parser cannot be created, or if a new iteration is
     *         requested after a previous one and the underlying reader cannot be rewound
     */
    @Override
    public Iterator<MVDataEntry> iterator() {
        // When a new iterator is requested, everything should be reset
        try {
            dataSourceStream.reset();
        } catch (IOException e) {
            // Readers without mark support (FileReader...) or never marked (BufferedReader)
            // refuse reset(). That is harmless for the very first iteration, but a later
            // iteration would silently start from wherever the previous one stopped.
            if (iterationStarted) {
                throw new RuntimeException("Data source '" + dataSourceName
                    + "' cannot be rewound for a new iteration", e);
            }
        }

        CSVParser parser;
        try {
            parser = new CSVParser(dataSourceStream, format);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        iterationStarted = true;
        csvIt = parser.iterator();
        nextCSVRecord = null;
        nextEntry = null;

        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean hasNext() {
        if (nextEntry == null) {
            lookAhead();
        }
        return (nextEntry != null);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MVDataEntry next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        // Pop the lookahead record
        MVDataEntry res = nextEntry;
        nextEntry = null;
        // And return it
        return res;
    }

    /**
     * In-memory file sorting: reads every line, sorts them and exposes the result as a new Reader.
     * <p>
     * Lines are ordered as raw text (natural String order) and identical lines are kept only
     * once, because they are collected in a sorted set. The given reader is always closed,
     * even if reading fails.
     * <p>
     * NOTE(review): raw-line order may differ from key order when keys contain characters
     * sorting below the separator, or when a quoted value spans several lines — confirm that
     * unsorted inputs never do that.
     *
     * @param bufReader the source to read entirely and to close
     * @return a Reader over the sorted lines, each terminated by '\n'
     * @throws IOException if reading or closing the source fails
     */
    private Reader readAndSortLines(BufferedReader bufReader) throws IOException {
        // Put all the CSV in memory, in a SortedSet
        SortedSet<String> lineSet = new TreeSet<String>();
        int totalCSVSize = 0;
        try {
            String inputLine;
            while ((inputLine = bufReader.readLine()) != null) {
                lineSet.add(inputLine);
                totalCSVSize += inputLine.length() + 1;
            }
        } finally {
            bufReader.close(); // Closes also dataSourceStream
        }

        // Put all sorted lines in a String
        StringBuilder allLines = new StringBuilder(totalCSVSize);
        for (String line : lineSet) {
            allLines.append(line).append('\n');
        }
        lineSet = null; // Could help the GC if the input file is huge

        // Build a Java Reader from that String
        return new StringReader(allLines.toString());
    }

    /**
     * Assembles the next entry into {@code nextEntry}, or leaves it null when no data remains.
     * <p>
     * A MVDataEntry could be represented on many CSV lines.
     * The key is repeated, the attr could change, the values should change (for given key/attr pair)
     */
    private void lookAhead() {
        if (nextEntry != null) {
            return; // Already looked ahead
        }

        MVDataEntry currEntry = null;
        while (true) {
            // Try to get a valid CSVRecord (unless one is left over from the previous call)
            if (nextCSVRecord == null) {
                nextCSVRecord = nextValidCSVRecord();
            }
            // If no more CSV data, return whatever has been accumulated (possibly nothing)
            if (nextCSVRecord == null) {
                break;
            }

            // Now we have a valid CSV line to put in a MVDataEntry
            String newKey = nextCSVRecord.get("key");
            if (currEntry == null) {
                // No MVDataEntry yet, it's time to create it (we have data to put into)
                currEntry = new MVDataEntry(newKey);
            } else if (!currEntry.getKey().equals(newKey)) {
                // Keys are different, we are done (and the record stays in nextCSVRecord for next time)
                break;
            }

            // CSV line key matches MVDataEntry key, appends attr/values on it
            currEntry.splitAndPut(nextCSVRecord.get("attr"), nextCSVRecord.get("values"), ";");
            nextCSVRecord = null; // Record consumed
        }

        nextEntry = currEntry;
    }

    /**
     * Seek for the next valid record in the CSV file, skipping records whose key is null or empty.
     * Read errors are not handled here: per the original TODO, the CSV iterator reports them
     * as an IOException wrapped in a RuntimeException.
     * @return the next valid CSVRecord, or null when the CSV data is exhausted
     */
    private CSVRecord nextValidCSVRecord() {
        while (csvIt.hasNext()) {
            CSVRecord candidate = csvIt.next();
            if (candidate == null) {
                return null; // Nothing readable
            }
            String key = candidate.get("key");
            if (key != null && !key.isEmpty()) {
                return candidate;
            }
            // Invalid or empty line: keep seeking
        }
        return null;
    }
}