// Java tutorial
/* * Java CSV is a stream based library for reading and writing * CSV and other delimited data. * * Copyright (C) Bruce Dunwiddie bruce@csvreader.com * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.nio.charset.Charset; import java.text.NumberFormat; import java.util.HashMap; /** * A stream based parser for parsing delimited text data from a file or a * stream. 
*/
public class CsvReader {
    // Character source being parsed. When the reader is constructed from a
    // file name this stays null until the first buffer fill opens the file.
    private Reader inputStream = null;

    // Path of the source file; only set by the file based constructors.
    private String fileName = null;

    // this holds all the values for switches that the user is allowed to set
    private UserSettings userSettings = new UserSettings();

    // Charset used to open the file named by fileName; cleared again once
    // the underlying reader has been initialized.
    private Charset charset = null;

    // Switched on by setRecordDelimiter(char); when false, CR and LF are
    // treated as the record delimiters.
    private boolean useCustomRecordDelimiter = false;

    // this will be our working buffer to hold data chunks
    // read in from the data file
    private DataBuffer dataBuffer = new DataBuffer();

    // Collects the pieces of the current column that can not be taken
    // directly from dataBuffer (buffer refills, escapes).
    private ColumnBuffer columnBuffer = new ColumnBuffer();

    // Collects the raw text of the current record across buffer refills
    // when raw record capture is enabled.
    private RawRecordBuffer rawBuffer = new RawRecordBuffer();

    // Per column flag for the current record: whether the column started
    // with the text qualifier.
    private boolean[] isQualified = null;

    // Value handed out by getRawRecord().
    private String rawRecord = "";

    // Header names plus the name to index lookup.
    private HeadersHolder headersHolder = new HeadersHolder();

    // these are all more or less global loop variables
    // to keep from needing to pass them all into various
    // methods during parsing
    private boolean startedColumn = false;

    private boolean startedWithQualifier = false;

    private boolean hasMoreData = true;

    private char lastLetter = '\0';

    private boolean hasReadNextLine = false;

    // Number of columns found so far in the current record.
    private int columnsCount = 0;

    // Count of records read; getCurrentRecord() reports this minus one.
    private long currentRecord = 0;

    // Column values of the current record; only the first columnsCount
    // entries are meaningful.
    private String[] values = new String[StaticSettings.INITIAL_COLUMN_COUNT];

    // Whether inputStream is ready to be read from.
    private boolean initialized = false;

    private boolean closed = false;

    /**
     * Double up the text qualifier to represent an occurrence of the text
     * qualifier.
     */
    public static final int ESCAPE_MODE_DOUBLED = 1;

    /**
     * Use a backslash character before the text qualifier to represent an
     * occurrence of the text qualifier.
     */
    public static final int ESCAPE_MODE_BACKSLASH = 2;

    /**
     * Creates a {@link com.csvreader.CsvReader CsvReader} object using a file
     * as the data source.
     *
     * @param fileName
     *            The path to the file to use as the data source.
     * @param delimiter
     *            The character to use as the column delimiter.
     * @param charset
     *            The {@link java.nio.charset.Charset Charset} to use while
     *            parsing the data.
*/ public CsvReader(String fileName, char delimiter, Charset charset) throws FileNotFoundException { if (fileName == null) { throw new IllegalArgumentException("Parameter fileName can not be null."); } if (charset == null) { throw new IllegalArgumentException("Parameter charset can not be null."); } if (!new File(fileName).exists()) { throw new FileNotFoundException("File " + fileName + " does not exist."); } this.fileName = fileName; this.userSettings.Delimiter = delimiter; this.charset = charset; isQualified = new boolean[values.length]; } /** * Creates a {@link com.csvreader.CsvReader CsvReader} object using a file * as the data source. Uses ISO-8859-1 as the * {@link java.nio.charset.Charset Charset}. * * @param fileName * The path to the file to use as the data source. * @param delimiter * The character to use as the column delimiter. */ public CsvReader(String fileName, char delimiter) throws FileNotFoundException { this(fileName, delimiter, Charset.forName("ISO-8859-1")); } /** * Creates a {@link com.csvreader.CsvReader CsvReader} object using a file * as the data source. Uses a comma as the column delimiter and * ISO-8859-1 as the {@link java.nio.charset.Charset Charset}. * * @param fileName * The path to the file to use as the data source. */ public CsvReader(String fileName) throws FileNotFoundException { this(fileName, Letters.COMMA); } /** * Constructs a {@link com.csvreader.CsvReader CsvReader} object using a * {@link java.io.Reader Reader} object as the data source. * * @param inputStream * The stream to use as the data source. * @param delimiter * The character to use as the column delimiter. 
*/ public CsvReader(Reader inputStream, char delimiter) { if (inputStream == null) { throw new IllegalArgumentException("Parameter inputStream can not be null."); } this.inputStream = inputStream; this.userSettings.Delimiter = delimiter; initialized = true; isQualified = new boolean[values.length]; } /** * Constructs a {@link com.csvreader.CsvReader CsvReader} object using a * {@link java.io.Reader Reader} object as the data source. Uses a * comma as the column delimiter. * * @param inputStream * The stream to use as the data source. */ public CsvReader(Reader inputStream) { this(inputStream, Letters.COMMA); } /** * Constructs a {@link com.csvreader.CsvReader CsvReader} object using an * {@link java.io.InputStream InputStream} object as the data source. * * @param inputStream * The stream to use as the data source. * @param delimiter * The character to use as the column delimiter. * @param charset * The {@link java.nio.charset.Charset Charset} to use while * parsing the data. */ public CsvReader(InputStream inputStream, char delimiter, Charset charset) { this(new InputStreamReader(inputStream, charset), delimiter); } /** * Constructs a {@link com.csvreader.CsvReader CsvReader} object using an * {@link java.io.InputStream InputStream} object as the data * source. Uses a comma as the column delimiter. * * @param inputStream * The stream to use as the data source. * @param charset * The {@link java.nio.charset.Charset Charset} to use while * parsing the data. */ public CsvReader(InputStream inputStream, Charset charset) { this(new InputStreamReader(inputStream, charset)); } public boolean getCaptureRawRecord() { return userSettings.CaptureRawRecord; } public void setCaptureRawRecord(boolean captureRawRecord) { userSettings.CaptureRawRecord = captureRawRecord; } public String getRawRecord() { return rawRecord; } /** * Gets whether leading and trailing whitespace characters are being trimmed * from non-textqualified column data. Default is true. 
* * @return Whether leading and trailing whitespace characters are being * trimmed from non-textqualified column data. */ public boolean getTrimWhitespace() { return userSettings.TrimWhitespace; } /** * Sets whether leading and trailing whitespace characters should be trimmed * from non-textqualified column data or not. Default is true. * * @param trimWhitespace * Whether leading and trailing whitespace characters should be * trimmed from non-textqualified column data or not. */ public void setTrimWhitespace(boolean trimWhitespace) { userSettings.TrimWhitespace = trimWhitespace; } /** * Gets the character being used as the column delimiter. Default is comma, * ','. * * @return The character being used as the column delimiter. */ public char getDelimiter() { return userSettings.Delimiter; } /** * Sets the character to use as the column delimiter. Default is comma, ','. * * @param delimiter * The character to use as the column delimiter. */ public void setDelimiter(char delimiter) { userSettings.Delimiter = delimiter; } public char getRecordDelimiter() { return userSettings.RecordDelimiter; } /** * Sets the character to use as the record delimiter. * * @param recordDelimiter * The character to use as the record delimiter. Default is * combination of standard end of line characters for Windows, * Unix, or Mac. */ public void setRecordDelimiter(char recordDelimiter) { useCustomRecordDelimiter = true; userSettings.RecordDelimiter = recordDelimiter; } /** * Gets the character to use as a text qualifier in the data. * * @return The character to use as a text qualifier in the data. */ public char getTextQualifier() { return userSettings.TextQualifier; } /** * Sets the character to use as a text qualifier in the data. * * @param textQualifier * The character to use as a text qualifier in the data. */ public void setTextQualifier(char textQualifier) { userSettings.TextQualifier = textQualifier; } /** * Whether text qualifiers will be used while parsing or not. 
* * @return Whether text qualifiers will be used while parsing or not. */ public boolean getUseTextQualifier() { return userSettings.UseTextQualifier; } /** * Sets whether text qualifiers will be used while parsing or not. * * @param useTextQualifier * Whether to use a text qualifier while parsing or not. */ public void setUseTextQualifier(boolean useTextQualifier) { userSettings.UseTextQualifier = useTextQualifier; } /** * Gets the character being used as a comment signal. * * @return The character being used as a comment signal. */ public char getComment() { return userSettings.Comment; } /** * Sets the character to use as a comment signal. * * @param comment * The character to use as a comment signal. */ public void setComment(char comment) { userSettings.Comment = comment; } /** * Gets whether comments are being looked for while parsing or not. * * @return Whether comments are being looked for while parsing or not. */ public boolean getUseComments() { return userSettings.UseComments; } /** * Sets whether comments are being looked for while parsing or not. * * @param useComments * Whether comments are being looked for while parsing or not. */ public void setUseComments(boolean useComments) { userSettings.UseComments = useComments; } /** * Gets the current way to escape an occurance of the text qualifier inside * qualified data. * * @return The current way to escape an occurance of the text qualifier * inside qualified data. */ public int getEscapeMode() { return userSettings.EscapeMode; } /** * Sets the current way to escape an occurance of the text qualifier inside * qualified data. * * @param escapeMode * The way to escape an occurance of the text qualifier inside * qualified data. * @exception IllegalArgumentException * When an illegal value is specified for escapeMode. 
*/ public void setEscapeMode(int escapeMode) throws IllegalArgumentException { if (escapeMode != ESCAPE_MODE_DOUBLED && escapeMode != ESCAPE_MODE_BACKSLASH) { throw new IllegalArgumentException("Parameter escapeMode must be a valid value."); } userSettings.EscapeMode = escapeMode; } public boolean getSkipEmptyRecords() { return userSettings.SkipEmptyRecords; } public void setSkipEmptyRecords(boolean skipEmptyRecords) { userSettings.SkipEmptyRecords = skipEmptyRecords; } /** * Safety caution to prevent the parser from using large amounts of memory * in the case where parsing settings like file encodings don't end up * matching the actual format of a file. This switch can be turned off if * the file format is known and tested. With the switch off, the max column * lengths and max column count per record supported by the parser will * greatly increase. Default is true. * * @return The current setting of the safety switch. */ public boolean getSafetySwitch() { return userSettings.SafetySwitch; } /** * Safety caution to prevent the parser from using large amounts of memory * in the case where parsing settings like file encodings don't end up * matching the actual format of a file. This switch can be turned off if * the file format is known and tested. With the switch off, the max column * lengths and max column count per record supported by the parser will * greatly increase. Default is true. * * @param safetySwitch */ public void setSafetySwitch(boolean safetySwitch) { userSettings.SafetySwitch = safetySwitch; } /** * Gets the count of columns found in this record. * * @return The count of columns found in this record. */ public int getColumnCount() { return columnsCount; } /** * Gets the index of the current record. * * @return The index of the current record. */ public long getCurrentRecord() { return currentRecord - 1; } /** * Gets the count of headers read in by a previous call to * {@link com.csvreader.CsvReader#readHeaders readHeaders()}. 
* * @return The count of headers read in by a previous call to * {@link com.csvreader.CsvReader#readHeaders readHeaders()}. */ public int getHeaderCount() { return headersHolder.Length; } /** * Returns the header values as a string array. * * @return The header values as a String array. * @exception IOException * Thrown if this object has already been closed. */ public String[] getHeaders() throws IOException { checkClosed(); if (headersHolder.Headers == null) { return null; } else { // use clone here to prevent the outside code from // setting values on the array directly, which would // throw off the index lookup based on header name String[] clone = new String[headersHolder.Length]; System.arraycopy(headersHolder.Headers, 0, clone, 0, headersHolder.Length); return clone; } } public void setHeaders(String[] headers) { headersHolder.Headers = headers; headersHolder.IndexByName.clear(); if (headers != null) { headersHolder.Length = headers.length; } else { headersHolder.Length = 0; } // use headersHolder.Length here in case headers is null for (int i = 0; i < headersHolder.Length; i++) { headersHolder.IndexByName.put(headers[i], Integer.valueOf(i)); } } public String[] getValues() throws IOException { checkClosed(); // need to return a clone, and can't use clone because values.Length // might be greater than columnsCount String[] clone = new String[columnsCount]; System.arraycopy(values, 0, clone, 0, columnsCount); return clone; } /** * Returns the current column value for a given column index. * * @param columnIndex * The index of the column. * @return The current column value. * @exception IOException * Thrown if this object has already been closed. */ public String get(int columnIndex) throws IOException { checkClosed(); if (columnIndex > -1 && columnIndex < columnsCount) { return values[columnIndex]; } else { return ""; } } /** * Returns the current column value for a given column header name. * * @param headerName * The header name of the column. 
* @return The current column value, or an empty string when no header
     *         with that name is known.
     * @exception IOException
     *                Thrown if this object has already been closed.
     */
    public String get(String headerName) throws IOException {
        checkClosed();

        // getIndex reports -1 for an unknown name, which get(int) answers
        // with an empty string
        return get(getIndex(headerName));
    }

    /**
     * Creates a {@link com.csvreader.CsvReader CsvReader} object using a string
     * of data as the source. The string is read through a
     * {@link java.io.StringReader StringReader}, so no character decoding is
     * involved, and a comma is used as the column delimiter.
     *
     * @param data
     *            The String of data to use as the source.
     * @return A {@link com.csvreader.CsvReader CsvReader} object using the
     *         String of data as the source.
     * @exception IllegalArgumentException
     *                When data is null.
     */
    public static CsvReader parse(String data) {
        if (data == null) {
            throw new IllegalArgumentException("Parameter data can not be null.");
        }

        return new CsvReader(new StringReader(data));
    }

    /**
     * Reads another record. Walks the data buffer one character at a time,
     * closing columns with endColumn() and the record with endRecord(), and
     * refills the buffer through checkDataLength() whenever it runs dry.
     *
     * @return Whether another record was successfully read or not.
     * @exception IOException
     *                Thrown if an error occurs while reading data from the
     *                source stream, if a column exceeds the safety limit
     *                while the safety switch is on, or if this object has
     *                already been closed.
     */
    public boolean readRecord() throws IOException {
        checkClosed();

        // reset the per record state
        columnsCount = 0;
        rawBuffer.Position = 0;

        dataBuffer.LineStart = dataBuffer.Position;

        hasReadNextLine = false;

        // check to see if we've already found the end of data

        if (hasMoreData) {
            // loop over the data stream until the end of data is found
            // or the end of the record is found

            do {
                if (dataBuffer.Position == dataBuffer.Count) {
                    // buffer exhausted, pull in the next chunk
                    checkDataLength();
                } else {
                    startedWithQualifier = false;

                    // grab the current letter as a char

                    char currentLetter = dataBuffer.Buffer[dataBuffer.Position];

                    if (userSettings.UseTextQualifier && currentLetter == userSettings.TextQualifier) {
                        // this will be a text qualified column, so
                        // we need to set startedWithQualifier to make it
                        // enter the separate branch to handle text
                        // qualified columns

                        lastLetter = currentLetter;

                        // read qualified
                        startedColumn = true;
                        dataBuffer.ColumnStart = dataBuffer.Position + 1;
                        startedWithQualifier = true;
                        boolean lastLetterWasQualifier = false;

                        // in doubled mode the qualifier escapes itself
                        char escapeChar = userSettings.TextQualifier;

                        if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH) {
                            escapeChar = Letters.BACKSLASH;
                        }

                        boolean eatingTrailingJunk = false;
                        boolean lastLetterWasEscape = false;
                        boolean readingComplexEscape = false;
                        int escape = ComplexEscape.UNICODE;
                        int escapeLength = 0;
                        char escapeValue = (char) 0;

                        // step over the opening qualifier
                        dataBuffer.Position++;

                        do {
                            if (dataBuffer.Position == dataBuffer.Count) {
                                checkDataLength();
                            } else {
                                // grab the current letter as a char

                                currentLetter = dataBuffer.Buffer[dataBuffer.Position];

                                if (eatingTrailingJunk) {
                                    // characters between the closing qualifier
                                    // and the next delimiter are dropped
                                    dataBuffer.ColumnStart = dataBuffer.Position + 1;

                                    if (currentLetter == userSettings.Delimiter) {
                                        endColumn();
                                    } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
                                            || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
                                        endColumn();

                                        endRecord();
                                    }
                                } else if (readingComplexEscape) {
                                    // accumulate one more digit of a numeric
                                    // escape; each kind has a fixed digit count
                                    escapeLength++;

                                    switch (escape) {
                                    case ComplexEscape.UNICODE:
                                        escapeValue *= (char) 16;
                                        escapeValue += hexToDec(currentLetter);

                                        if (escapeLength == 4) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    case ComplexEscape.OCTAL:
                                        escapeValue *= (char) 8;
                                        escapeValue += (char) (currentLetter - '0');

                                        if (escapeLength == 3) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    case ComplexEscape.DECIMAL:
                                        escapeValue *= (char) 10;
                                        escapeValue += (char) (currentLetter - '0');

                                        if (escapeLength == 3) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    case ComplexEscape.HEX:
                                        escapeValue *= (char) 16;
                                        escapeValue += hexToDec(currentLetter);

                                        if (escapeLength == 2) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    }

                                    if (!readingComplexEscape) {
                                        appendLetter(escapeValue);
                                    } else {
                                        dataBuffer.ColumnStart = dataBuffer.Position + 1;
                                    }
                                } else if (currentLetter == userSettings.TextQualifier) {
                                    if (lastLetterWasEscape) {
                                        // escaped qualifier, it is part of the data
                                        lastLetterWasEscape = false;
                                        lastLetterWasQualifier = false;
                                    } else {
                                        updateCurrentValue();

                                        if (userSettings.EscapeMode == ESCAPE_MODE_DOUBLED) {
                                            lastLetterWasEscape = true;
                                        }

                                        lastLetterWasQualifier = true;
                                    }
                                } else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH && lastLetterWasEscape) {
                                    // letter following a backslash
                                    switch (currentLetter) {
                                    case 'n':
                                        appendLetter(Letters.LF);
                                        break;
                                    case 'r':
                                        appendLetter(Letters.CR);
                                        break;
                                    case 't':
                                        appendLetter(Letters.TAB);
                                        break;
                                    case 'b':
                                        appendLetter(Letters.BACKSPACE);
                                        break;
                                    case 'f':
                                        appendLetter(Letters.FORM_FEED);
                                        break;
                                    case 'e':
                                        appendLetter(Letters.ESCAPE);
                                        break;
                                    case 'v':
                                        appendLetter(Letters.VERTICAL_TAB);
                                        break;
                                    case 'a':
                                        appendLetter(Letters.ALERT);
                                        break;
                                    case '0':
                                    case '1':
                                    case '2':
                                    case '3':
                                    case '4':
                                    case '5':
                                    case '6':
                                    case '7':
                                        // bare digit starts an octal escape and
                                        // already counts as its first digit
                                        escape = ComplexEscape.OCTAL;
                                        readingComplexEscape = true;
                                        escapeLength = 1;
                                        escapeValue = (char) (currentLetter - '0');
                                        dataBuffer.ColumnStart = dataBuffer.Position + 1;
                                        break;
                                    case 'u':
                                    case 'x':
                                    case 'o':
                                    case 'd':
                                    case 'U':
                                    case 'X':
                                    case 'O':
                                    case 'D':
                                        switch (currentLetter) {
                                        case 'u':
                                        case 'U':
                                            escape = ComplexEscape.UNICODE;
                                            break;
                                        case 'x':
                                        case 'X':
                                            escape = ComplexEscape.HEX;
                                            break;
                                        case 'o':
                                        case 'O':
                                            escape = ComplexEscape.OCTAL;
                                            break;
                                        case 'd':
                                        case 'D':
                                            escape = ComplexEscape.DECIMAL;
                                            break;
                                        }

                                        readingComplexEscape = true;
                                        escapeLength = 0;
                                        escapeValue = (char) 0;
                                        dataBuffer.ColumnStart = dataBuffer.Position + 1;

                                        break;
                                    default:
                                        break;
                                    }

                                    lastLetterWasEscape = false;

                                    // the branch below can only happen for
                                    // ESCAPE_MODE_BACKSLASH, because in doubled
                                    // mode the escape char is the qualifier,
                                    // which is handled further up
                                } else if (currentLetter == escapeChar) {
                                    updateCurrentValue();
                                    lastLetterWasEscape = true;
                                } else {
                                    if (lastLetterWasQualifier) {
                                        // the previous qualifier closed the column
                                        if (currentLetter == userSettings.Delimiter) {
                                            endColumn();
                                        } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
                                                || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
                                            endColumn();

                                            endRecord();
                                        } else {
                                            dataBuffer.ColumnStart = dataBuffer.Position + 1;

                                            eatingTrailingJunk = true;
                                        }

                                        // make sure to clear the flag for next
                                        // run of the loop

                                        lastLetterWasQualifier = false;
                                    }
                                }

                                // keep track of the last letter because we need
                                // it for several key decisions

                                lastLetter = currentLetter;

                                if (startedColumn) {
                                    dataBuffer.Position++;

                                    if (userSettings.SafetySwitch
                                            && dataBuffer.Position - dataBuffer.ColumnStart + columnBuffer.Position > 100000) {
                                        close();

                                        throw new IOException("Maximum column length of 100,000 exceeded in column "
                                                + NumberFormat.getIntegerInstance().format(columnsCount)
                                                + " in record "
                                                + NumberFormat.getIntegerInstance().format(currentRecord)
                                                + ". Set the SafetySwitch property to false"
                                                + " if you're expecting column lengths greater than 100,000 characters to"
                                                + " avoid this error.");
                                    }
                                }
                            } // end else

                        } while (hasMoreData && startedColumn);
                    } else if (currentLetter == userSettings.Delimiter) {
                        // we encountered a column with no data, so
                        // just send the end column

                        lastLetter = currentLetter;

                        endColumn();
                    } else if (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter) {
                        // this will skip blank lines
                        if (startedColumn || columnsCount > 0 || !userSettings.SkipEmptyRecords) {
                            endColumn();

                            endRecord();
                        } else {
                            dataBuffer.LineStart = dataBuffer.Position + 1;
                        }

                        lastLetter = currentLetter;
                    } else if (!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF)) {
                        // this will skip blank lines; the last clause keeps the
                        // LF of a CR LF pair from counting as a record of its own
                        if (startedColumn
                                || columnsCount > 0
                                || (!userSettings.SkipEmptyRecords && (currentLetter == Letters.CR || lastLetter != Letters.CR))) {
                            endColumn();

                            endRecord();
                        } else {
                            dataBuffer.LineStart = dataBuffer.Position + 1;
                        }

                        lastLetter = currentLetter;
                    } else if (userSettings.UseComments && columnsCount == 0 && currentLetter == userSettings.Comment) {
                        // encountered a comment character at the beginning of
                        // the line so just ignore the rest of the line

                        lastLetter = currentLetter;

                        skipLine();
                    } else if (userSettings.TrimWhitespace && (currentLetter == Letters.SPACE || currentLetter == Letters.TAB)) {
                        // do nothing, this will trim leading whitespace
                        // for both text qualified columns and non

                        startedColumn = true;
                        dataBuffer.ColumnStart = dataBuffer.Position + 1;
                    } else {
                        // since the letter wasn't a special letter, this
                        // will be the first letter of our current column

                        startedColumn = true;
                        dataBuffer.ColumnStart = dataBuffer.Position;
                        boolean lastLetterWasBackslash = false;
                        boolean readingComplexEscape = false;
                        int escape = ComplexEscape.UNICODE;
                        int escapeLength = 0;
                        char escapeValue = (char) 0;

                        // the first pass reuses the letter fetched above
                        boolean firstLoop = true;

                        do {
                            if (!firstLoop && dataBuffer.Position == dataBuffer.Count) {
                                checkDataLength();
                            } else {
                                if (!firstLoop) {
                                    // grab the current letter as a char
                                    currentLetter = dataBuffer.Buffer[dataBuffer.Position];
                                }

                                if (!userSettings.UseTextQualifier && userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH
                                        && currentLetter == Letters.BACKSLASH) {
                                    if (lastLetterWasBackslash) {
                                        lastLetterWasBackslash = false;
                                    } else {
                                        updateCurrentValue();
                                        lastLetterWasBackslash = true;
                                    }
                                } else if (readingComplexEscape) {
                                    // accumulate one more digit of a numeric
                                    // escape; each kind has a fixed digit count
                                    escapeLength++;

                                    switch (escape) {
                                    case ComplexEscape.UNICODE:
                                        escapeValue *= (char) 16;
                                        escapeValue += hexToDec(currentLetter);

                                        if (escapeLength == 4) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    case ComplexEscape.OCTAL:
                                        escapeValue *= (char) 8;
                                        escapeValue += (char) (currentLetter - '0');

                                        if (escapeLength == 3) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    case ComplexEscape.DECIMAL:
                                        escapeValue *= (char) 10;
                                        escapeValue += (char) (currentLetter - '0');

                                        if (escapeLength == 3) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    case ComplexEscape.HEX:
                                        escapeValue *= (char) 16;
                                        escapeValue += hexToDec(currentLetter);

                                        if (escapeLength == 2) {
                                            readingComplexEscape = false;
                                        }

                                        break;
                                    }

                                    if (!readingComplexEscape) {
                                        appendLetter(escapeValue);
                                    } else {
                                        dataBuffer.ColumnStart = dataBuffer.Position + 1;
                                    }
                                } else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH && lastLetterWasBackslash) {
                                    // letter following a backslash
                                    switch (currentLetter) {
                                    case 'n':
                                        appendLetter(Letters.LF);
                                        break;
                                    case 'r':
                                        appendLetter(Letters.CR);
                                        break;
                                    case 't':
                                        appendLetter(Letters.TAB);
                                        break;
                                    case 'b':
                                        appendLetter(Letters.BACKSPACE);
                                        break;
                                    case 'f':
                                        appendLetter(Letters.FORM_FEED);
                                        break;
                                    case 'e':
                                        appendLetter(Letters.ESCAPE);
                                        break;
                                    case 'v':
                                        appendLetter(Letters.VERTICAL_TAB);
                                        break;
                                    case 'a':
                                        appendLetter(Letters.ALERT);
                                        break;
                                    case '0':
                                    case '1':
                                    case '2':
                                    case '3':
                                    case '4':
                                    case '5':
                                    case '6':
                                    case '7':
                                        // bare digit starts an octal escape and
                                        // already counts as its first digit
                                        escape = ComplexEscape.OCTAL;
                                        readingComplexEscape = true;
                                        escapeLength = 1;
                                        escapeValue = (char) (currentLetter - '0');
                                        dataBuffer.ColumnStart = dataBuffer.Position + 1;
                                        break;
                                    case 'u':
                                    case 'x':
                                    case 'o':
                                    case 'd':
                                    case 'U':
                                    case 'X':
                                    case 'O':
                                    case 'D':
                                        switch (currentLetter) {
                                        case 'u':
                                        case 'U':
                                            escape = ComplexEscape.UNICODE;
                                            break;
                                        case 'x':
                                        case 'X':
                                            escape = ComplexEscape.HEX;
                                            break;
                                        case 'o':
                                        case 'O':
                                            escape = ComplexEscape.OCTAL;
                                            break;
                                        case 'd':
                                        case 'D':
                                            escape = ComplexEscape.DECIMAL;
                                            break;
                                        }

                                        readingComplexEscape = true;
                                        escapeLength = 0;
                                        escapeValue = (char) 0;
                                        dataBuffer.ColumnStart = dataBuffer.Position + 1;

                                        break;
                                    default:
                                        break;
                                    }

                                    lastLetterWasBackslash = false;
                                } else {
                                    if (currentLetter == userSettings.Delimiter) {
                                        endColumn();
                                    } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
                                            || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
                                        endColumn();

                                        endRecord();
                                    }
                                }

                                // keep track of the last letter because we need
                                // it for several key decisions

                                lastLetter = currentLetter;
                                firstLoop = false;

                                if (startedColumn) {
                                    dataBuffer.Position++;

                                    if (userSettings.SafetySwitch
                                            && dataBuffer.Position - dataBuffer.ColumnStart + columnBuffer.Position > 100000) {
                                        close();

                                        throw new IOException("Maximum column length of 100,000 exceeded in column "
                                                + NumberFormat.getIntegerInstance().format(columnsCount)
                                                + " in record "
                                                + NumberFormat.getIntegerInstance().format(currentRecord)
                                                + ". Set the SafetySwitch property to false"
                                                + " if you're expecting column lengths greater than 100,000 characters to"
                                                + " avoid this error.");
                                    }
                                }
                            } // end else
                        } while (hasMoreData && startedColumn);
                    }

                    // step past the letter that was just handled
                    if (hasMoreData) {
                        dataBuffer.Position++;
                    }
                } // end else
            } while (hasMoreData && !hasReadNextLine);

            // check to see if we hit the end of the file
            // without processing the current record

            if (startedColumn || lastLetter == userSettings.Delimiter) {
                endColumn();

                endRecord();
            }
        }

        if (userSettings.CaptureRawRecord) {
            if (hasMoreData) {
                // the text still sitting in the data buffer ends one letter
                // before the current position
                if (rawBuffer.Position == 0) {
                    rawRecord = new String(dataBuffer.Buffer, dataBuffer.LineStart, dataBuffer.Position - dataBuffer.LineStart - 1);
                } else {
                    rawRecord = new String(rawBuffer.Buffer, 0, rawBuffer.Position)
                            + new String(dataBuffer.Buffer, dataBuffer.LineStart, dataBuffer.Position - dataBuffer.LineStart - 1);
                }
            } else {
                // for hasMoreData to ever be false, all data would have had to
                // have been
                // copied to the raw buffer
                rawRecord = new String(rawBuffer.Buffer, 0, rawBuffer.Position);
            }
        } else {
            rawRecord = "";
        }

        return hasReadNextLine;
    }

    /**
     * @exception IOException
     *                Thrown if an error occurs while reading data from the
     *                source stream.
*/ private void checkDataLength() throws IOException { if (!initialized) { if (fileName != null) { inputStream = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charset), StaticSettings.MAX_FILE_BUFFER_SIZE); } charset = null; initialized = true; } updateCurrentValue(); if (userSettings.CaptureRawRecord && dataBuffer.Count > 0) { if (rawBuffer.Buffer.length - rawBuffer.Position < dataBuffer.Count - dataBuffer.LineStart) { int newLength = rawBuffer.Buffer.length + Math.max(dataBuffer.Count - dataBuffer.LineStart, rawBuffer.Buffer.length); char[] holder = new char[newLength]; System.arraycopy(rawBuffer.Buffer, 0, holder, 0, rawBuffer.Position); rawBuffer.Buffer = holder; } System.arraycopy(dataBuffer.Buffer, dataBuffer.LineStart, rawBuffer.Buffer, rawBuffer.Position, dataBuffer.Count - dataBuffer.LineStart); rawBuffer.Position += dataBuffer.Count - dataBuffer.LineStart; } try { dataBuffer.Count = inputStream.read(dataBuffer.Buffer, 0, dataBuffer.Buffer.length); } catch (IOException ex) { close(); throw ex; } // if no more data could be found, set flag stating that // the end of the data was found if (dataBuffer.Count == -1) { hasMoreData = false; } dataBuffer.Position = 0; dataBuffer.LineStart = 0; dataBuffer.ColumnStart = 0; } /** * Read the first record of data as column headers. * * @return Whether the header record was successfully read or not. * @exception IOException * Thrown if an error occurs while reading data from the * source stream. 
*/ public boolean readHeaders() throws IOException { boolean result = readRecord(); // copy the header data from the column array // to the header string array headersHolder.Length = columnsCount; headersHolder.Headers = new String[columnsCount]; for (int i = 0; i < headersHolder.Length; i++) { String columnValue = get(i); headersHolder.Headers[i] = columnValue; // if there are duplicate header names, we will save the last one headersHolder.IndexByName.put(columnValue, Integer.valueOf(i)); } if (result) { currentRecord--; } columnsCount = 0; return result; } /** * Returns the column header value for a given column index. * * @param columnIndex * The index of the header column being requested. * @return The value of the column header at the given column index. * @exception IOException * Thrown if this object has already been closed. */ public String getHeader(int columnIndex) throws IOException { checkClosed(); // check to see if we have read the header record yet // check to see if the column index is within the bounds // of our header array if (columnIndex > -1 && columnIndex < headersHolder.Length) { // return the processed header data for this column return headersHolder.Headers[columnIndex]; } else { return ""; } } public boolean isQualified(int columnIndex) throws IOException { checkClosed(); if (columnIndex < columnsCount && columnIndex > -1) { return isQualified[columnIndex]; } else { return false; } } /** * @exception IOException * Thrown if a very rare extreme exception occurs during * parsing, normally resulting from improper data format. 
*/ private void endColumn() throws IOException { String currentValue = ""; // must be called before setting startedColumn = false if (startedColumn) { if (columnBuffer.Position == 0) { if (dataBuffer.ColumnStart < dataBuffer.Position) { int lastLetter = dataBuffer.Position - 1; if (userSettings.TrimWhitespace && !startedWithQualifier) { while (lastLetter >= dataBuffer.ColumnStart && (dataBuffer.Buffer[lastLetter] == Letters.SPACE || dataBuffer.Buffer[lastLetter] == Letters.TAB)) { lastLetter--; } } currentValue = new String(dataBuffer.Buffer, dataBuffer.ColumnStart, lastLetter - dataBuffer.ColumnStart + 1); } } else { updateCurrentValue(); int lastLetter = columnBuffer.Position - 1; if (userSettings.TrimWhitespace && !startedWithQualifier) { while (lastLetter >= 0 && (columnBuffer.Buffer[lastLetter] == Letters.SPACE || columnBuffer.Buffer[lastLetter] == Letters.SPACE)) { lastLetter--; } } currentValue = new String(columnBuffer.Buffer, 0, lastLetter + 1); } } columnBuffer.Position = 0; startedColumn = false; if (columnsCount >= 100000 && userSettings.SafetySwitch) { close(); throw new IOException("Maximum column count of 100,000 exceeded in record " + NumberFormat.getIntegerInstance().format(currentRecord) + ". 
Set the SafetySwitch property to false" + " if you're expecting more than 100,000 columns per record to" + " avoid this error."); } // check to see if our current holder array for // column chunks is still big enough to handle another // column chunk if (columnsCount == values.length) { // holder array needs to grow to be able to hold another column int newLength = values.length * 2; String[] holder = new String[newLength]; System.arraycopy(values, 0, holder, 0, values.length); values = holder; boolean[] qualifiedHolder = new boolean[newLength]; System.arraycopy(isQualified, 0, qualifiedHolder, 0, isQualified.length); isQualified = qualifiedHolder; } values[columnsCount] = currentValue; isQualified[columnsCount] = startedWithQualifier; currentValue = ""; columnsCount++; } private void appendLetter(char letter) { if (columnBuffer.Position == columnBuffer.Buffer.length) { int newLength = columnBuffer.Buffer.length * 2; char[] holder = new char[newLength]; System.arraycopy(columnBuffer.Buffer, 0, holder, 0, columnBuffer.Position); columnBuffer.Buffer = holder; } columnBuffer.Buffer[columnBuffer.Position++] = letter; dataBuffer.ColumnStart = dataBuffer.Position + 1; } private void updateCurrentValue() { if (startedColumn && dataBuffer.ColumnStart < dataBuffer.Position) { if (columnBuffer.Buffer.length - columnBuffer.Position < dataBuffer.Position - dataBuffer.ColumnStart) { int newLength = columnBuffer.Buffer.length + Math.max(dataBuffer.Position - dataBuffer.ColumnStart, columnBuffer.Buffer.length); char[] holder = new char[newLength]; System.arraycopy(columnBuffer.Buffer, 0, holder, 0, columnBuffer.Position); columnBuffer.Buffer = holder; } System.arraycopy(dataBuffer.Buffer, dataBuffer.ColumnStart, columnBuffer.Buffer, columnBuffer.Position, dataBuffer.Position - dataBuffer.ColumnStart); columnBuffer.Position += dataBuffer.Position - dataBuffer.ColumnStart; } dataBuffer.ColumnStart = dataBuffer.Position + 1; } /** * @exception IOException * Thrown if an error occurs 
while reading data from the * source stream. */ private void endRecord() throws IOException { // this flag is used as a loop exit condition // during parsing hasReadNextLine = true; currentRecord++; } /** * Gets the corresponding column index for a given column header name. * * @param headerName * The header name of the column. * @return The column index for the given column header name. Returns * -1 if not found. * @exception IOException * Thrown if this object has already been closed. */ public int getIndex(String headerName) throws IOException { checkClosed(); Integer indexValue = headersHolder.IndexByName.get(headerName); if (indexValue != null) { return indexValue.intValue(); } else { return -1; } } /** * Skips the next record of data by parsing each column. Does not * increment * {@link com.csvreader.CsvReader#getCurrentRecord getCurrentRecord()}. * * @return Whether another record was successfully skipped or not. * @exception IOException * Thrown if an error occurs while reading data from the * source stream. */ public boolean skipRecord() throws IOException { checkClosed(); boolean recordRead = false; if (hasMoreData) { recordRead = readRecord(); if (recordRead) { currentRecord--; } } return recordRead; } /** * Skips the next line of data using the standard end of line characters and * does not do any column delimited parsing. * * @return Whether a line was successfully skipped or not. * @exception IOException * Thrown if an error occurs while reading data from the * source stream. 
*/ public boolean skipLine() throws IOException { checkClosed(); // clear public column values for current line columnsCount = 0; boolean skippedLine = false; if (hasMoreData) { boolean foundEol = false; do { if (dataBuffer.Position == dataBuffer.Count) { checkDataLength(); } else { skippedLine = true; // grab the current letter as a char char currentLetter = dataBuffer.Buffer[dataBuffer.Position]; if (currentLetter == Letters.CR || currentLetter == Letters.LF) { foundEol = true; } // keep track of the last letter because we need // it for several key decisions lastLetter = currentLetter; if (!foundEol) { dataBuffer.Position++; } } // end else } while (hasMoreData && !foundEol); columnBuffer.Position = 0; dataBuffer.LineStart = dataBuffer.Position + 1; } rawBuffer.Position = 0; rawRecord = ""; return skippedLine; } /** * Closes and releases all related resources. */ public void close() { if (!closed) { close(true); closed = true; } } /** * */ private void close(boolean closing) { if (!closed) { if (closing) { charset = null; headersHolder.Headers = null; headersHolder.IndexByName = null; dataBuffer.Buffer = null; columnBuffer.Buffer = null; rawBuffer.Buffer = null; } try { if (initialized) { inputStream.close(); } } catch (Exception e) { // just eat the exception } inputStream = null; closed = true; } } /** * @exception IOException * Thrown if this object has already been closed. 
*/
    private void checkClosed() throws IOException {
        if (closed) {
            throw new IOException("This instance of the CsvReader class has already been closed.");
        }
    }

    /**
     * Closes the input stream when this object is finalized. Calls
     * close(false), so the buffers and header data are not set to null
     * here.
     */
    @Override
    protected void finalize() {
        close(false);
    }

    /**
     * Integer codes naming the kinds of multi character escape sequence.
     */
    private class ComplexEscape {
        private static final int UNICODE = 1;

        private static final int OCTAL = 2;

        private static final int DECIMAL = 3;

        private static final int HEX = 4;
    }

    /**
     * Converts a single hexadecimal digit character into its numeric value.
     * No validation is done: any character at or above 'a' is treated as a
     * lower case hex letter, any other character at or above 'A' as an upper
     * case hex letter, and everything else as a decimal digit.
     *
     * @param hex
     *            The hexadecimal digit character to convert.
     * @return The numeric value of the digit, carried in a char.
     */
    private static char hexToDec(char hex) {
        char result;

        if (hex >= 'a') {
            result = (char) (hex - 'a' + 10);
        } else if (hex >= 'A') {
            result = (char) (hex - 'A' + 10);
        } else {
            result = (char) (hex - '0');
        }

        return result;
    }

    private class DataBuffer {
        public char[] Buffer;

        public int Position;

        // / <summary>
        // / How much usable data has been read into the stream,
        // / which will not always be as long as Buffer.Length.
        // / </summary>
        public int Count;

        // / <summary>
        // / The position of the cursor in the buffer when the
        // / current column was started or the last time data
        // / was moved out to the column buffer.
// / </summary> public int ColumnStart; public int LineStart; public DataBuffer() { Buffer = new char[StaticSettings.MAX_BUFFER_SIZE]; Position = 0; Count = 0; ColumnStart = 0; LineStart = 0; } } private class ColumnBuffer { public char[] Buffer; public int Position; public ColumnBuffer() { Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE]; Position = 0; } } private class RawRecordBuffer { public char[] Buffer; public int Position; public RawRecordBuffer() { Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE * StaticSettings.INITIAL_COLUMN_COUNT]; Position = 0; } } private class Letters { public static final char LF = '\n'; public static final char CR = '\r'; public static final char QUOTE = '"'; public static final char COMMA = ','; public static final char SPACE = ' '; public static final char TAB = '\t'; public static final char POUND = '#'; public static final char BACKSLASH = '\\'; public static final char NULL = '\0'; public static final char BACKSPACE = '\b'; public static final char FORM_FEED = '\f'; public static final char ESCAPE = '\u001B'; // ASCII/ANSI escape public static final char VERTICAL_TAB = '\u000B'; public static final char ALERT = '\u0007'; } private class UserSettings { // having these as publicly accessible members will prevent // the overhead of the method call that exists on properties public boolean CaseSensitive; public char TextQualifier; public boolean TrimWhitespace; public boolean UseTextQualifier; public char Delimiter; public char RecordDelimiter; public char Comment; public boolean UseComments; public int EscapeMode; public boolean SafetySwitch; public boolean SkipEmptyRecords; public boolean CaptureRawRecord; public UserSettings() { CaseSensitive = true; TextQualifier = Letters.QUOTE; TrimWhitespace = true; UseTextQualifier = true; Delimiter = Letters.COMMA; RecordDelimiter = Letters.NULL; Comment = Letters.POUND; UseComments = false; EscapeMode = CsvReader.ESCAPE_MODE_DOUBLED; SafetySwitch = true; 
SkipEmptyRecords = true;
            CaptureRawRecord = true;
        }
    }

    /**
     * Holds the header names, their count and a lookup from header name to
     * column index. Starts with no headers, a length of zero and an empty
     * lookup map.
     */
    private class HeadersHolder {
        public String[] Headers;

        public int Length;

        public HashMap<String, Integer> IndexByName;

        public HeadersHolder() {
            Headers = null;
            Length = 0;
            IndexByName = new HashMap<String, Integer>();
        }
    }

    private class StaticSettings {
        // NOTE(review): an earlier comment here said these were static
        // instead of final so they could be changed in unit tests, but they
        // are declared static final and so cannot be reassigned.
        // This class is private, so it isn't visible outside CsvReader.
        public static final int MAX_BUFFER_SIZE = 1024;

        public static final int MAX_FILE_BUFFER_SIZE = 4 * 1024;

        public static final int INITIAL_COLUMN_COUNT = 10;

        public static final int INITIAL_COLUMN_BUFFER_SIZE = 50;
    }
}