org.ecoinformatics.datamanager.database.DelimitedReader.java Source code

Introduction

Here is the source code for org.ecoinformatics.datamanager.database.DelimitedReader.java
Source

/**
 *    '$RCSfile: DelimitedReader.java,v $'
 *
 *     '$Author: leinfelder $'
 *       '$Date: 2008-02-29 23:23:36 $'
 *   '$Revision: 1.9 $'
 *
 *  For Details: http://kepler.ecoinformatics.org
 *
 * Copyright (c) 2003 The Regents of the University of California.
 * All rights reserved.
 * 
 * Permission is hereby granted, without written agreement and without
 * license or royalty fees, to use, copy, modify, and distribute this
 * software and its documentation for any purpose, provided that the
 * above copyright notice and the following two paragraphs appear in
 * all copies of this software.
 * 
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
 * IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 * 
 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
 * OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT,
 * UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 */

package org.ecoinformatics.datamanager.database;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Vector;

//import org.apache.commons.logging.Log;
//import org.apache.commons.logging.LogFactory;

/**
 * tokenizes a delimited file.  This reader assumes that one record is on one
 * line which ends with the line
 */
public class DelimitedReader extends TextDataReader {

    /*
     * Instance fields
     */

    private String data;
    private InputStreamReader dataReader;
    private Vector[] lines;
    private Vector linesVector;
    private int numHeaderLines;
    private int numRecords;
    private boolean stripHeader = false;
    //private int currentRecord = 0;
    private int numCols;
    private String delimiter;
    private String lineEnding;
    private boolean collapseDelimiters = false;
    private int numFooterLines = 0;
    private Vector footerBuffer = new Vector();
    private boolean initializedFooterBuffer = false;
    private int headLineNumberCount = 0;
    private String quoteCharacter = null;
    private String literalCharacter = null;
    private boolean includeLiteralCharacter = false;

    /*private static Log log;
    static {
       log = LogFactory.getLog("org.ecoinformatics.util.DelimitedReader");
    }*/

    /*
     * Constructors
     */

    /**
     * Constructor. Reads the csv (comma-separated values) stream.
     * 
     * @param data           the delimited stream to read as a string
     * @param numCols        the number of columns in the stream
     * @param delimiter      the delimiter string to tokenize on
     * @param numHeaderLines the number of lines to skip at the top of the file
     * @param lineEnding     the line ending char(s)...either "\n" (Unix),
     *                       "\r\n" (Windows) or "\r" (Mac)
     * @param numRecords     the number of rows in the data string
     */
    public DelimitedReader(String data, int numCols, String delimiter, int numHeaderLines, String lineEnding,
            int numRecords) throws Exception {
        this.numHeaderLines = numHeaderLines;
        this.data = data;
        this.numCols = numCols;
        this.numRecords = numRecords;
        //log.debug("Delimiter is: " + delimiter);
        this.delimiter = unescapeDelimiter(delimiter);
        //log.debug("LineEnding is: " + lineEnding);
        this.lineEnding = unescapeDelimiter(lineEnding);

        //lines = new Vector[numRecords + numHeaderLines + 1];
        linesVector = new Vector();

        int begin = 0;
        int end = 0;
        //    int i = 0;

        while (end < data.length()) { //add each line of the string as an element in a vector
            end = data.indexOf(this.lineEnding, begin); //DFH 'this.' added

            if (end == -1) {
                end = data.length();
            }

            String line = data.substring(begin, end);

            if (!line.trim().equals("")) {
                //take off the line ending
                // MBJ: I commented out the next line as it was improperly truncating 
                // lines. I'm not sure why it was there in the first place, as the 
                // previous substring removed the delimiter
                //line = line.substring(0, line.length() - lineEnding.length());

                //split the line based on the delimiter
                Vector v = splitDelimitedRowStringIntoVector(line);
                /*String[] s = line.split(delimiter.trim(), numCols);
                Vector v = new Vector();
                    
                for(int j=0; j<s.length; j++)
                {
                  v.addElement(s[j]);
                }
                    
                if(v.size()  < numCols)
                {
                  int vsize = v.size();
                  for(int j=0; j<numCols - vsize; j++)
                  { //add any elements that aren't there so that all the records have the
                    //same number of cols
                    v.addElement("");
                  }
                }*/

                //lines[i] = v;
                linesVector.add(v);
                //        i++;
            }

            //go to the next line
            begin = end + this.lineEnding.length(); //DFH  'this.' added
        }

        int records = linesVector.size();

        if (records != this.numRecords) {
            this.numRecords = records;
            //log.warn("Metadata disagrees with actual data. Changing number of records to: " + records);
        }

        lines = new Vector[records];

        for (int k = 0; k < records; k++) {
            lines[k] = (Vector) linesVector.get(k);
        }
        /*
            for(int j=0; j<lines.length; j++)
            {
              if(lines[j] == null)
              {
                lines[j] = new Vector();
              }
            }
        */

    }

    /**
     * This constructor will read delimited data from stream rather a string.
     * 
     * @param dataStream     InputStream  The input stream
     * @param numCols        int the number of columns
     * @param delimiter      String the delimiter to tokenize on
     * @param numHeaderLines int numHeaderLines the number of lines to skip at the
     *                       top of the file
     * @param lineEnding     String lineEnding the line ending char(s)...either 
     *                       "\n" (Unix),"\r\n" (Windows) or "\r" (Mac)
     * @param numRecords     int number of rows in the input stream
     */
    public DelimitedReader(InputStream dataStream, int numCols, String delimiter, int numHeaderLines,
            String lineEnding, int numRecords, boolean stripHeader) {
        this.dataReader = new InputStreamReader(dataStream);
        this.numHeaderLines = numHeaderLines;
        this.numCols = numCols;
        this.numRecords = numRecords;
        //log.debug("Delimiter is: " + delimiter);
        this.delimiter = unescapeDelimiter(delimiter);
        //log.debug("LineEnding is: " + lineEnding);
        this.lineEnding = unescapeDelimiter(lineEnding);
        this.stripHeader = stripHeader;

    }

    /*
     * Class methods
     */

    /**
     * Convert a string-escaped representation of a delimiter character into
     * an actual String for that delimiter.  This is used for translating
     * escaped versions of tab, newline, and carriage return characters to
     * their real character values.
     * 
     * @param delimiter the String representing the delimiter
     * @return the actual String for the delimiter
     */
    public static String unescapeDelimiter(String delimiter) {
        String newDelimiter = delimiter;

        if (delimiter == null) {
            //log.debug("Delimiter is null and we set up to \n.");
            newDelimiter = "\n";
        } else if (delimiter.equals("\\t")) {
            //log.debug("Tab interpreted incorrectly as string.");
            newDelimiter = "\t";
        } else if (delimiter.equals("\\n")) {
            //log.debug("Newline interpreted incorrectly as string.");
            newDelimiter = "\n";
        } else if (delimiter.equals("\\r")) {
            //log.debug("CR interpreted incorrectly as string.");
            newDelimiter = "\r";
        } else if (delimiter.equals("\\r\\n")) {
            //log.debug("CRNL interpreted incorrectly as string.");
            newDelimiter = "\r\n";
        } else if (delimiter.startsWith("#")) {
            //log.debug("XML entity charactor.");
            String digits = delimiter.substring(1, delimiter.length());
            int radix = 10;
            if (digits.startsWith("x")) {
                //log.debug("Radix is "+ 16);
                radix = 16;
                digits = digits.substring(1, digits.length());
            }
            //log.debug("Int value of  delimiter is "+digits);

            newDelimiter = transferDigitsToCharString(radix, digits);

        } else if (delimiter.startsWith("0x") || delimiter.startsWith("0X")) {
            int radix = 16;
            String digits = delimiter.substring(2, delimiter.length());
            //log.debug("Int value of  delimiter is "+digits);
            newDelimiter = transferDigitsToCharString(radix, digits);
        }

        return newDelimiter;
    }

    /**
     * Auxiliary method called by unescapeDelimiter(). Transforms digits for a 
     * given radix into the equivalent character value.
     * 
     * @param radix        the radix value, e.g. 8 or 16
     * @param digits       a string holding the digits to be transformed
     * @return             a string holiding the equivalent character value
     */
    private static String transferDigitsToCharString(int radix, String digits) {
        if (digits == null) {
            return null;
        }
        Integer integer = Integer.valueOf(digits, radix);
        int inter = integer.intValue();
        //log.debug("The decimal value of char is "+ inter);
        char charactor = (char) inter;
        String newDelimiter = Character.toString(charactor);
        //log.debug("The new delimter is "+newDelimiter);
        return newDelimiter;
    }

    /*
     * Instance methods
     */

    /**
     * Method to set up data stream as source
     * 
     * @param dataStream InputStream
     */
    public void setInputStream(InputStream dataStream) {
        this.dataReader = new InputStreamReader(dataStream);
    }

    /**
     * Method to set collapseDelimiters boolean value.
     * 
     * @param collapseDelimiters  if true, consecutive delimiters are collapsed
     *                            into a single delimiter
     */
    public void setCollapseDelimiters(boolean collapseDelimiters) {
        this.collapseDelimiters = collapseDelimiters;
    }

    /**
     * Set up the footer line number.
     * 
     * @param numFooterLines
     */
    public void setNumFooterLines(int numFooterLines) {
        this.numFooterLines = numFooterLines;
    }

    /**
     * Set quote character for this reader
     * @param quoteCharacter  the quote chacater value
     */
    public void setQuoteCharacter(String quoteCharacter) {
        this.quoteCharacter = quoteCharacter;
    }

    /**
     * Set literal character for this reader
     * @param literalCharacter  the literal character value
     */
    public void setLiteralCharacter(String literalCharacter) {
        this.literalCharacter = literalCharacter;
    }

    /**
     * This method is used when data source is an input stream.
     * Reads one row from the input stream and returns a data vector where element 
     * is String and the value is field data. After reaching the end of stream, 
     * empty vector will be returned. So this method can be iterated by a while 
     * loop until a empty vector is hit. During the iteration, all data in the 
     * stream will be pulled out.
     * 
     * @return Vector
     */
    public Vector getOneRowDataVector() throws Exception {
        //System.out.println("the numFootLines is "+numFooterLines);
        if (!initializedFooterBuffer) {
            for (int i = 0; i < numFooterLines; i++) {
                //System.out.println("the initialize with footer lines");
                String rowData = readOneRowDataString();
                //System.out.println("the data vector in initailize is "+rowData.toString());
                footerBuffer.add(rowData);
            }

            // this is for no footer lines
            if (numFooterLines == 0) {
                //System.out.println("the initialize without footer lines");
                String rowData = readOneRowDataString();
                //System.out.println("The initial buffere vector is "+rowData.toString());
                footerBuffer.add(rowData);
            }

            initializedFooterBuffer = true;
        }

        String nextRowData = readOneRowDataString();
        //System.out.println("the row string data from next row "+nextRowData.toString());
        String oneRowDataString = null;
        Vector oneRowDataVector = new Vector();

        if (nextRowData != null) {
            //System.out.println("before nextRowData is empty and nextRowData is "+nextRowData.toString());
            oneRowDataString = (String) footerBuffer.remove(0);
            reIndexFooterBufferVector();
            footerBuffer.add(nextRowData);
        } else if (numFooterLines == 0 && !footerBuffer.isEmpty()) {
            //System.out.println("find the last line in fottlines num is 0!!!!!!!!");
            oneRowDataString = (String) footerBuffer.remove(0);
        }

        //System.out.println("helere!!!");
        if (oneRowDataString != null) {
            //log.debug("in dataReader is not null");
            oneRowDataVector = splitDelimitedRowStringIntoVector(oneRowDataString);
        }
        //System.out.println("the row data from buffer "+oneRowDataVector.toString());
        return oneRowDataVector;
    }

    /*
     * This method will read a row data from vector. It
     * discard the header lines. but it doesn't discard footer lines
     * This method will be called by getRowDataVectorFromStream
     * 
     * @return   A string holding one row of data.
     */
    private String readOneRowDataString() {
        //Vector oneRowDataVector = new Vector();
        StringBuffer rowData = new StringBuffer();
        String rowDataString = null;
        int singleCharactor; // = -2;

        if (dataReader != null) {
            //log.debug("in dataReader is not null");
            try {
                //read the first character to start things off  
                singleCharactor = dataReader.read();
                while (singleCharactor != -1) {
                    //log.debug("singleCharactor is not EOF");
                    char charactor = (char) singleCharactor;
                    rowData.append(charactor);

                    // find string - line ending in the row data   
                    if (rowData.indexOf(lineEnding) != -1) {
                        //log.debug("found line ending");
                        // strip the header lines
                        if (stripHeader && numHeaderLines > 0 && headLineNumberCount < numHeaderLines) {
                            // reset string buffer(discard the header line)
                            rowData = null;
                            rowData = new StringBuffer();

                        } else {
                            rowDataString = rowData.toString();
                            //log.debug("The row data is " + rowDataString);
                            break;
                        }

                        headLineNumberCount++;
                    }
                    //read the next character before looping back
                    singleCharactor = dataReader.read();
                }
            } catch (Exception e) {
                //log.debug("Couldn't read data from input stream");
                e.printStackTrace();
                rowData = new StringBuffer();
            }
        }

        //if we have data for the row, then return it
        if (rowData != null && rowData.length() > 0) {
            rowDataString = rowData.toString();
        }
        //System.out.println("the row data before return is "+rowDataString);
        return rowDataString;
    }

    /*
     * This method will forward one index for every element, 1 -> 0, 2->1
     */
    private void reIndexFooterBufferVector() {
        for (int i = 0; i < numFooterLines - 2; i++) {
            Vector element = (Vector) footerBuffer.elementAt(i + 1);
            footerBuffer.add(i, element);
        }
    }

    /*
     * This method will read a delimitered string and put a delimitered part into
     * an element in a vector. If the vector size is less than the column number
     * empty string will be added.
     * 
     */
    private Vector splitDelimitedRowStringIntoVector(String data) throws Exception {
        Vector result = new Vector();

        if (data == null) {
            return result;
        }

        String[] s = null;
        if (quoteCharacter == null && literalCharacter == null) {
            //In this path, there is no quote character, we can spit data directly
            if (!collapseDelimiters) {
                s = data.split(delimiter);
            } else {
                String newDelimiterWithRegExpress = delimiter + "+";
                s = data.split(newDelimiterWithRegExpress);

            }
        } else {
            //In this path, we should skip any field delimiter in quote charcter.
            s = processQuoteCharacterOneRowData(data);
        }

        if (s != null) {
            int columnCounter = s.length;

            if (columnCounter > numCols) {
                throw new DataNotMatchingMetadataException(
                        "Metadata sees data has " + numCols + " columns, but actually data has " + columnCounter
                                + " columns. " + "Please make sure metadata is correct!");
            }

            for (int j = 0; j < s.length; j++) {

                if (s[j] != null) {
                    result.addElement(s[j].trim());
                } else {
                    result.addElement("");
                }
            }

            //add any elements that aren't there so that all the records have the
            //same number of cols
            if (result.size() < numCols) {
                int vsize = result.size();
                for (int j = 0; j < numCols - vsize; j++) {
                    result.addElement("");
                }
            }
        }

        return result;
    }

    /*
     * In oneRowData, there are quote character in it. Any field delimiter in the
     * quotes should be skiped.
     */
    private String[] processQuoteCharacterOneRowData(String oneRowData) throws Exception {
        String[] elements = null;
        Vector elementsVector = new Vector();
        if (oneRowData == null) {
            return elements;
        }
        quoteCharacter = transferQuoteCharacter(quoteCharacter);
        char quote = '#';
        boolean quoted = false;
        if (quoteCharacter != null) {
            quoted = true;
            quote = quoteCharacter.charAt(0);
        }
        char literal = '/';
        boolean literaled = false;
        if (literalCharacter != null) {
            literaled = true;
            literal = literalCharacter.charAt(0);
        }
        if (literaled && literalCharacter.length() != 1) {
            throw new Exception("Literal Character length should be 1 character in EML");
        }
        char currentChar = '2';
        StringBuffer fieldData = new StringBuffer();
        int length = oneRowData.length();
        int priviousDelimiterIndex = -2;
        int currentDelimiterIndex = -2;
        int delimiterLength = delimiter.length();
        boolean startQuote = false;
        boolean delimiterAtEnd = false;
        //this string buffer is only for deleting if hit a delimiter
        StringBuffer delimiterStorage = new StringBuffer(delimiter.length());
        for (int i = 0; i < length; i++) {
            currentChar = oneRowData.charAt(i);
            //System.out.println("current char is "+currentChar);
            fieldData.append(currentChar);
            if (i < delimiterLength) {
                delimiterStorage.append(currentChar);
            } else {
                //delimiterStorage.deleteCharAt(position);
                delimiterStorage = shiftBuffer(delimiterStorage, currentChar);
            }
            //System.out.println("current delimiter storage content is "+delimiterStorage.toString());
            //System.out.println("currnet value in the string buffer is "+fieldData.toString());
            // we should check if there is quoteCharacter in the string.
            if (quoted && currentChar == quote) {
                char previousChar = '1';
                boolean escapingQuote = false;
                // check if this quote is escaped
                if (literaled) {
                    if ((i - 1) >= 0) {
                        previousChar = oneRowData.charAt(i - 1);
                        if (previousChar == literal) {
                            escapingQuote = true;
                            // delette the literal character
                            if (!includeLiteralCharacter) {
                                //if we don't want literal character in the data,
                                //we should delete literal character.
                                int fieldLength = fieldData.length();
                                if ((fieldLength - 1 - 1) >= 0) {
                                    fieldData.deleteCharAt(fieldLength - 1 - 1);
                                }
                            }
                        }
                    }
                }
                if (!escapingQuote) {
                    if (!startQuote) {
                        //System.out.println("start quote");
                        startQuote = true;
                    } else {
                        //System.out.println("end quote");
                        // at end of quote
                        //put string buffers value into vector and reset string buffer
                        startQuote = false;

                    }
                }

            }

            //found a delimiter
            if (delimiterStorage.indexOf(delimiter) != -1 && !startQuote) {

                //check if there is literal character before the delimiter,
                //if does, this we should skip this delmiter
                int indexOfCharBeforeDelimiter = i - delimiterLength;
                boolean escapeDelimiter = false;
                if (literaled && indexOfCharBeforeDelimiter >= 0) {
                    char charBeforeDelimiter = oneRowData.charAt(indexOfCharBeforeDelimiter);
                    ////there is a literal character before delimiter we should skip this demlimiter
                    if (charBeforeDelimiter == literal) {
                        if (!includeLiteralCharacter) {
                            //if we don't want literal character in the data,
                            //we should delete literal character.
                            int fieldLength = fieldData.length();
                            if ((fieldLength - delimiterLength - 1) >= 0) {
                                fieldData.deleteCharAt(fieldLength - delimiterLength - 1);
                            }
                        }
                        escapeDelimiter = true;
                        continue;
                    }
                }

                // check if the delimiter is in the end of the string
                if (i == (length - 1) && !startQuote && !escapeDelimiter) {
                    delimiterAtEnd = true;
                }

                ////here we should treat sequential delimiter as single delimiter
                if (collapseDelimiters) {
                    priviousDelimiterIndex = currentDelimiterIndex;
                    currentDelimiterIndex = i;
                    //there is nothing between two delimiter, should skip it.
                    if ((currentDelimiterIndex - priviousDelimiterIndex) == delimiterLength) {
                        //delete sequnced delimiter
                        fieldData = new StringBuffer();
                        continue;
                    }
                }

                String value = "";
                int delimiterIndex = fieldData.lastIndexOf(delimiter);
                if (delimiterIndex == 0) {
                    //this path means field data on has delimiter, no real data
                    value = "";
                } else {
                    value = fieldData.substring(0, delimiterIndex);

                }
                elementsVector.add(value);
                //reset string buffer fieldData
                fieldData = new StringBuffer();

            }
        }
        // if startQuote is true at the end, which means there is no close quote character in this row,
        // code should throw an exception
        if (startQuote) {
            throw new Exception("There is a un-closed quote in data file");
        }
        // add last field. If this string end of delimiter, we need add a ""
        // else, we need to add the value in string buffer.
        String lastFieldValue = null;
        if (delimiterAtEnd == true) {
            //this path means field data on has delimiter, no real data
            lastFieldValue = "";
        } else {
            lastFieldValue = fieldData.toString();

        }
        elementsVector.add(lastFieldValue);
        //transform vector to string array
        int size = elementsVector.size();
        elements = new String[size];
        for (int i = 0; i < size; i++) {
            elements[i] = (String) elementsVector.elementAt(i);
        }
        return elements;
    }

    /*
     * This method will delete the most left char in the given buffer,
     * and append the new charact at the end. So the buffer size will 
     * keep same
     */
    private static StringBuffer shiftBuffer(StringBuffer buffer, char newChar) {
        StringBuffer newBuffer = new StringBuffer();
        if (buffer == null) {
            return newBuffer;
        }
        int size = buffer.length();
        for (int i = 0; i < size; i++) {
            char oldChar = buffer.charAt(i);
            if (i > 0) {
                newBuffer.append(oldChar);
            }
        }
        newBuffer.append(newChar);
        return newBuffer;
    }

    /*
     * If quote character is specified by hex number, we should transfer it
     * to a character. If quote character is longer than 1 character, it
     * should be throw an exception
     */
    private String transferQuoteCharacter(String quote) throws Exception {
        String newQuote = quote;
        if (newQuote == null) {
            return newQuote;
        } else if (newQuote.startsWith("#") && newQuote.length() > 1) {
            //log.debug("XML entity charactor.");
            String digits = newQuote.substring(1, newQuote.length());
            int radix = 10;
            if (digits.startsWith("x")) {
                //log.debug("Radix is "+ 16);
                radix = 16;
                digits = digits.substring(1, digits.length());
            }
            //log.debug("Int value of  delimiter is "+digits);

            newQuote = transferDigitsToCharString(radix, digits);

        } else if ((newQuote.startsWith("0x") || newQuote.startsWith("0X")) && newQuote.length() > 2) {
            int radix = 16;
            String digits = newQuote.substring(2, newQuote.length());
            //log.debug("Int value of  delimiter is "+digits);
            newQuote = transferDigitsToCharString(radix, digits);
        }

        if (newQuote.length() > 1) {
            throw new Exception("Quote Character length should be 1 character in EML");
        }
        return newQuote;
    }

    /**
     * Returns the data as an array of vectors.  Each vector will have the same
     * number of elements as there are columns in the data.
     * 
     * @param stripHeaderLines true if the header lines should not be included
     *                         in the returned data, false otherwise
     */
    public Vector[] getTokenizedData(boolean stripHeaderLines) {
        if (stripHeaderLines) {
            Vector[] strip = null;

            if (numRecords > numHeaderLines) {
                strip = new Vector[numRecords - numHeaderLines];

                for (int i = numHeaderLines; i < lines.length; i++) {
                    strip[i - numHeaderLines] = lines[i];
                }
            }

            return strip;
        } else {
            return lines;
        }
    }

    /**
     * Returns a string representation of the data.
     * 
     * @return the String representation of the data
     */
    public String toString() {
        StringBuffer sb = new StringBuffer();

        for (int i = 0; i < lines.length; i++) {
            //log.debug("line[" + (i + 1) + "]: " + lines[i].toString());

            for (int j = 0; j < lines[i].size(); j++) {
                sb.append((String) lines[i].elementAt(j));
                if (j != lines[i].size() - 1) {
                    sb.append(" || ");
                }
            }

            sb.append(lineEnding);
        }

        return sb.toString();
    }

}