org.kepler.objectmanager.data.text.TextComplexFormatDataReader.java Source code

Java tutorial

Introduction

Here is the source code for org.kepler.objectmanager.data.text.TextComplexFormatDataReader.java

Source

/*
 * Copyright (c) 2003-2010 The Regents of the University of California.
 * All rights reserved.
 *
 * '$Author: welker $'
 * '$Date: 2010-05-05 22:21:26 -0700 (Wed, 05 May 2010) $' 
 * '$Revision: 24234 $'
 * 
 * Permission is hereby granted, without written agreement and without
 * license or royalty fees, to use, copy, modify, and distribute this
 * software and its documentation for any purpose, provided that the above
 * copyright notice and the following two paragraphs appear in all copies
 * of this software.
 *
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
 * ENHANCEMENTS, OR MODIFICATIONS.
 *
 */

package org.kepler.objectmanager.data.text;

import java.io.InputStream;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.kepler.objectmanager.data.db.Attribute;
import org.kepler.objectmanager.data.db.Entity;
import org.kepler.util.DelimitedReader;

/**
 * Reads a text data input stream and splits it into row vectors based on the
 * {@link TextComplexDataFormat} array supplied by the entity metadata. Each
 * attribute of the entity is described by one format, which is either
 * fixed-width ({@link TextWidthFixedDataFormat}) or delimited
 * ({@link TextDelimitedDataFormat}).
 * <p>
 * The public entry point is {@link #getRowDataVectorFromStream()}. Once the end
 * of the stream is reached an empty vector is returned, so the method can be
 * called in a loop until an empty vector is hit; during that iteration every
 * row in the stream is pulled out.
 *
 * @author tao
 */
public class TextComplexFormatDataReader {
    private static final Log log = LogFactory.getLog(TextComplexFormatDataReader.class);

    // constants
    /** Value used to pad a row that has fewer fields than attributes. */
    public static final String DEFAULTVALUE = "";

    private InputStream dataStream = null;
    private Entity entity = null;
    private boolean stripHeader = true;
    private int numberOfAttirbute = 0;
    private TextComplexDataFormat[] formats = null;
    private String physicalLineDelimiter = null;
    private int numberOfHeaderLines = 0;
    private int physicalLineDelimiterLength = 0;
    // number of header lines already skipped; kept across calls so the header
    // is stripped only once per reader
    private int headLineNumberCount = 0;

    /**
     * Constructor with default stripHeader value - true.
     *
     * @param dataStream
     *            the data input stream
     * @param entity
     *            the entity metadata to describe the data stream
     * @throws Exception
     *             if an argument is null or the entity metadata is incomplete
     */
    public TextComplexFormatDataReader(InputStream dataStream, Entity entity) throws Exception {
        this(dataStream, entity, true);
    }

    /**
     * Constructor with assigned stripHeader value.
     *
     * @param dataStream
     *            the data input stream
     * @param entity
     *            the entity metadata to describe the data stream
     * @param stripHeader
     *            whether the header lines declared by the entity are skipped
     *            when the input stream is read
     * @throws Exception
     *             if an argument is null or the entity metadata is incomplete
     */
    public TextComplexFormatDataReader(InputStream dataStream, Entity entity, boolean stripHeader)
            throws Exception {
        if (dataStream == null || entity == null) {
            throw new Exception("Data inputstream or entity metadata is null");
        }
        this.dataStream = dataStream;
        this.entity = entity;
        this.stripHeader = stripHeader;
        getParameterFromEntity();
    }

    /*
     * Sets up the other parameters used by the reader from the entity
     * metadata: number of attributes, number of header lines, physical line
     * delimiter and the per-attribute formats.
     */
    private void getParameterFromEntity() throws Exception {
        Attribute[] attributeList = entity.getAttributes();
        if (attributeList == null) {
            throw new Exception("Attribute in entity metadata is null");
        }
        numberOfAttirbute = attributeList.length;

        numberOfHeaderLines = entity.getNumHeaderLines();
        if (numberOfHeaderLines == -1) {
            // -1 is treated as "no header lines"
            numberOfHeaderLines = 0;
        }

        // The physical line delimiter comes from the physicalLineDelimiter
        // element; when that is absent the record delimiter is used instead.
        physicalLineDelimiter = entity.getPhysicalLineDelimiter();
        if (physicalLineDelimiter == null) {
            physicalLineDelimiter = entity.getRecordDelimiter();
        }
        physicalLineDelimiter = DelimitedReader.unescapeDelimiter(physicalLineDelimiter);
        if (physicalLineDelimiter == null) {
            // fail with a clear message instead of a bare NullPointerException
            throw new Exception("Physical line delimiter is null in metadata entity");
        }
        physicalLineDelimiterLength = physicalLineDelimiter.length();

        formats = entity.getDataFormatArray();
        if (formats == null) {
            throw new Exception("Complext format is null in metadata entity");
        }
        if (formats.length != numberOfAttirbute) {
            throw new Exception("Complex formats should have same number as attribute number");
        }
    }

    /**
     * Reads one row from the input stream and returns a vector whose elements
     * are the trimmed String values of the row's fields. When at least one
     * field was read but fewer than the number of attributes, the vector is
     * padded with {@link #DEFAULTVALUE}. After the end of the stream is
     * reached an empty vector is returned, so this method can be called in a
     * loop until an empty vector is hit.
     * <p>
     * A final field that is cut short by the end of the stream (for example
     * the last value of a file without a trailing delimiter) is still returned
     * when its trimmed value is not empty.
     *
     * @return Vector of String field values; empty at end of stream
     * @throws Exception
     *             if a format is null for an attribute or reading the stream
     *             fails
     */
    public Vector getRowDataVectorFromStream() throws Exception {
        Vector<String> oneRowDataVector = new Vector<String>();
        if (dataStream == null) {
            return oneRowDataVector;
        }

        // sliding window over the most recent characters, used to spot the
        // physical line delimiter
        StringBuilder lineDelimiterWindow = new StringBuilder();
        StringBuilder fieldValueBuffer = new StringBuilder();
        int columnCount = 1; // 1-based position of the character in the physical line
        int attributeCount = 0; // index of the attribute being read
        boolean startNewAttribute = true;
        boolean isWidthFix = true;
        int width = -1;
        int widthCount = 0;
        boolean startWidthCount = false;
        int startColumnNumberFromFormat = -1;
        String fieldDelimiter = null;

        int singleCharactor = dataStream.read();
        while (singleCharactor != -1) {
            char charactor = (char) singleCharactor;

            if (stripHeader && numberOfHeaderLines > 0 && headLineNumberCount < numberOfHeaderLines) {
                // still inside the header: only count physical lines
                if (endsPhysicalLine(lineDelimiterWindow, charactor)) {
                    headLineNumberCount++;
                }
            } else {
                // handle data after the header
                fieldValueBuffer.append(charactor);
                boolean lineEnded = endsPhysicalLine(lineDelimiterWindow, charactor);

                // load the format information when a new attribute starts
                if (startNewAttribute) {
                    startNewAttribute = false;
                    TextComplexDataFormat format = formats[attributeCount];
                    if (format == null) {
                        throw new Exception("The text format is null for an attribute");
                    } else if (format instanceof TextWidthFixedDataFormat) {
                        TextWidthFixedDataFormat widthFormat = (TextWidthFixedDataFormat) format;
                        width = widthFormat.getFieldWidth();
                        startColumnNumberFromFormat = widthFormat.getFieldStartColumn();
                        isWidthFix = true;
                        startWidthCount = false;
                    } else if (format instanceof TextDelimitedDataFormat) {
                        TextDelimitedDataFormat delimitedFormat = (TextDelimitedDataFormat) format;
                        fieldDelimiter = delimitedFormat.getFieldDelimiter();
                        isWidthFix = false;
                    }
                }

                if (isWidthFix) {
                    if (startColumnNumberFromFormat != -1 && startColumnNumberFromFormat == columnCount) {
                        // the metadata gives a start column and we just
                        // reached it: drop whatever was collected before it
                        fieldValueBuffer.setLength(0);
                        fieldValueBuffer.append(charactor);
                        startWidthCount = true;
                    } else if (startColumnNumberFromFormat == -1) {
                        // no start column: the field starts right away
                        startWidthCount = true;
                    }
                    if (startWidthCount) {
                        widthCount++;
                    }
                    // the field is complete once its declared width is read
                    if (widthCount == width) {
                        String value = fieldValueBuffer.toString();
                        if (log.isDebugEnabled()) {
                            log.debug("Add width fixed attribute value " + value + " to the vector");
                        }
                        oneRowDataVector.add(value.trim());
                        widthCount = 0;
                        startWidthCount = false;
                        fieldValueBuffer.setLength(0);
                        startNewAttribute = true;
                        attributeCount++;
                    }
                } else {
                    // delimited field: complete once the buffer ends with the
                    // field delimiter, which is cut off from the value
                    String collected = fieldValueBuffer.toString();
                    if (collected.endsWith(fieldDelimiter)) {
                        String value = collected.substring(0, collected.length() - fieldDelimiter.length());
                        if (log.isDebugEnabled()) {
                            log.debug("Add delimited attribute value " + value + " to the vector");
                        }
                        oneRowDataVector.add(value.trim());
                        fieldValueBuffer.setLength(0);
                        startNewAttribute = true;
                        attributeCount++;
                    }
                }

                columnCount++;
                if (lineEnded) {
                    // a new physical line starts with the next character
                    columnCount = 1;
                }

                // a whole row has been collected
                if (attributeCount == numberOfAttirbute) {
                    break;
                }
            }
            singleCharactor = dataStream.read();
        }

        // End of stream in the middle of a field: keep the partial value
        // instead of silently dropping it. For a fixed-width field this only
        // applies once width counting has started, so characters in front of
        // the declared start column are not mistaken for data.
        if (attributeCount < numberOfAttirbute && fieldValueBuffer.length() > 0
                && (!isWidthFix || widthCount > 0)) {
            String value = fieldValueBuffer.toString().trim();
            if (value.length() > 0) {
                if (log.isDebugEnabled()) {
                    log.debug("Add unterminated attribute value " + value + " to the vector");
                }
                oneRowDataVector.add(value);
            }
        }

        // If the row is not empty but shorter than the number of attributes,
        // pad it with the default value so its size equals the attribute count.
        if (!oneRowDataVector.isEmpty()) {
            for (int i = oneRowDataVector.size(); i < numberOfAttirbute; i++) {
                oneRowDataVector.add(DEFAULTVALUE);
            }
        }
        return oneRowDataVector;
    }

    /*
     * Feeds one character into the sliding window and reports whether the
     * window now holds the physical line delimiter. The window never grows
     * beyond the delimiter length, so a multi-character delimiter is detected
     * wherever it starts in the stream. On a match the window is cleared so
     * the same characters cannot be matched twice. An empty delimiter never
     * matches.
     */
    private boolean endsPhysicalLine(StringBuilder window, char next) {
        if (physicalLineDelimiterLength == 0) {
            return false;
        }
        window.append(next);
        if (window.length() > physicalLineDelimiterLength) {
            window.deleteCharAt(0);
        }
        if (window.length() == physicalLineDelimiterLength
                && physicalLineDelimiter.contentEquals(window)) {
            window.setLength(0);
            return true;
        }
        return false;
    }

    /**
     * @return Returns the dataStream.
     */
    public InputStream getDataStream() {
        return dataStream;
    }

    /**
     * Replaces the stream to read from. Only the stream reference is changed;
     * the count of header lines already skipped is not reset.
     *
     * @param dataStream
     *            The dataStream to set.
     */
    public void setDataStream(InputStream dataStream) {
        this.dataStream = dataStream;
    }

    /**
     * @return Returns the entity.
     */
    public Entity getEntity() {
        return entity;
    }

    /**
     * Replaces the entity reference. The parsing parameters (formats,
     * delimiter, number of attributes and header lines) were read from the
     * entity given to the constructor and are not refreshed by this method.
     *
     * @param entity
     *            The entity to set.
     */
    public void setEntity(Entity entity) {
        this.entity = entity;
    }
}