Java tutorial
/* * Copyright (c) 2003-2010 The Regents of the University of California. * All rights reserved. * * '$Author: welker $' * '$Date: 2010-05-05 22:21:26 -0700 (Wed, 05 May 2010) $' * '$Revision: 24234 $' * * Permission is hereby granted, without written agreement and without * license or royalty fees, to use, copy, modify, and distribute this * software and its documentation for any purpose, provided that the above * copyright notice and the following two paragraphs appear in all copies * of this software. * * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, * ENHANCEMENTS, OR MODIFICATIONS. * */ package org.kepler.objectmanager.data.text; import java.io.InputStream; import java.util.Vector; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.kepler.objectmanager.data.db.Attribute; import org.kepler.objectmanager.data.db.Entity; import org.kepler.util.DelimitedReader; /** * @author tao This class will read a data inputstream and split them into a row * vectors base on the given ComplexDataFormat array. This class have a * public method - getRowVector.After reach the end of stream, empty * vector will be returned. So this method can be iterated by a while * loop until a empty vector hited. During the iteration, every data in * the stream will be pulled out. */ public class TextComplexFormatDataReader { private InputStream dataStream = null; private Entity entity = null; private boolean stripHeader = true; private int numberOfAttirbute = 0; private TextComplexDataFormat[] formats = null; private String physicalLineDelimiter = null; private int numberOfHeaderLines = 0; private int physicalLineDelimiterLength = 0; private int headLineNumberCount = 0; private static Log log; static { log = LogFactory.getLog("org.kepler.objectmanager.data.text.TextComplexFormatDataReader"); } // constants public static final String DEFAULTVALUE = ""; /** * Consturctor with default stripHeader value - true * * @param dataStream * @param entity */ public TextComplexFormatDataReader(InputStream dataStream, Entity entity) throws Exception { this(dataStream, entity, true); } /** * Constructor with assigned stripHeader value * * @param dataStream * the data input stream * @param enity * the entity metadata to describe the data stream * @param stripHeader * if strip header when we hand read the input stream */ public TextComplexFormatDataReader(InputStream dataStream, Entity entity, boolean stripHeader) throws Exception { if (dataStream == null || entity == null) { throw new Exception("Data inputstream or entity metadata is null"); } this.dataStream = dataStream; this.entity = entity; this.stripHeader = stripHeader; getParameterFromEntity(); } /* * Method to set up other parameter will be used in the reader. Sucha as * numberOfArributes, physicalLineDelimiter. */ private void getParameterFromEntity() throws Exception { Attribute[] attributeList = entity.getAttributes(); if (attributeList == null) { throw new Exception("Attribute in entity metadata is null"); } else { numberOfAttirbute = attributeList.length; } numberOfHeaderLines = entity.getNumHeaderLines(); if (numberOfHeaderLines == -1) { numberOfHeaderLines = 0; } // physicalLineDelmiter will get from physicalDelimiter elements // if no physicalDelimiter element, we will try record delimter physicalLineDelimiter = entity.getPhysicalLineDelimiter(); if (physicalLineDelimiter == null) { physicalLineDelimiter = entity.getRecordDelimiter(); } physicalLineDelimiter = DelimitedReader.unescapeDelimiter(physicalLineDelimiter); physicalLineDelimiterLength = physicalLineDelimiter.length(); formats = entity.getDataFormatArray(); if (formats == null) { throw new Exception("Complext format is null in metadata entity"); } else { int length = formats.length; if (length != numberOfAttirbute) { throw new Exception("Complex formats should have same number as attribute number"); } } } /** * This method will read one row from inputstream and return a data vector * which element is String and the value is field data. After reach the end * of stream, empty vector will be returned. So this method can be iterated * by a while loop until a empty vector hited. During the iteration, every * data in the stream will be pulled out. * * @return Vector */ public Vector getRowDataVectorFromStream() throws Exception { Vector oneRowDataVector = new Vector(); StringBuffer lineDelimiterBuffer = new StringBuffer();// to store // delmiter StringBuffer fieldValueBuffer = new StringBuffer(); int singleCharactor = -2; int columnCount = 1;// this is for every character in one row int attributeCount = 0; // this is for every attribute boolean startNewAttribute = true; boolean isWidthFix = true; int width = -1; int widthCount = 0; boolean startWidthCount = false; int startColumnNumberFromFormat = -1; String fieldDelimiter = null; if (dataStream != null) { singleCharactor = dataStream.read(); while (singleCharactor != -1) { char charactor = (char) singleCharactor; // strip header if (stripHeader && numberOfHeaderLines > 0 && headLineNumberCount < numberOfHeaderLines) { lineDelimiterBuffer.append(charactor); if (lineDelimiterBuffer.length() == physicalLineDelimiterLength && lineDelimiterBuffer.toString().equals(physicalLineDelimiter)) { // reset the delimiter buffer lineDelimiterBuffer = new StringBuffer(); headLineNumberCount++; } else if (lineDelimiterBuffer.length() == physicalLineDelimiterLength) { // reset the delimiter buffer lineDelimiterBuffer = new StringBuffer(); } } else { // handle data after strip header fieldValueBuffer.append(charactor); lineDelimiterBuffer.append(charactor); // set up format info if (startNewAttribute) { startNewAttribute = false; // find the format from array TextComplexDataFormat format = formats[attributeCount]; if (format == null) { throw new Exception("The text format is null for an attribute"); } else if (format instanceof TextWidthFixedDataFormat) { TextWidthFixedDataFormat widthFormat = (TextWidthFixedDataFormat) format; width = widthFormat.getFieldWidth(); startColumnNumberFromFormat = widthFormat.getFieldStartColumn(); isWidthFix = true; startWidthCount = false; } else if (format instanceof TextDelimitedDataFormat) { TextDelimitedDataFormat delimitedFormat = (TextDelimitedDataFormat) format; fieldDelimiter = delimitedFormat.getFieldDelimiter(); isWidthFix = false; } } if (isWidthFix) { // find start cloumn if metadata specify it if (startColumnNumberFromFormat != -1 && startColumnNumberFromFormat == columnCount) { fieldValueBuffer = new StringBuffer(); fieldValueBuffer.append(charactor); startWidthCount = true; } else if (startColumnNumberFromFormat == -1) { startWidthCount = true; } // start count width if (startWidthCount) { widthCount++; } // we got the value when widthcount reach width of this // format if (widthCount == width) { String value = fieldValueBuffer.toString(); log.debug("Add width fixed attribute value " + value + " to the vector"); oneRowDataVector.add(value.trim()); widthCount = 0; startWidthCount = false; fieldValueBuffer = new StringBuffer(); startNewAttribute = true; attributeCount++; } } else { // for delimter data if (fieldValueBuffer.toString().endsWith(fieldDelimiter)) { String value = fieldValueBuffer.toString(); value = value.substring(0, value.length() - fieldDelimiter.length()); log.debug("Add delimited attribute value " + value + " to the vector"); oneRowDataVector.add(value.trim()); fieldValueBuffer = new StringBuffer(); startNewAttribute = true; attributeCount++; } } columnCount++; // reset columnCount to 1 when hit a physical line delimiter if (lineDelimiterBuffer.length() == physicalLineDelimiterLength && lineDelimiterBuffer.toString().equals(physicalLineDelimiter)) { // reset the delimiter buffer lineDelimiterBuffer = new StringBuffer(); columnCount = 1; } else if (lineDelimiterBuffer.length() == physicalLineDelimiterLength) { // reset the delimiter buffer lineDelimiterBuffer = new StringBuffer(); } // get a row vector break it. if (attributeCount == numberOfAttirbute) { break; } } singleCharactor = dataStream.read(); } } // if row vector is not empty and its length less than number of // attribute, // we should add "" string to make its' length equals attribute length; if (!oneRowDataVector.isEmpty() && oneRowDataVector.size() < numberOfAttirbute) { int size = oneRowDataVector.size(); for (int i = size; i < numberOfAttirbute; i++) { oneRowDataVector.add(DEFAULTVALUE); } } return oneRowDataVector; } /** * @return Returns the dataStream. */ public InputStream getDataStream() { return dataStream; } /** * @param dataStream * The dataStream to set. */ public void setDataStream(InputStream dataStream) { this.dataStream = dataStream; } /** * @return Returns the entity. */ public Entity getEntity() { return entity; } /** * @param entity * The entity to set. */ public void setEntity(Entity entity) { this.entity = entity; } }