Java tutorial: CsvInput, a Pentaho Kettle (PDI) step that reads delimited text files
/*
 * Copyright (c) 2007 Pentaho Corporation. All rights reserved.
 * This software was developed by Pentaho Corporation and is provided under the terms
 * of the GNU Lesser General Public License, Version 2.1. You may not use
 * this file except in compliance with the license. If you need a copy of the license,
 * please go to http://www.gnu.org/licenses/lgpl-2.1.txt. The Original Code is Pentaho
 * Data Integration. The Initial Developer is Pentaho Corporation.
 *
 * Software distributed under the GNU Lesser Public License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. Please refer to
 * the license for the specific language governing your rights and limitations.
 */
package com.huawei.unibi.molap.csvreaderstep;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.vfs.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleConversionException;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.logging.LogChannelInterface;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.steps.textfileinput.EncodingType;

import com.huawei.iweb.platform.logging.LogService;
import com.huawei.iweb.platform.logging.LogServiceFactory;
import com.huawei.unibi.molap.constants.MolapCommonConstants;
import com.huawei.unibi.molap.datastorage.store.impl.FileFactory;
import com.huawei.unibi.molap.util.MolapDataProcessorLogEvent;

/**
 * Read a simple CSV file.
 * Just output Strings found in the file...
 *
 * @author Matt
 * @since 2007-07-05
 */
public class CsvInput extends BaseStep implements StepInterface {
  private static final Class<?> PKG = CsvInput.class; // for i18n purposes, needed by Translator2!! $NON-NLS-1$

  private static final LogService LOGGER =
      LogServiceFactory.getLogService(CsvInput.class.getName());

  private CsvInputMeta meta;

  private CsvInputData data;

  public CsvInput(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr,
      TransMeta transMeta, Trans trans) {
    super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    LOGGER.info(MolapDataProcessorLogEvent.UNIBI_MOLAPDATAPROCESSOR_MSG,
        "** Using csv file **");
    //System.out.println("****************** Using my csv file");
  }
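  /*
   * Note (added for this tutorial): Kettle drives a step by calling processRow()
   * in a loop. Returning true means "call me again"; returning false (after
   * setOutputDone()) means the step has produced all of its rows. The inherited
   * "first" flag is true only on the very first call, which is why all one-time
   * initialisation below lives inside the if (first) block.
   */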
  public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    meta = (CsvInputMeta) smi;
    data = (CsvInputData) sdi;

    if (first) {
      first = false;

      data.outputRowMeta = new RowMeta();
      meta.getFields(data.outputRowMeta, getStepname(), null, null, this);

      if (data.filenames == null) {
        // We're expecting the list of filenames from the previous step(s)...
        //
        getFilenamesFromPreviousSteps();
      }

      // We only run in parallel if we have at least one file to process
      // AND if we have more than one step copy running...
      //
      data.parallel = meta.isRunningInParallel() && data.totalNumberOfSteps > 1;

      // The conversion logic for when the lazy conversion is turned off is simple:
      // Pretend it's a lazy conversion object anyway and get the native type during conversion.
      //
      data.convertRowMeta = data.outputRowMeta.clone();
      for (ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList()) {
        valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_BINARY_STRING);
      }

      // Calculate the indexes for the filename and row number fields
      //
      data.filenameFieldIndex = -1;
      if (!Const.isEmpty(meta.getFilenameField()) && meta.isIncludingFilename()) {
        data.filenameFieldIndex = meta.getInputFields().length;
      }

      data.rownumFieldIndex = -1;
      if (!Const.isEmpty(meta.getRowNumField())) {
        data.rownumFieldIndex = meta.getInputFields().length;
        if (data.filenameFieldIndex >= 0) {
          data.rownumFieldIndex++;
        }
      }

      // Now handle the parallel reading aspect: determine the total of all the file sizes,
      // then skip to the appropriate file and location in the file to start reading,
      // and skip to right after the first newline.
      //
      if (data.parallel) {
        prepareToRunInParallel();
      }

      // Open the next file...
      //
      if (!openNextFile()) {
        setOutputDone();
        return false; // nothing to see here, move along...
      }
    }

    // If we are running in parallel, make sure we don't read too much in this step copy...
    //
    // if (data.parallel) {
    //   if (data.totalBytesRead > data.blockToRead) {
    //     setOutputDone(); // stop reading
    //     return false;
    //   }
    // }

    try {
      Object[] outputRowData = readOneRow(true); // get row, set busy!
      if (outputRowData == null) // no more input to be expected...
      {
        if (openNextFile()) {
          return true; // try again on the next loop...
        } else {
          setOutputDone(); // last file, end here
          return false;
        }
      } else {
        incrementLinesRead();
        putRow(data.outputRowMeta, outputRowData); // copy row to possible alternate rowset(s).
        verifyRejectionRates();
        if (checkFeedback(getLinesInput())) {
          if (log.isBasic()) {
            logBasic(BaseMessages.getString(PKG, "CsvInput.Log.LineNumber", //$NON-NLS-1$
                Long.toString(getLinesInput())));
          }
        }
      }
    } catch (KettleConversionException e) {
      if (getStepMeta().isDoingErrorHandling()) {
        StringBuffer errorDescriptions = new StringBuffer(100);
        StringBuffer errorFields = new StringBuffer(50);
        for (int i = 0; i < e.getCauses().size(); i++) {
          if (i > 0) {
            errorDescriptions.append(", "); //$NON-NLS-1$
            errorFields.append(", "); //$NON-NLS-1$
          }
          errorDescriptions.append(e.getCauses().get(i).getMessage());
          errorFields.append(e.getFields().get(i).toStringMeta());
        }
        putError(data.outputRowMeta, e.getRowData(), e.getCauses().size(),
            errorDescriptions.toString(), errorFields.toString(), "CSVINPUT001"); //$NON-NLS-1$
      } else {
        // Only forward the first cause.
        //
        throw new KettleException(e.getMessage(), e.getCauses().get(0));
      }
    }

    return true;
  }
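  /*
   * Illustrative sketch (added, not part of the original step): what the "lazy
   * conversion" set-up above amounts to. Every field travels as the raw bytes
   * read from the file, and a value meta whose storage type is BINARY_STRING
   * decodes those bytes only when a native value is actually needed. The field
   * name, sample bytes and storage metadata below are invented for the example;
   * the exact ValueMeta configuration may vary between PDI versions.
   */
  @SuppressWarnings("unused")
  private static Object lazyConversionExample() throws KettleValueException {
    org.pentaho.di.core.row.ValueMeta ageMeta =
        new org.pentaho.di.core.row.ValueMeta("age", ValueMetaInterface.TYPE_INTEGER);
    ageMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_BINARY_STRING);
    // The storage metadata describes how the raw bytes are encoded (a plain String here).
    ageMeta.setStorageMetadata(
        new org.pentaho.di.core.row.ValueMeta("age", ValueMetaInterface.TYPE_STRING));

    byte[] raw = "42".getBytes(Charset.defaultCharset()); // bytes as read from the file
    return ageMeta.convertBinaryStringToNativeType(raw);  // Long.valueOf(42L)
  }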
  private void prepareToRunInParallel() throws KettleException {
    // try {
    //   // At this point it doesn't matter if we have 1 or more files.
    //   // We'll use the same algorithm...
    //   //
    //   for (String filename : data.filenames) {
    //     long size = KettleVFS.getFileObject(filename, getTransMeta()).getContent().getSize();
    //     data.fileSizes.add(size);
    //     data.totalFileSize += size;
    //   }
    //
    //   // Now we can determine the range to read.
    //   //
    //   // For example, the total file size is 50000, spread over 5 files of 10000
    //   // Suppose we have 2 step copies running (clustered or not)
    //   // That means step 0 has to read 0-24999 and step 1 has to read 25000-49999
    //   //
    //   // The size of the block to read (25000 in the example) :
    //   //
    //   data.blockToRead = Math.round( (double)data.totalFileSize / (double)data.totalNumberOfSteps );
    //
    //   // Now we calculate the position to read (0 and 25000 in our sample) :
    //   //
    //   data.startPosition = data.blockToRead * data.stepNumber;
    //   data.endPosition = data.startPosition + data.blockToRead;
    //
    //   // Determine the start file number (0 or 2 in our sample) :
    //   // >0<,1000,>2000<,3000,4000
    //   //
    //   long totalFileSize = 0L;
    //   for (int i = 0; i < data.fileSizes.size(); i++) {
    //     long size = data.fileSizes.get(i);
    //
    //     // Start of file range: totalFileSize
    //     // End of file range: totalFileSize+size
    //     //
    //     if (data.startPosition >= totalFileSize && data.startPosition < totalFileSize + size) {
    //       // This is the file number to start reading from...
    //       //
    //       data.filenr = i;
    //
    //       // remember where we started to read, to allow us to know that we have to
    //       // skip the header row in the next files (if any)
    //       //
    //       data.startFilenr = i;
    //
    //       // How many bytes do we skip in that first file?
    //       //
    //       if (data.startPosition == 0) {
    //         data.bytesToSkipInFirstFile = 0L;
    //       } else {
    //         data.bytesToSkipInFirstFile = data.startPosition - totalFileSize;
    //       }
    //
    //       break;
    //     }
    //     totalFileSize += size;
    //   }
    //
    //   if (data.filenames.length > 0)
    //     logBasic(BaseMessages.getString(PKG, "CsvInput.Log.ParallelFileNrAndPositionFeedback",
    //         data.filenames[data.filenr], Long.toString(data.fileSizes.get(data.filenr)),
    //         Long.toString(data.bytesToSkipInFirstFile), Long.toString(data.blockToRead))); //$NON-NLS-1$
    // }
    // catch(Exception e) {
    //   throw new KettleException(BaseMessages.getString(PKG, "CsvInput.Exception.ErrorPreparingParallelRun"), e); //$NON-NLS-1$
    // }
  }
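  /*
   * Illustrative sketch (added, not called anywhere): the arithmetic behind the
   * commented-out parallel preparation above, using the numbers from its own
   * example - five files of 10,000 bytes read by two step copies. All names
   * here are local to the example.
   */
  @SuppressWarnings("unused")
  private static void parallelRangeExample() {
    long[] fileSizes = { 10000L, 10000L, 10000L, 10000L, 10000L };
    long totalFileSize = 50000L;
    int totalNumberOfSteps = 2; // two step copies
    int stepNumber = 1;         // we are the second copy

    long blockToRead = Math.round((double) totalFileSize / (double) totalNumberOfSteps); // 25000
    long startPosition = blockToRead * stepNumber;  // 25000
    long endPosition = startPosition + blockToRead; // 50000

    // Walk the files until we find the one containing startPosition:
    // byte 25000 falls in file 2 (bytes 20000-29999), 5000 bytes in.
    long runningTotal = 0L;
    for (int i = 0; i < fileSizes.length; i++) {
      if (startPosition >= runningTotal && startPosition < runningTotal + fileSizes[i]) {
        long filenr = i;                                            // 2
        long bytesToSkipInFirstFile = startPosition - runningTotal; // 5000
        break;
      }
      runningTotal += fileSizes[i];
    }
  }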
  private void getFilenamesFromPreviousSteps() throws KettleException {
    List<String> filenames = new ArrayList<String>(MolapCommonConstants.CONSTANT_SIZE_TEN);
    boolean firstRow = true;
    int index = -1;
    Object[] row = getRow();
    while (row != null) {
      if (firstRow) {
        firstRow = false;

        // Get the filename field index...
        //
        String filenameField = environmentSubstitute(meta.getFilenameField());
        index = getInputRowMeta().indexOfValue(filenameField);
        if (index < 0) {
          throw new KettleException(BaseMessages.getString(PKG,
              "CsvInput.Exception.FilenameFieldNotFound", filenameField)); //$NON-NLS-1$
        }
      }

      String filename = getInputRowMeta().getString(row, index);
      filenames.add(filename); // add it to the list...

      row = getRow(); // Grab another row...
    }

    data.filenames = filenames.toArray(new String[filenames.size()]);

    logBasic(BaseMessages.getString(PKG, "CsvInput.Log.ReadingFromNrFiles", //$NON-NLS-1$
        Integer.toString(data.filenames.length)));
  }

  @Override
  public void dispose(StepMetaInterface smi, StepDataInterface sdi) {
    try {
      // Close the previous file...
      //
      if (data.bufferedInputStream != null) {
        data.bufferedInputStream.close();
      }
    } catch (Exception e) {
      logError("Error closing file channel", e);
    }

    // try {
    //   if (data.fis != null) {
    //     data.fis.close();
    //   }
    // } catch (Exception e) {
    //   logError("Error closing file input stream", e);
    // }

    super.dispose(smi, sdi);
  }

  protected boolean openNextFile() throws KettleException {
    try {
      // Close the previous file...
      //
      if (data.bufferedInputStream != null) {
        data.bufferedInputStream.close();
      }
      // if (data.fis != null) {
      //   data.fis.close();
      // }

      if (data.filenr >= data.filenames.length) {
        return false;
      }

      // Open the next one...
      //
      FileObject fileObject = KettleVFS.getFileObject(data.filenames[data.filenr], getTransMeta());

      // if (!(fileObject instanceof LocalFile)) {
      //   // We can only use NIO on local files at the moment, so that's what we limit ourselves to.
      //   //
      //   throw new KettleException(BaseMessages.getString(PKG, "CsvInput.Log.OnlyLocalFilesAreSupported")); //$NON-NLS-1$
      // }

      if (meta.isLazyConversionActive()) {
        data.binaryFilename = data.filenames[data.filenr].getBytes(Charset.defaultCharset());
      }

      initializeFileReader(fileObject);

      // If we are running in parallel and we need to skip bytes in the first file, let's do so here.
      //
      // if (data.parallel) {
      //   if (data.bytesToSkipInFirstFile > 0) {
      //     data.fc.position(data.bytesToSkipInFirstFile);
      //
      //     // Now, we need to skip the first row, until the first CR that is.
      //     //
      //     readOneRow(false);
      //   }
      // }

      // Add filename to result filenames?
      if (meta.isAddResultFile()) {
        ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_GENERAL, fileObject,
            getTransMeta().getName(), toString());
        resultFile.setComment("File was read by a Csv input step");
        addResultFile(resultFile);
      }

      // Move to the next filename
      //
      data.filenr++;

      // See if we need to skip a row...
      // - If you have a header row checked and if you're not running in parallel
      // - If you're running in parallel, if a header row is checked, if you're at the beginning of a file
      //
      if (meta.isHeaderPresent()) {
        if ((!data.parallel) || // Standard flat file : skip header
            (data.parallel && data.bytesToSkipInFirstFile <= 0)) {
          readHeader(); // skip this row.
          logBasic(BaseMessages.getString(PKG, "CsvInput.Log.HeaderRowSkipped", //$NON-NLS-1$
              data.filenames[data.filenr - 1]));
        }
      }

      // Reset the row number pointer...
      //
      data.rowNumber = 1L;

      // Don't skip again in the next file...
      //
      data.bytesToSkipInFirstFile = -1L;

      return true;
    } catch (KettleException e) {
      throw e;
    } catch (Exception e) {
      throw new KettleException(e);
    }
  }

  public void readHeader() throws KettleException {
    readOneRow(false);
  }

  protected void initializeFileReader(FileObject fileObject) throws IOException {
    // data.fis = new FileInputStream(KettleVFS.getFilename(fileObject));
    // data.fc = data.fis.getChannel();
    // data.bb = ByteBuffer.allocateDirect( data.preferredBufferSize );
    String filePath = KettleVFS.getFilename(fileObject);
    data.bufferedInputStream = FileFactory.getDataInputStream(KettleVFS.getFilename(fileObject),
        FileFactory.getFileType(filePath), data.preferredBufferSize);
    // data.bufferedInputStream = new BufferedInputStream(data.fis, data.preferredBufferSize);
  }
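  /*
   * Illustrative sketch (added, not used by the step): the grow-and-refill
   * pattern that checkBufferSize() below relies on, shown against a plain
   * InputStream. In the real step, data.resizeByteBufferArray() grows the
   * scratch buffer and data.readBufferFromFile() appends the next chunk from
   * the file; this stand-alone version is an assumption of how they behave,
   * not their actual code.
   */
  @SuppressWarnings("unused")
  private static byte[] growAndRefillExample(java.io.InputStream in, byte[] buffer, int bytesUsed)
      throws IOException {
    // Grow the buffer so there is room for another chunk...
    byte[] bigger = new byte[buffer.length * 2];
    System.arraycopy(buffer, 0, bigger, 0, bytesUsed);

    // ...then read the next chunk right after the existing bytes.
    int n = in.read(bigger, bytesUsed, bigger.length - bytesUsed);

    // n < 0 means end of file: the same condition checkBufferSize() reports
    // by returning true ("stop, nothing more to read").
    return n < 0 ? null : bigger;
  }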
  /**
   * Check to see if the buffer size is large enough given the data.endBuffer pointer.<br>
   * Resize the buffer if there is not enough room.
   *
   * @return false if everything is OK, true if there is a problem and we should stop.
   * @throws IOException in case there is an I/O problem (read error)
   */
  private boolean checkBufferSize() throws IOException {
    if (data.endBuffer >= data.bufferSize) {
      // Oops, we need to read more data...
      // Better resize this before we read other things in it...
      //
      data.resizeByteBufferArray();

      // Also read another chunk of data, now that we have the space for it...
      //
      int n = data.readBufferFromFile();

      // If we didn't manage to read something, we return true to indicate we're done
      //
      return n < 0;
    }
    return false;
  }

  /*
  private boolean isReturn(byte[] source, int location) {
    switch (data.encodingType) {
      case SINGLE:
        return source[location] == '\n';
      case DOUBLE_BIG_ENDIAN:
        if (location >= 1) {
          return source[location - 1] == 0 && source[location] == 0x0d;
        } else {
          return false;
        }
      case DOUBLE_LITTLE_ENDIAN:
        if (location >= 1) {
          return source[location - 1] == 0x0d && source[location] == 0x00;
        } else {
          return false;
        }
      default:
        return source[location] == '\n';
    }
  }

  private boolean isLineFeed(byte[] source, int location) {
    switch (data.encodingType) {
      case SINGLE:
        return source[location] == '\r';
      case DOUBLE_BIG_ENDIAN:
        if (location >= 1) {
          return source[location - 1] == 0 && source[location] == 0x0a;
        } else {
          return false;
        }
      case DOUBLE_LITTLE_ENDIAN:
        if (location >= 1) {
          return source[location - 1] == 0x0a && source[location] == 0x00;
        } else {
          return false;
        }
      default:
        return source[location] == '\r';
    }
  }
  */
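  /*
   * Note (added): the commented-out matchers above show why the encoding matters
   * when scanning for line ends. In single-byte encodings '\r' and '\n' are one
   * byte each (0x0D, 0x0A); in UTF-16BE they become 0x00 0x0D and 0x00 0x0A, and
   * in UTF-16LE they become 0x0D 0x00 and 0x0A 0x00. The live code delegates this
   * to data.crLfMatcher, which init() selects per encoding.
   */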
  /**
   * Read a single row of data from the file...
   *
   * @param doConversions if you want to do conversions, set to false for the header row.
   * @return a row of data...
   * @throws KettleException
   */
  private Object[] readOneRow(boolean doConversions) throws KettleException {
    try {
      Object[] outputRowData =
          RowDataUtil.allocateRowData(data.outputRowMeta.size() - RowDataUtil.OVER_ALLOCATE_SIZE);
      int outputIndex = 0;
      boolean newLineFound = false;
      boolean endOfBuffer = false;
      int newLines = 0;
      List<Exception> conversionExceptions = null;
      List<ValueMetaInterface> exceptionFields = null;

      // The strategy is as follows...
      // We read a block of byte[] from the file.
      // We scan for the separators in the file (NOT for line feeds etc.)
      // Then we scan that block of data.
      // We keep a byte[] that we extend if needed...
      // At the end of the block we read another, etc.
      //
      // Let's start by looking where we left off reading.
      //
      while (!newLineFound && outputIndex < meta.getInputFields().length) {
        /*
        if (getLinesInput() == 5445) {
          System.out.println("Break!");
        }
        */

        if (checkBufferSize() && outputRowData != null) {
          // Last row was being discarded if the last item is null and
          // there is no end of line delimiter
          //if (outputRowData != null) {
          // Make certain that at least one record exists before
          // filling the rest of them with null
          if (outputIndex > 0) {
            return (outputRowData);
          }
          // }

          return null; // nothing more to read, call it a day.
        }

        // OK, at this point we should have data in the byteBuffer and we should be able
        // to scan for the next delimiter (;).
        // So let's look for a delimiter.
        // Also skip over the enclosures ("); this does NOT take escaped enclosures into account.
        // Later we can add an option for having escaped or double enclosures in the file. <sigh>
        //
        boolean delimiterFound = false;
        boolean enclosureFound = false;
        int escapedEnclosureFound = 0;
        while (!delimiterFound) {
          // If we find the first char, we might find others as well ;-)
          // Single byte delimiters only for now.
          //
          if (data.delimiterMatcher.matchesPattern(data.byteBuffer, data.endBuffer, data.delimiter)) {
            delimiterFound = true;
          }
          // Perhaps we found a (premature) new line?
          //
          else if (
              // In case we are not using an enclosure and in case fields contain new lines
              // we need to make sure that we check the newlines possible flag.
              // If the flag is enabled we skip newline checking except for the last field in the row.
              // In that one we can't support newlines without enclosure (handled below).
              //
              (!meta.isNewlinePossibleInFields() || outputIndex == meta.getInputFields().length - 1)
                  && (data.crLfMatcher.isReturn(data.byteBuffer, data.endBuffer)
                      || data.crLfMatcher.isLineFeed(data.byteBuffer, data.endBuffer))) {
            if (data.encodingType.equals(EncodingType.DOUBLE_LITTLE_ENDIAN)
                || data.encodingType.equals(EncodingType.DOUBLE_BIG_ENDIAN)) {
              data.endBuffer += 2;
            } else {
              data.endBuffer++;
            }

            data.totalBytesRead++;
            newLines = 1;

            if (data.endBuffer >= data.bufferSize) {
              // Oops, we need to read more data...
              // Better resize this before we read other things in it...
              //
              data.resizeByteBufferArray();

              // Also read another chunk of data, now that we have the space for it...
              // Ignore EOF, there might be other stuff in the buffer.
              //
              data.readBufferFromFile();
            }

            // re-check for double delimiters...
            if (data.crLfMatcher.isReturn(data.byteBuffer, data.endBuffer)
                || data.crLfMatcher.isLineFeed(data.byteBuffer, data.endBuffer)) {
              data.endBuffer++;
              data.totalBytesRead++;
              newLines = 2;
              if (data.endBuffer >= data.bufferSize) {
                // Oops, we need to read more data...
                // Better resize this before we read other things in it...
                //
                data.resizeByteBufferArray();

                // Also read another chunk of data, now that we have the space for it...
                // Ignore EOF, there might be other stuff in the buffer.
                //
                data.readBufferFromFile();
              }
            }

            newLineFound = true;
            delimiterFound = true;
          }
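          // Worked example (added): with enclosure " and delimiter ;, the raw field
          //   "He said ""hi""";42
          // is scanned by the branch below as follows: the opening " sets
          // enclosureFound, each "" inside bumps escapedEnclosureFound, and the
          // closing " only ends the field because it is followed by the ; delimiter.
          // The doubled quotes are collapsed to single ones later, in
          // data.removeEscapedEnclosures(), so the value becomes: He said "hi"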
          // Perhaps we need to skip over an enclosed part?
          // We always expect exactly one enclosure character.
          // If we find the enclosure doubled, we consider it escaped.
          // --> "" is converted to " later on.
          //
          else if (data.enclosure != null
              && data.enclosureMatcher.matchesPattern(data.byteBuffer, data.endBuffer, data.enclosure)) {
            enclosureFound = true;
            boolean keepGoing;
            do {
              if (data.increaseEndBuffer()) {
                enclosureFound = false;
                break;
              }
              keepGoing = !data.enclosureMatcher.matchesPattern(data.byteBuffer, data.endBuffer,
                  data.enclosure);
              if (!keepGoing) {
                // We found an enclosure character.
                // Read another byte...
                if (data.increaseEndBuffer()) {
                  enclosureFound = false;
                  break;
                }

                // If this character is also an enclosure, we can consider the enclosure "escaped".
                // As such, if this is an enclosure, we keep going...
                //
                keepGoing = data.enclosureMatcher.matchesPattern(data.byteBuffer, data.endBuffer,
                    data.enclosure);
                if (keepGoing) {
                  escapedEnclosureFound++;
                } else {
                  /**
                   * <pre>
                   * @author m72626
                   * Fix for a customer issue: after the last enclosure there must be either a
                   * field end or a row end, otherwise the enclosure is field content.
                   * Example:
                   * EMPNAME, COMPANY
                   * 'emp'aa','comab'
                   * 'empbb','com'cd'
                   * Here the enclosures after emp (emp') and after com (com') are not the
                   * last enclosures.
                   * </pre>
                   */
                  keepGoing = !(data.delimiterMatcher.matchesPattern(data.byteBuffer, data.endBuffer,
                      data.delimiter)
                      || data.crLfMatcher.isReturn(data.byteBuffer, data.endBuffer)
                      || data.crLfMatcher.isLineFeed(data.byteBuffer, data.endBuffer));
                }
              }
            } while (keepGoing);

            // Did we reach the end of the buffer?
            //
            if (data.endBuffer >= data.bufferSize) {
              newLineFound = true; // consider it a newline to break out of the upper while loop
              newLines += 2; // to remove the enclosures in case of missing newline on last line.
              endOfBuffer = true;
              break;
            }
          } else {
            data.endBuffer++;
            data.totalBytesRead++;

            if (checkBufferSize()) {
              if (data.endBuffer >= data.bufferSize) {
                newLineFound = true;
                break;
              }
            }
          }
        }

        // If we're still here, we found a delimiter...
        // Since the starting point never really changed, we can just grab the range:
        //
        //   [startBuffer - endBuffer[
        //
        // This is the part we want: data.byteBuffer[data.startBuffer]
        //
        int length = calculateFieldLength(newLineFound, newLines, enclosureFound, endOfBuffer);

        byte[] field = new byte[length];
        System.arraycopy(data.byteBuffer, data.startBuffer, field, 0, length);

        // Did we have any escaped characters in there?
        //
        if (escapedEnclosureFound > 0) {
          if (log.isRowLevel()) {
            logRowlevel("Escaped enclosures found in " + new String(field, Charset.defaultCharset()));
          }
          field = data.removeEscapedEnclosures(field, escapedEnclosureFound);
        }

        if (doConversions) {
          if (meta.isLazyConversionActive()) {
            outputRowData[outputIndex++] = field;
          } else {
            // We're not lazy, so we convert the data right here and now.
            // The convert object uses binary storage, so we just have to ask it for
            // the native type; that will do the actual conversion.
            //
            ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(outputIndex);
            try {
              outputRowData[outputIndex++] = sourceValueMeta.convertBinaryStringToNativeType(field);
            } catch (KettleValueException e) {
              // There was a conversion error.
              //
              outputRowData[outputIndex++] = null;

              if (conversionExceptions == null) {
                conversionExceptions = new ArrayList<Exception>(MolapCommonConstants.CONSTANT_SIZE_TEN);
                exceptionFields = new ArrayList<ValueMetaInterface>(MolapCommonConstants.CONSTANT_SIZE_TEN);
              }

              conversionExceptions.add(e);
              exceptionFields.add(sourceValueMeta);
            }
          }
        } else {
          outputRowData[outputIndex++] = null; // nothing for the header, no conversions here.
        }

        // OK, move on to the next field...
        if (!newLineFound) {
          data.endBuffer++;
          data.totalBytesRead++;
        }
        data.startBuffer = data.endBuffer;
      }

      // See if we reached the end of the line.
      // If not, we need to skip the remaining items on the line until the next newline...
      //
      if (!newLineFound && !checkBufferSize()) {
        do {
          data.endBuffer++;
          data.totalBytesRead++;

          if (checkBufferSize()) {
            break; // nothing more to read.
          }

          // HANDLE: if we're using quoting we might be dealing with a very dirty file
          // with quoted newlines in trailing fields. (imagine that)
          // In that particular case we want to use the same logic we use above
          // (refactored a bit) to skip these fields.

        } while (!data.crLfMatcher.isReturn(data.byteBuffer, data.endBuffer)
            && !data.crLfMatcher.isLineFeed(data.byteBuffer, data.endBuffer));

        if (!checkBufferSize()) {
          while (data.crLfMatcher.isReturn(data.byteBuffer, data.endBuffer)
              || data.crLfMatcher.isLineFeed(data.byteBuffer, data.endBuffer)) {
            data.endBuffer++;
            data.totalBytesRead++;

            if (checkBufferSize()) {
              break; // nothing more to read.
            }
          }
        }

        // Make sure we start at the right position the next time around.
        data.startBuffer = data.endBuffer;
      }
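      /*
       * Worked trace (added): for buffer contents "12;ab\r\n34;..." with delimiter ';'
       * and two input fields, the loop above produces the fields "12" and "ab",
       * consumes the CR/LF pair (newLines == 2), and leaves startBuffer == endBuffer
       * pointing at '3', the first byte of the next row.
       */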
      // Optionally add the current filename to the mix as well...
      //
      if (meta.isIncludingFilename() && !Const.isEmpty(meta.getFilenameField())) {
        if (meta.isLazyConversionActive()) {
          outputRowData[data.filenameFieldIndex] = data.binaryFilename;
        } else {
          outputRowData[data.filenameFieldIndex] = data.filenames[data.filenr - 1];
        }
      }

      addRowDetails(outputRowData);

      incrementLinesInput();

      if (conversionExceptions != null && conversionExceptions.size() > 0) {
        // Forward the first exception
        //
        throw new KettleConversionException(
            "There were " + conversionExceptions.size() + " conversion errors on line "
                + getLinesInput(), conversionExceptions, exceptionFields, outputRowData);
      }

      return outputRowData;
    } catch (KettleConversionException e) {
      throw e;
    } catch (Exception e) {
      throw new KettleFileException("Exception reading line using NIO", e);
    }
  }

  protected void addRowDetails(Object[] outputRowData) {
    if (data.isAddingRowNumber) {
      outputRowData[data.rownumFieldIndex] = Long.valueOf(data.rowNumber++); //new Long(data.rowNumber++);
    }
  }

  private int calculateFieldLength(boolean newLineFound, int newLines, boolean enclosureFound,
      boolean endOfBuffer) {
    int length = data.endBuffer - data.startBuffer;
    if (newLineFound) {
      length -= newLines;
      if (length <= 0) {
        length = 0;
      }
      if (endOfBuffer) {
        data.startBuffer++; // offset for the enclosure in last field before EOF
      }
    }
    if (enclosureFound) {
      data.startBuffer++;
      length -= 2;
      if (length <= 0) {
        length = 0;
      }
    }
    if (length <= 0) {
      length = 0;
    }
    if (data.encodingType != EncodingType.SINGLE) {
      length--;
    }
    return length;
  }
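  /*
   * Illustrative sketch (added, not used by the step): why init() below converts
   * the delimiter to bytes in the *file's* encoding (PDI-2489). In UTF-16LE a
   * semicolon occupies two bytes, so comparing against the single byte 0x3B
   * would never match; the multi-byte pattern matchers exist for exactly this case.
   */
  @SuppressWarnings("unused")
  private static byte[] delimiterBytesExample() throws UnsupportedEncodingException {
    return ";".getBytes("UTF-16LE"); // { 0x3B, 0x00 } instead of { 0x3B }
  }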
  public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
    meta = (CsvInputMeta) smi;
    data = (CsvInputData) sdi;

    if (super.init(smi, sdi)) {
      data.preferredBufferSize = Integer.parseInt(environmentSubstitute(meta.getBufferSize()));

      // If the step doesn't have any previous steps, we just get the filename.
      // Otherwise, we'll grab the list of filenames later...
      //
      if (getTransMeta().findNrPrevSteps(getStepMeta()) == 0) {
        String filename = environmentSubstitute(meta.getFilename());

        if (Const.isEmpty(filename)) {
          logError(BaseMessages.getString(PKG, "CsvInput.MissingFilename.Message")); //$NON-NLS-1$
          return false;
        }

        data.filenames = new String[] { filename, };
      } else {
        data.filenames = null;
        data.filenr = 0;
      }

      data.totalBytesRead = 0L;

      data.encodingType = EncodingType.guessEncodingType(meta.getEncoding());

      // PDI-2489 - set the delimiter byte value to the code point of the
      // character as represented in the input file's encoding
      try {
        data.delimiter = data.encodingType.getBytes(environmentSubstitute(meta.getDelimiter()),
            meta.getEncoding());

        if (Const.isEmpty(meta.getEnclosure())) {
          data.enclosure = null;
        } else {
          data.enclosure = data.encodingType.getBytes(environmentSubstitute(meta.getEnclosure()),
              meta.getEncoding());
        }
      } catch (UnsupportedEncodingException e) {
        logError(BaseMessages.getString(PKG, "CsvInput.BadEncoding.Message"), e); //$NON-NLS-1$
        return false;
      }

      data.isAddingRowNumber = !Const.isEmpty(meta.getRowNumField());

      // Handle parallel reading capabilities...
      //
      // data.stopReading = false;

      if (meta.isRunningInParallel()) {
        // data.stepNumber = getUniqueStepNrAcrossSlaves();
        data.totalNumberOfSteps = getUniqueStepCountAcrossSlaves();

        // We are not handling a single file, but possibly a list of files...
        // As such, the fair thing to do is calculate the total size of the files,
        // then read the required block.
        //
        // data.fileSizes = new ArrayList<Long>(MolapCommonConstants.CONSTANT_SIZE_TEN);
        // data.totalFileSize = 0L;
      }

      // Set the most efficient pattern matcher to match the delimiter.
      //
      if (data.delimiter.length == 1) {
        data.delimiterMatcher = new SingleBytePatternMatcher();
      } else {
        data.delimiterMatcher = new MultiBytePatternMatcher();
      }

      // Set the most efficient pattern matcher to match the enclosure.
      //
      if (data.enclosure == null) {
        data.enclosureMatcher = new EmptyPatternMatcher();
      } else {
        if (data.enclosure.length == 1) {
          data.enclosureMatcher = new SingleBytePatternMatcher();
        } else {
          data.enclosureMatcher = new MultiBytePatternMatcher();
        }
      }

      switch (data.encodingType) {
        case DOUBLE_BIG_ENDIAN:
          data.crLfMatcher = new MultiByteBigCrLfMatcher();
          break;
        case DOUBLE_LITTLE_ENDIAN:
          data.crLfMatcher = new MultiByteLittleCrLfMatcher();
          break;
        default:
          data.crLfMatcher = new SingleByteCrLfMatcher();
          break;
      }

      return true;
    }
    return false;
  }

  public void closeFile() throws KettleException {
    try {
      if (data.bufferedInputStream != null) {
        data.bufferedInputStream.close();
      }
      // if (data.fis != null) {
      //   data.fis.close();
      // }
    } catch (IOException e) {
      throw new KettleException(
          "Unable to close file channel for file '" + data.filenames[data.filenr - 1] + "'", e);
    }
  }
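  /*
   * Note (added): init() above picks the cheapest comparator up front - a
   * SingleBytePatternMatcher when the delimiter or enclosure is one byte, a
   * MultiBytePatternMatcher otherwise, and an EmptyPatternMatcher when there is
   * no enclosure at all - so the per-byte scanning loops in readOneRow() never
   * have to re-test the pattern length themselves.
   */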
  /**
   * This method is borrowed from TextFileInput.
   *
   * @param log the logging channel
   * @param line the line to split into fields
   * @param delimiter the field delimiter
   * @param enclosure the enclosure (quote) string, or null
   * @param escapeCharacter the escape character, or null
   * @return the fields found on the line
   * @throws KettleException in case the line can't be converted
   */
  public static final String[] guessStringsFromLine(LogChannelInterface log, String line,
      String delimiter, String enclosure, String escapeCharacter) throws KettleException {
    List<String> strings = new ArrayList<String>(MolapCommonConstants.CONSTANT_SIZE_TEN);
    // int fieldnr;

    String pol; // piece of line

    try {
      if (line == null) {
        return null;
      }

      // Split string in pieces, only for CSV!
      //
      // fieldnr = 0;
      int pos = 0;
      int length = line.length();
      boolean dencl = false;

      int lenEncl = (enclosure == null ? 0 : enclosure.length());
      int lenEsc = (escapeCharacter == null ? 0 : escapeCharacter.length());

      while (pos < length) {
        int from = pos;
        int next;

        boolean enclFound;
        boolean containsEscapedEnclosures = false;
        boolean containsEscapedSeparators = false;

        // Is the field beginning with an enclosure?
        // "aa;aa";123;"aaa-aaa";000;...
        if (lenEncl > 0 && line.substring(from, from + lenEncl).equalsIgnoreCase(enclosure)) {
          if (log.isRowLevel()) {
            log.logRowlevel(BaseMessages.getString(PKG, "CsvInput.Log.ConvertLineToRowTitle"), //$NON-NLS-1$
                BaseMessages.getString(PKG, "CsvInput.Log.ConvertLineToRow", //$NON-NLS-1$
                    line.substring(from, from + lenEncl)));
          }
          enclFound = true;
          int p = from + lenEncl;

          boolean isEnclosure = lenEncl > 0 && p + lenEncl < length
              && line.substring(p, p + lenEncl).equalsIgnoreCase(enclosure);
          boolean isEscape = lenEsc > 0 && p + lenEsc < length
              && line.substring(p, p + lenEsc).equalsIgnoreCase(escapeCharacter);

          boolean enclosureAfter = false;

          // Is it really an enclosure? See if it's not repeated twice or escaped!
          if ((isEnclosure || isEscape) && p < length - 1) {
            String strnext = line.substring(p + lenEncl, p + 2 * lenEncl);
            if (strnext.equalsIgnoreCase(enclosure)) {
              p++;
              enclosureAfter = true;
              dencl = true;

              // Remember to replace them later on!
              if (isEscape) {
                containsEscapedEnclosures = true;
              }
            }
          }

          // Look for a closing enclosure!
          while ((!isEnclosure || enclosureAfter) && p < line.length()) {
            p++;
            enclosureAfter = false;
            isEnclosure = lenEncl > 0 && p + lenEncl < length
                && line.substring(p, p + lenEncl).equals(enclosure);
            isEscape = lenEsc > 0 && p + lenEsc < length
                && line.substring(p, p + lenEsc).equals(escapeCharacter);

            // Is it really an enclosure? See if it's not repeated twice or escaped!
            if ((isEnclosure || isEscape) && p < length - 1) {
              String strnext = line.substring(p + lenEncl, p + 2 * lenEncl);
              if (strnext.equals(enclosure)) {
                p++;
                enclosureAfter = true;
                dencl = true;

                // Remember to replace them later on!
                if (isEscape) {
                  containsEscapedEnclosures = true; // remember
                }
              }
            }
          }

          if (p >= length) {
            next = p;
          } else {
            next = p + lenEncl;
          }

          if (log.isRowLevel()) {
            log.logRowlevel(BaseMessages.getString(PKG, "CsvInput.Log.ConvertLineToRowTitle"), //$NON-NLS-1$
                BaseMessages.getString(PKG, "CsvInput.Log.EndOfEnclosure", "" + p)); //$NON-NLS-1$ //$NON-NLS-2$
          }
        } else {
          enclFound = false;
          boolean found = false;
          int startpoint = from;
          int tries = 1;
          do {
            next = line.indexOf(delimiter, startpoint);

            // See if this position is preceded by an escape character.
            if (lenEsc > 0 && next - lenEsc > 0) {
              String before = line.substring(next - lenEsc, next);

              if (escapeCharacter != null && escapeCharacter.equals(before)) {
                // take the next separator, this one is escaped...
                startpoint = next + 1;
                tries++;
                containsEscapedSeparators = true;
              } else {
                found = true;
              }
            } else {
              found = true;
            }
          } while (!found && next >= 0);
        }
        if (next == -1) {
          next = length;
        }

        if (enclFound) {
          pol = line.substring(from + lenEncl, next - lenEncl);
          if (log.isRowLevel()) {
            log.logRowlevel(BaseMessages.getString(PKG, "CsvInput.Log.ConvertLineToRowTitle"), //$NON-NLS-1$
                BaseMessages.getString(PKG, "CsvInput.Log.EnclosureFieldFound", "" + pol)); //$NON-NLS-1$ //$NON-NLS-2$
          }
        } else {
          pol = line.substring(from, next);
          if (log.isRowLevel()) {
            log.logRowlevel(BaseMessages.getString(PKG, "CsvInput.Log.ConvertLineToRowTitle"), //$NON-NLS-1$
                BaseMessages.getString(PKG, "CsvInput.Log.NormalFieldFound", "" + pol)); //$NON-NLS-1$ //$NON-NLS-2$
          }
        }

        if (dencl) {
          StringBuilder sbpol = new StringBuilder(pol);
          int idx = sbpol.indexOf(enclosure + enclosure);
          while (idx >= 0) {
            sbpol.delete(idx, idx + (enclosure == null ? 0 : enclosure.length()));
            idx = sbpol.indexOf(enclosure + enclosure);
          }
          pol = sbpol.toString();
        }

        // replace the escaped enclosures with enclosures...
        if (containsEscapedEnclosures) {
          String replace = escapeCharacter + enclosure;
          String replaceWith = enclosure;
          pol = Const.replace(pol, replace, replaceWith);
        }

        // replace the escaped separators with separators...
        if (containsEscapedSeparators) {
          String replace = escapeCharacter + delimiter;
          String replaceWith = delimiter;
          pol = Const.replace(pol, replace, replaceWith);
        }

        // Now add pol to the strings found!
        strings.add(pol);

        pos = next + delimiter.length();
        // fieldnr++;
      }
      if (pos == length) {
        if (log.isRowLevel()) {
          log.logRowlevel(BaseMessages.getString(PKG, "CsvInput.Log.ConvertLineToRowTitle"), //$NON-NLS-1$
              BaseMessages.getString(PKG, "CsvInput.Log.EndOfEmptyLineFound")); //$NON-NLS-1$
        }
        strings.add(""); //$NON-NLS-1$
        // fieldnr++;
      }
    } catch (Exception e) {
      throw new KettleException(
          BaseMessages.getString(PKG, "CsvInput.Log.Error.ErrorConvertingLine", e.toString()), e); //$NON-NLS-1$
    }

    return strings.toArray(new String[strings.size()]);
  }

  public boolean isWaitingForData() {
    return true;
  }
}
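/*
 * Usage sketch (added, illustrative): splitting one line with the static helper
 * above. The log channel would come from the caller, e.g. a step's getLogChannel().
 *
 *   String[] fields = CsvInput.guessStringsFromLine(log,
 *       "\"He said \"\"hi\"\"\";42", ";", "\"", null);
 *   // fields[0] -> He said "hi"   (doubled enclosures collapsed)
 *   // fields[1] -> 42
 */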