org.pentaho.di.trans.steps.fixedinput.FixedInput.java Source code

Introduction

Here is the source code for org.pentaho.di.trans.steps.fixedinput.FixedInput.java.

Source

/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.fixedinput;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;

import org.apache.commons.io.FileUtils;
import org.apache.commons.vfs.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Reads a simple fixed-width file and just outputs the fields found in the file...
 *
 * @author Matt
 * @since 2007-07-06
 */
public class FixedInput extends BaseStep implements StepInterface {
    private static Class<?> PKG = FixedInputMeta.class; // for i18n purposes, needed by Translator2!!

    private FixedInputMeta meta;
    private FixedInputData data;

    public FixedInput(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    }

    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        meta = (FixedInputMeta) smi;
        data = (FixedInputData) sdi;

        if (first) {
            first = false;

            data.outputRowMeta = new RowMeta();
            meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore);

            // The conversion logic for when lazy conversion is turned off is simple:
            // pretend it's a lazy conversion object anyway and ask for the native type during conversion.
            //
            data.convertRowMeta = data.outputRowMeta.clone();
            for (ValueMetaInterface valueMeta : data.convertRowMeta.getValueMetaList()) {
                valueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_BINARY_STRING);
            }
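            // Example: with lazy conversion enabled, the raw bytes of a field (say "123")
            // travel down the stream as-is and are only parsed into the native type when a
            // later step needs the value; with it disabled, readOneRow() performs that
            // parse immediately via convertBinaryStringToNativeType().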

            if (meta.isHeaderPresent()) {
                readOneRow(false); // skip the header row, no conversions needed.
            }
        }

        Object[] outputRowData = readOneRow(true);
        if (outputRowData == null) { // no more input to be expected...
            setOutputDone();
            return false;
        }

        putRow(data.outputRowMeta, outputRowData); // copy row to possible alternate rowset(s).

        if (checkFeedback(getLinesInput())) {
            logBasic(BaseMessages.getString(PKG, "FixedInput.Log.LineNumber", Long.toString(getLinesInput())));
        }

        return true;
    }

    /**
     * Read a single row of data from the file...
     *
     * @param doConversions
     *          true to convert the fields to their output data types; set to false for the header row.
     * @return a row of data, or null when there is no more input
     * @throws KettleException in case reading from the file fails
     */
    private Object[] readOneRow(boolean doConversions) throws KettleException {

        try {

            // See if we need to call it a day...
            //
            if (meta.isRunningInParallel()) {
                if (getLinesInput() >= data.rowsToRead) {
                    return null; // We're done. The rest is for the other steps in the cluster
                }
            }

            Object[] outputRowData = RowDataUtil.allocateRowData(data.convertRowMeta.size());
            int outputIndex = 0;

            // The strategy is as follows...
            // We read a block of byte[] from the file.
            //
            // Then we scan that block of data.
            // We keep a byte[] that we extend if needed...
            // At the end of the block we read another, etc.
            //
            // Let's start by looking where we left off reading.
            //
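            // For example, with field widths 4 and 6 (an illustration, not a fixed layout),
            // startBuffer walks 0 -> 4 -> 10 across one record while each field is copied
            // out with System.arraycopy() below.
            //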

            if (data.stopReading) {
                return null;
            }

            FixedFileInputField[] fieldDefinitions = meta.getFieldDefinition();
            for (int i = 0; i < fieldDefinitions.length; i++) {

                int fieldWidth = fieldDefinitions[i].getWidth();
                data.endBuffer = data.startBuffer + fieldWidth;
                if (data.endBuffer > data.bufferSize) {
                    // Oops, we need to read more data...
                    // Better resize this before we read other things in it...
                    //
                    data.resizeByteBuffer();

                    // Also read another chunk of data, now that we have the space for it...
                    // Ignore EOF, there might be other stuff in the buffer.
                    //
                    data.readBufferFromFile();
                }

                // re-verify the buffer after we tried to read extra data from file...
                //
                if (data.endBuffer > data.bufferSize) {
                    // still a problem?
                    // We hit an EOF and are trying to read beyond the EOF...

                    // If we are on the first field and there
                    // is nothing left in the buffer, don't return
                    // a row because we're done.
                    if ((0 == i) && data.bufferSize <= 0) {
                        return null;
                    }

                    // This is the last record of data in the file.
                    data.stopReading = true;

                    // Just take what's left for the current field.
                    fieldWidth = data.bufferSize;
                }
                byte[] field = new byte[fieldWidth];
                System.arraycopy(data.byteBuffer, data.startBuffer, field, 0, fieldWidth);

                if (doConversions) {
                    if (meta.isLazyConversionActive()) {
                        outputRowData[outputIndex++] = field;
                    } else {
                        // We're not lazy, so we convert the data right here and now.
                        // The conversion metadata uses binary storage, so we just ask it for the
                        // native type; that request performs the actual conversion.
                        //
                        ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(outputIndex);
                        outputRowData[outputIndex++] = sourceValueMeta.convertBinaryStringToNativeType(field);
                    }
                } else {
                    outputRowData[outputIndex++] = null; // nothing for the header, no conversions here.
                }

                // OK, onto the next field...
                //
                data.startBuffer = data.endBuffer;
            }

            // Now that we have all the data, see if there are any linefeed characters to remove from the buffer...
            //
            if (meta.isLineFeedPresent()) {

                data.endBuffer += 2; // look ahead over at most two separator bytes (CR+LF)

                if (data.endBuffer >= data.bufferSize) {
                    // Oops, we need to read more data...
                    // Better resize this before we read other things in it...
                    //
                    data.resizeByteBuffer();

                    // Also read another chunk of data, now that we have the space for it...
                    data.readBufferFromFile();
                }

                // CR + Line feed in the worst case.
                //
                if (data.byteBuffer[data.startBuffer] == '\n' || data.byteBuffer[data.startBuffer] == '\r') {

                    data.startBuffer++;

                    if (data.byteBuffer[data.startBuffer] == '\n' || data.byteBuffer[data.startBuffer] == '\r') {

                        data.startBuffer++;
                    }
                }
                data.endBuffer = data.startBuffer;
            }

            incrementLinesInput();
            return outputRowData;
        } catch (Exception e) {
            throw new KettleFileException("Exception reading line using NIO: " + e.toString(), e);
        }

    }

    private FileInputStream getFileInputStream(URL url) throws FileNotFoundException {
        // Note: this assumes the VFS object resolves to a local file: URL.
        return new FileInputStream(FileUtils.toFile(url));
    }

    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (FixedInputMeta) smi;
        data = (FixedInputData) sdi;

        if (super.init(smi, sdi)) {
            try {
                data.preferredBufferSize = Integer.parseInt(environmentSubstitute(meta.getBufferSize()));
                data.lineWidth = Integer.parseInt(environmentSubstitute(meta.getLineWidth()));
                data.filename = environmentSubstitute(meta.getFilename());

                if (Const.isEmpty(data.filename)) {
                    logError(BaseMessages.getString(PKG, "FixedInput.MissingFilename.Message"));
                    return false;
                }

                FileObject fileObject = KettleVFS.getFileObject(data.filename, getTransMeta());
                try {
                    data.fis = getFileInputStream(fileObject.getURL());
                    data.fc = data.fis.getChannel();
                    data.bb = ByteBuffer.allocateDirect(data.preferredBufferSize);
                } catch (IOException e) {
                    logError(e.toString());
                    return false;
                }

                // Add filename to result filenames ?
                if (meta.isAddResultFile()) {
                    ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_GENERAL, fileObject,
                            getTransMeta().getName(), toString());
                    resultFile.setComment("File was read by a Fixed input step");
                    addResultFile(resultFile);
                }

                logBasic("Opened file with name [" + data.filename + "]");

                data.stopReading = false;

                if (meta.isRunningInParallel()) {
                    data.stepNumber = getUniqueStepNrAcrossSlaves();
                    data.totalNumberOfSteps = getUniqueStepCountAcrossSlaves();
                    data.fileSize = fileObject.getContent().getSize();
                }

                // OK, now we need to skip a number of bytes in case we're doing a parallel read.
                //
                if (meta.isRunningInParallel()) {

                    // The width of one record, including the line separator bytes.
                    int totalLineWidth = data.lineWidth + meta.getLineSeparatorLength();
                    long nrRows = data.fileSize / totalLineWidth;
                    // Divide the rows evenly over the step copies: this copy starts after
                    // rowsToSkip rows and the next copy starts after nextRowsToSkip rows.
                    long rowsToSkip = Math.round(data.stepNumber * nrRows / (double) data.totalNumberOfSteps);
                    long nextRowsToSkip = Math.round((data.stepNumber + 1) * nrRows / (double) data.totalNumberOfSteps);
                    data.rowsToRead = nextRowsToSkip - rowsToSkip;
                    long bytesToSkip = rowsToSkip * totalLineWidth;
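
                    // Worked example (assumed numbers): a 100,000 byte file with 100 byte
                    // records gives nrRows = 1,000. With 3 step copies, copy 0 skips 0 rows
                    // and reads 333, copy 1 skips 333 and reads 334, and copy 2 skips 667
                    // and reads 333, so every row is read exactly once.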

                    logBasic("Step #" + data.stepNumber + " is skipping " + bytesToSkip
                            + " to position in file, then it's reading " + data.rowsToRead + " rows.");

                    data.fc.position(bytesToSkip);
                }

                return true;
            } catch (Exception e) {
                logError("Error opening file '" + meta.getFilename() + "'", e);
            }
        }
        return false;
    }

    @Override
    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {

        try {
            if (data.fc != null) {
                data.fc.close();
            }
            if (data.fis != null) {
                data.fis.close();
            }
        } catch (IOException e) {
            logError("Unable to close file channel for file '" + meta.getFilename() + "' : " + e.toString());
            logError(Const.getStackTracker(e));
        }

        super.dispose(smi, sdi);
    }

}
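
Example

To make the buffer walking in readOneRow() concrete, here is a minimal, PDI-free sketch of the same fixed-width slicing and CR/LF skipping. The class name FixedWidthSketch, the sample data, and the field widths (4 and 6) are illustrative assumptions, not part of the Pentaho code.

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class FixedWidthSketch {

    // Slice one record into fields of the given widths, starting at offset.
    static List<String> sliceRecord(byte[] buffer, int offset, int[] widths) {
        List<String> fields = new ArrayList<>();
        int start = offset;
        for (int width : widths) {
            // Clamp to the buffer end, mirroring the EOF handling in readOneRow().
            int end = Math.min(start + width, buffer.length);
            fields.add(new String(buffer, start, end - start, StandardCharsets.UTF_8));
            start = end;
        }
        return fields;
    }

    public static void main(String[] args) {
        // Two records of widths 4 + 6, each followed by CR+LF.
        byte[] data = "2024Hello!\r\n2025World!\r\n".getBytes(StandardCharsets.UTF_8);
        int[] widths = { 4, 6 };
        int pos = 0;
        while (pos < data.length) {
            System.out.println(sliceRecord(data, pos, widths)); // e.g. [2024, Hello!]
            for (int width : widths) {
                pos += width;
            }
            // Skip at most two separator bytes (CR and/or LF), as the step does.
            for (int i = 0; i < 2 && pos < data.length
                    && (data[pos] == '\n' || data[pos] == '\r'); i++) {
                pos++;
            }
        }
    }
}

Like the step, the sketch clamps the last field to whatever bytes remain, so a truncated final record still yields a (shorter) row instead of an exception.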