Source code for com.panet.imeta.trans.steps.sort.SortRows.java

Java tutorial

Introduction

Below is the complete source code for com.panet.imeta.trans.steps.sort.SortRows.java.

Source

/* Copyright (c) 2007 Pentaho Corporation.  All rights reserved. 
* This software was developed by Pentaho Corporation and is provided under the terms 
* of the GNU Lesser General Public License, Version 2.1. You may not use 
* this file except in compliance with the license. If you need a copy of the license, 
* please go to http://www.gnu.org/licenses/lgpl-2.1.txt. The Original Code is Pentaho 
* Data Integration.  The Initial Developer is Pentaho Corporation.
*
* Software distributed under the GNU Lesser Public License is distributed on an "AS IS" 
* basis, WITHOUT WARRANTY OF ANY KIND, either express or  implied. Please refer to 
* the license for the specific language governing your rights and limitations.*/

package com.panet.imeta.trans.steps.sort;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.vfs.FileObject;

import com.panet.imeta.core.Const;
import com.panet.imeta.core.exception.KettleException;
import com.panet.imeta.core.exception.KettleFileException;
import com.panet.imeta.core.exception.KettleValueException;
import com.panet.imeta.core.row.RowMetaInterface;
import com.panet.imeta.core.row.ValueMetaInterface;
import com.panet.imeta.core.vfs.KettleVFS;
import com.panet.imeta.trans.Trans;
import com.panet.imeta.trans.TransMeta;
import com.panet.imeta.trans.step.BaseStep;
import com.panet.imeta.trans.step.StepDataInterface;
import com.panet.imeta.trans.step.StepInterface;
import com.panet.imeta.trans.step.StepMeta;
import com.panet.imeta.trans.step.StepMetaInterface;

/**
 * Sort the rows in the input-streams based on certain criteria
 * 
 * @author Matt
 * @since 29-apr-2003
 */
public class SortRows extends BaseStep implements StepInterface {
    private SortRowsMeta meta;
    private SortRowsData data;

    public SortRows(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);

        meta = (SortRowsMeta) getStepMeta().getStepMetaInterface();
        data = (SortRowsData) stepDataInterface;
    }

    /**
     * Add one row to the in-memory sort buffer, spilling the buffer to a sorted
     * temp-file when it fills up or when free memory runs low.
     *
     * @param rowMeta metadata of the incoming row (used to convert binary-string keys)
     * @param r the row to add, or null to signal that the input is exhausted
     * @return true (reserved for signalling errors to the caller)
     * @throws KettleException when writing a temp-file fails
     */
    private boolean addBuffer(RowMetaInterface rowMeta, Object[] r) throws KettleException {
        if (r != null) {
            // Convert binary-string key fields to their native type so key
            // comparisons during sorting work on real values.
            for (int i = 0; i < data.fieldnrs.length; i++) {
                if (data.convertKeysToNative[i]) {
                    int index = data.fieldnrs[i];
                    r[index] = rowMeta.getValueMeta(index).convertBinaryStringToNativeType((byte[]) r[index]);
                }
            }

            // Save row
            data.buffer.add(r);
        }
        if (data.files.size() == 0 && r == null) // No more records and nothing spilled: sort in memory only
        {
            quickSort(data.buffer);
        }

        // Check the free memory every 1000 rows...
        //
        data.freeCounter++;
        if (data.sortSize <= 0 && data.freeCounter >= 1000) {
            data.freeMemoryPct = Const.getPercentageFreeMemory();
            data.freeCounter = 0;

            if (log.isDetailed()) {
                data.memoryReporting++;
                if (data.memoryReporting >= 10) {
                    logDetailed("Available memory : " + data.freeMemoryPct + "%");
                    data.memoryReporting = 0;
                }
            }
        }

        // Time to sort & spill when:
        // - the buffer is full (configured sort size reached), or
        // - the input is exhausted and earlier spills force a final merge pass, or
        // - free memory dropped below the configured limit and we have enough rows.
        boolean doSort = data.buffer.size() == data.sortSize;
        doSort |= data.files.size() > 0 && r == null && data.buffer.size() > 0;
        doSort |= data.freeMemoryPctLimit > 0 && data.freeMemoryPct < data.freeMemoryPctLimit
                && data.buffer.size() >= data.minSortSize;

        if (doSort) {
            sortAndSpillBuffer();
        }

        return true;
    }

    /**
     * Sort the current in-memory buffer and write it to a new temp-file,
     * optionally dropping duplicate-key rows first. Clears the buffer and
     * resets the read index on success.
     *
     * @throws KettleException when the temp-file cannot be created or written
     */
    private void sortAndSpillBuffer() throws KettleException {
        // First sort the rows in the buffer.
        quickSort(data.buffer);

        // Then write them to disk...
        OutputStream outputStream = null;
        DataOutputStream dos = null;

        try {
            FileObject fileObject = KettleVFS.createTempFile(meta.getPrefix(), ".tmp",
                    environmentSubstitute(meta.getDirectory()));

            data.files.add(fileObject); // Remember the files!
            outputStream = KettleVFS.getOutputStream(fileObject, false);
            if (data.compressFiles) {
                dos = new DataOutputStream(new GZIPOutputStream(new BufferedOutputStream(outputStream)));
            } else {
                dos = new DataOutputStream(new BufferedOutputStream(outputStream, 500000));
            }

            // Drop consecutive duplicate-key rows before writing, if requested.
            if (meta.isOnlyPassingUniqueRows()) {
                removeDuplicateRows();
            }

            // Remember how many rows went into this temp-file.
            data.bufferSizes.add(data.buffer.size());

            // Just write the data, nothing else.
            for (int p = 0; p < data.buffer.size(); p++) {
                data.outputRowMeta.writeData(dos, data.buffer.get(p));
            }

            if (data.sortSize < 0) {
                if (data.buffer.size() > data.minSortSize) {
                    data.minSortSize = data.buffer.size(); // if we did it once, we can do it again.

                    // Memory usage goes up over time, even with garbage collection
                    // We need pointers, file handles, etc.
                    // As such, we're going to lower the min sort size a bit
                    //
                    data.minSortSize = (int) Math.round((double) data.minSortSize * 0.90);
                }
            }

            // Clear the list
            data.buffer.clear();

            // Close the temp-file: closing the outermost stream flushes and
            // closes the whole chain (gzip trailer included). Null the refs so
            // the finally block knows cleanup already happened.
            dos.close();
            dos = null;
            outputStream = null;

            // How much memory do we have left?
            //
            data.freeMemoryPct = Const.getPercentageFreeMemory();
            data.freeCounter = 0;
            if (data.sortSize <= 0) {
                if (log.isDetailed())
                    logDetailed("Available memory : " + data.freeMemoryPct + "%");
            }

        } catch (Exception e) {
            throw new KettleException("Error processing temp-file!", e);
        } finally {
            // Bug fix: previously the streams leaked if writing threw. Closing
            // dos closes the whole chain; if stream construction failed halfway,
            // close the raw file stream directly. Best-effort: failures here are
            // logged, not rethrown, so they don't mask the original exception.
            if (dos != null) {
                try {
                    dos.close();
                } catch (IOException e) {
                    logError("Unable to close temp-file stream: " + e.toString());
                }
            } else if (outputStream != null) {
                try {
                    outputStream.close();
                } catch (IOException e) {
                    logError("Unable to close temp-file stream: " + e.toString());
                }
            }
        }

        data.getBufferIndex = 0;
    }

    /**
     * Remove rows whose key fields equal those of the preceding row.
     * The buffer must already be sorted on the key fields. Single linear pass
     * (the previous in-place ArrayList.remove() version was O(n^2)).
     *
     * @throws KettleValueException when key comparison fails
     */
    private void removeDuplicateRows() throws KettleValueException {
        List<Object[]> unique = new ArrayList<Object[]>(data.buffer.size());
        Object[] previousRow = null;
        for (Object[] row : data.buffer) {
            if (previousRow != null && data.outputRowMeta.compare(row, previousRow, data.fieldnrs) == 0) {
                // remove this duplicate element as requested
                if (log.isRowLevel())
                    logRowlevel("Duplicate row removed: " + data.outputRowMeta.getString(row));
            } else {
                unique.add(row);
            }
            previousRow = row;
        }
        data.buffer = unique;
    }

    /**
     * Return the next row in sorted order, or null when all rows are consumed.
     * If nothing was spilled, rows come straight from the in-memory buffer;
     * otherwise this performs a k-way merge over the sorted temp-files,
     * lazily opening all of them on the first call.
     *
     * @throws KettleValueException on row comparison/read errors
     */
    private Object[] getBuffer() throws KettleValueException {
        Object[] retval;

        // Open all files at once and read one row from each file...
        if (data.files.size() > 0 && (data.dis.size() == 0 || data.fis.size() == 0)) {
            if (log.isBasic())
                logBasic("Opening " + data.files.size() + " tmp-files...");

            try {
                for (int f = 0; f < data.files.size() && !isStopped(); f++) {
                    FileObject fileObject = (FileObject) data.files.get(f);
                    String filename = KettleVFS.getFilename(fileObject);
                    if (log.isDetailed())
                        logDetailed("Opening tmp-file: [" + filename + "]");
                    InputStream fi = KettleVFS.getInputStream(fileObject);
                    DataInputStream di;
                    data.fis.add(fi);
                    if (data.compressFiles) {
                        GZIPInputStream gzfi = new GZIPInputStream(new BufferedInputStream(fi));
                        di = new DataInputStream(gzfi);
                        data.gzis.add(gzfi);
                    } else {
                        di = new DataInputStream(new BufferedInputStream(fi, 50000));
                    }
                    data.dis.add(di);

                    // How many rows were written to this temp-file?
                    int buffersize = data.bufferSizes.get(f);

                    if (log.isDetailed())
                        logDetailed("[" + filename + "] expecting " + buffersize + " rows...");

                    if (buffersize > 0) {
                        Object[] row = (Object[]) data.outputRowMeta.readData(di);
                        data.rowbuffer.add(row); // new row from input stream
                        data.tempRows.add(new RowTempFile(row, f));
                    }
                }

                // Sort the head rows so the smallest one is first in the merge list.
                Collections.sort(data.tempRows, data.comparator);
            } catch (Exception e) {
                // NOTE(review): errors here are logged but not propagated; the merge
                // then continues with whatever files opened successfully.
                logError("Error reading back tmp-files : " + e.toString());
                logError(Const.getStackTracker(e));
            }
        }

        if (data.files.size() == 0) {
            // Everything fit in memory: serve rows straight from the sorted buffer.
            if (data.getBufferIndex < data.buffer.size()) {
                retval = (Object[]) data.buffer.get(data.getBufferIndex);
                data.getBufferIndex++;
            } else {
                retval = null;
            }
        } else {
            if (data.rowbuffer.size() == 0) {
                retval = null;
            } else {
                // We now have one head row waiting per open file: which one is the smallest?
                //
                if (log.isRowLevel()) {
                    for (int i = 0; i < data.rowbuffer.size() && !isStopped(); i++) {
                        Object[] b = (Object[]) data.rowbuffer.get(i);
                        logRowlevel("--BR#" + i + ": " + data.outputRowMeta.getString(b));
                    }
                }

                RowTempFile rowTempFile = data.tempRows.remove(0);
                retval = rowTempFile.row;
                int smallest = rowTempFile.fileNumber;

                // now get another Row for position smallest

                FileObject file = (FileObject) data.files.get(smallest);
                DataInputStream di = (DataInputStream) data.dis.get(smallest);
                InputStream fi = (InputStream) data.fis.get(smallest);
                GZIPInputStream gzfi = (data.compressFiles) ? (GZIPInputStream) data.gzis.get(smallest) : null;

                try {
                    Object[] row2 = (Object[]) data.outputRowMeta.readData(di);
                    RowTempFile extra = new RowTempFile(row2, smallest);

                    // Keep tempRows sorted: binarySearch returns (-insertionPoint - 1)
                    // when the key is absent.
                    int index = Collections.binarySearch(data.tempRows, extra, data.comparator);
                    if (index < 0) {
                        data.tempRows.add(index * (-1) - 1, extra);
                    } else {
                        data.tempRows.add(index, extra);
                    }
                } catch (KettleFileException fe) // empty file or EOF mostly
                {
                    // This file is exhausted: close it, delete it and drop it
                    // from the merge bookkeeping.
                    try {
                        di.close();
                        fi.close();
                        if (gzfi != null)
                            gzfi.close();
                        file.delete();
                    } catch (IOException e) {
                        logError("Unable to close/delete file #" + smallest + " --> " + file.toString());
                        setErrors(1);
                        stopAll();
                        return null;
                    }

                    data.files.remove(smallest);
                    data.dis.remove(smallest);
                    data.fis.remove(smallest);

                    if (gzfi != null)
                        data.gzis.remove(smallest);

                    // Also update all file numbers in data.tempRows if they are larger than smallest.
                    //
                    for (RowTempFile rtf : data.tempRows) {
                        if (rtf.fileNumber > smallest)
                            rtf.fileNumber--;
                    }

                } catch (SocketTimeoutException e) {
                    throw new KettleValueException(e); // should never happen on local files
                }
            }
        }
        return retval;
    }

    /**
     * Buffer incoming rows; once the input is exhausted, stream all rows out
     * in sorted order (optionally dropping duplicate-key rows).
     */
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        Object[] r = getRow(); // get row from rowset, wait for our turn, indicate busy!

        // Initialize on the first row: resolve key field indexes and output metadata.
        if (first && r != null) {
            first = false;
            data.convertKeysToNative = new boolean[meta.getFieldName().length];
            data.fieldnrs = new int[meta.getFieldName().length];
            for (int i = 0; i < meta.getFieldName().length; i++) {
                data.fieldnrs[i] = getInputRowMeta().indexOfValue(meta.getFieldName()[i]);
                if (data.fieldnrs[i] < 0) {
                    logError("Sort field [" + meta.getFieldName()[i] + "] not found!");
                    setOutputDone();
                    return false;
                }
                data.convertKeysToNative[i] = getInputRowMeta().getValueMeta(data.fieldnrs[i])
                        .isStorageBinaryString();
            }

            // Metadata
            data.outputRowMeta = getInputRowMeta().clone();
            meta.getFields(data.outputRowMeta, getStepname(), null, null, this);
        }

        boolean err = addBuffer(getInputRowMeta(), r);
        if (!err) {
            setOutputDone(); // signal receiver we're finished.
            return false;
        }

        if (r == null) // no more input to be expected...
        {
            // Now we can start the output!
            r = getBuffer();
            Object[] previousRow = null;
            while (r != null && !isStopped()) {
                if (log.isRowLevel())
                    logRowlevel("Read row: " + getInputRowMeta().getString(r));

                // Do another verification pass for unique rows: duplicates can
                // still meet at the boundaries between merged temp-files.
                //
                if (meta.isOnlyPassingUniqueRows()) {
                    if (previousRow != null) {
                        // See if this row is the same as the previous one as far as the keys are concerned.
                        // If so, we don't put forward this row.
                        int result = data.outputRowMeta.compare(r, previousRow, data.fieldnrs);
                        if (result != 0) {
                            putRow(data.outputRowMeta, r); // copy row to possible alternate rowset(s).
                        }
                    } else {
                        putRow(data.outputRowMeta, r); // copy row to possible alternate rowset(s).
                    }
                    previousRow = r;
                } else {
                    putRow(data.outputRowMeta, r); // copy row to possible alternate rowset(s).
                }

                r = getBuffer();
            }

            setOutputDone(); // signal receiver we're finished.
            return false;
        }

        if (checkFeedback(getLinesRead())) {
            if (log.isBasic())
                logBasic("Linenr " + getLinesRead());
        }

        return true;
    }

    /**
     * Initialize the step: resolve the sort-size / free-memory-limit settings,
     * allocate the buffers and build the merge comparator.
     */
    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (SortRowsMeta) smi;
        data = (SortRowsData) sdi;

        if (super.init(smi, sdi)) {
            data.sortSize = Const.toInt(environmentSubstitute(meta.getSortSize()), -1);
            data.freeMemoryPctLimit = Const.toInt(meta.getFreeMemoryLimit(), -1);
            if (data.sortSize <= 0 && data.freeMemoryPctLimit <= 0) {
                // Prefer the memory limit as it should never fail
                //
                data.freeMemoryPctLimit = 25;
            }

            if (data.sortSize > 0) {
                data.buffer = new ArrayList<Object[]>(data.sortSize);
            } else {
                data.buffer = new ArrayList<Object[]>(5000);
            }

            data.compressFiles = getBooleanValueOfVariable(meta.getCompressFilesVariable(),
                    meta.getCompressFiles());

            // Orders merge entries by their row keys; comparison errors are
            // logged and treated as equal so the merge can continue.
            data.comparator = new Comparator<RowTempFile>() {
                public int compare(RowTempFile o1, RowTempFile o2) {
                    try {
                        return data.outputRowMeta.compare(o1.row, o2.row, data.fieldnrs);
                    } catch (KettleValueException e) {
                        logError("Error comparing rows: " + e.toString());
                        return 0;
                    }
                }
            };

            if (data.sortSize > 0) {
                data.rowbuffer = new ArrayList<Object[]>(data.sortSize);
            } else {
                data.rowbuffer = new ArrayList<Object[]>();
            }
            data.tempRows = new ArrayList<RowTempFile>();

            data.minSortSize = 5000;

            return true;
        }
        return false;
    }

    /** 
     * Sort the entire list on the configured key fields, if it is not empty.
     *
     * @param elements the rows to sort in place
     */
    public void quickSort(List<Object[]> elements) {
        if (log.isDetailed())
            logDetailed("Starting quickSort algorithm...");
        if (elements.size() > 0) {
            // Comparison errors are logged and treated as equal so the sort finishes.
            Collections.sort(elements, new Comparator<Object[]>() {
                public int compare(Object[] r1, Object[] r2) {
                    try {
                        return data.outputRowMeta.compare(r1, r2, data.fieldnrs);
                    } catch (KettleValueException e) {
                        logError("Error comparing rows: " + e.toString());
                        return 0;
                    }
                }
            });
            // Report (and reset) how many lazy binary-string conversions this sort triggered.
            long nrConversions = 0L;
            for (ValueMetaInterface valueMeta : data.outputRowMeta.getValueMetaList()) {
                nrConversions += valueMeta.getNumberOfBinaryStringConversions();
                valueMeta.setNumberOfBinaryStringConversions(0L);
            }
            if (log.isDetailed())
                logDetailed("The number of binary string to data type conversions done in this sort block is "
                        + nrConversions);
        }
        if (log.isDetailed())
            logDetailed("QuickSort algorithm has finished.");
    }

    //
    // Run is where the action happens!
    public void run() {
        BaseStep.runStepThread(this, meta, data);
    }
}