org.pentaho.di.trans.steps.cpythonscriptexecutor.CPythonScriptExecutorData.java Source code

Java tutorial

Introduction

Here is the source code for org.pentaho.di.trans.steps.cpythonscriptexecutor.CPythonScriptExecutorData.java

Source

/*******************************************************************************
 * Pentaho Data Science
 * <p/>
 * Copyright (C) 2002-2015 by Pentaho : http://www.pentaho.com
 * <p/>
 * ******************************************************************************
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package org.pentaho.di.trans.steps.cpythonscriptexecutor;

import org.apache.commons.vfs2.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.RowSet;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.LogChannelInterface;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaFactory;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.step.BaseStepData;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.steps.reservoirsampling.ReservoirSamplingData;
import org.pentaho.python.PythonSession;
import org.pentaho.python.SessionException;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

/**
 * Data class for the CPythonScriptExecutor step
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 */
public class CPythonScriptExecutorData extends BaseStepData implements StepDataInterface {

    private static Class<?> PKG = CPythonScriptExecutorMeta.class;

    /**
     * number of rows to randomly generate when trying to determine single pandas data frame output
     */
    protected static final int NUM_RANDOM_ROWS = 100;

    /**
     * The reservoir sampling class does not have "store all rows" behavior when the sample size is -1. Instead, it is
     * disabled entirely when sample size < 0. So to simulate this behavior we use a default size for the -1 case. If this
     * is not sufficient, then the user will have to manually set a size that is large enough. Note that Integer.MAX_VALUE
     * is not used because the reservoir class allocates an array list of size equal to the sample size.
     */
    protected static final int DEFAULT_RESERVOIR_SAMPLING_STORE_ALL_ROWS_SIZE = 100000;

    /**
     * Holds the full output row meta data (including any incoming fields that are copied to the outgoing)
     */
    public RowMetaInterface m_outputRowMeta;

    /**
     * Holds output row meta for fields only generated from script execution
     */
    public RowMetaInterface m_scriptOnlyOutputRowMeta;

    /**
     * Holds the row meta for all incoming fields that are getting copied to the output
     */
    public RowMetaInterface m_incomingFieldsIncludedInOutputRowMeta;

    /**
     * The incoming row sets
     */
    protected List<RowSet> m_incomingRowSets;

    /**
     * The list of processed row sets during the getRow of each incoming input stream
     */
    protected boolean[] m_finishedRowSets;

    /**
     * A collection of the frame buffers per input frame
     */
    protected List<List<Object[]>> m_frameBuffers = new ArrayList<List<Object[]>>();

    /**
     * Holds the reservoir samplers
     */
    protected List<ReservoirSamplingData> m_reservoirSamplers = new ArrayList<ReservoirSamplingData>();

    /**
     * Holds the row meta associated with each frame buffer or reservoir sampler
     */
    protected List<RowMetaInterface> m_infoMetas = new ArrayList<RowMetaInterface>();

    /**
     * A index used to reference a line for the incoming rows when we are processing row by row with reservoir sampling
     * active.
     */
    protected int m_rowByRowReservoirSampleIndex;

    /**
     * Batch size
     */
    protected int m_batchSize = 1000;

    /**
     * Reservoir Samplers size
     */
    protected int m_reservoirSamplersSize;

    /**
     * True if input stream values should be copied to the output stream
     */
    protected boolean m_includeInputAsOutput;

    /**
     * The script to run
     */
    protected String m_script;

    /**
     * Lookup for output indexes
        
     protected Map<String, Integer> m_outputMetaIndexLookup = new HashMap<String, Integer>(); */

    /**
     * Lookup output indexes for just input fields that are being copied to the output
     */
    protected Map<String, Integer> m_nonScriptOutputMetaIndexLookup = new HashMap<String, Integer>();

    /**
     * Variables to retrieve or columns present in pandas data frame that are not defined in the output meta
     */
    protected List<String> m_varsOrColsNotDefinedInOutputMeta = new ArrayList<String>();

    /**
     * Variables or columns defined in the output meta that are not present in the variables to retrieve or columns in the pandas data frame.
     * Script logic (based on input values) could dictate that some variables are not set or dataframe columns not generated for some reason.
     */
    protected List<String> m_varsOrColsInOutputMetaNotPresent = new ArrayList<String>();

    protected boolean m_first = true;

    /**
     * holds any unset variables for a script execution
     */
    protected List<String> m_unsetVars = new ArrayList<String>();

    protected Set<String> m_tmpSet = new HashSet<String>();

    protected static String loadScriptFromFile(String file) throws KettleException {
        FileObject scriptF = KettleVFS.getFileObject(file);

        BufferedReader br = null;
        StringBuilder b = new StringBuilder();
        try {
            if (!scriptF.exists()) {
                throw new KettleException(
                        BaseMessages.getString(PKG, "RScriptExecutorData.Error.ScriptFileDoesNotExist", file));
            }

            InputStream is = KettleVFS.getInputStream(scriptF);
            InputStreamReader isr = new InputStreamReader(is);
            br = new BufferedReader(isr);

            String line = null;
            while ((line = br.readLine()) != null) {
                b.append(line).append("\n");
            }

            br.close();
            br = null;
        } catch (IOException e) {
            throw new KettleException(e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                    br = null;
                } catch (IOException e) {
                    throw new KettleException(e);
                }
            }
        }

        return b.toString();
    }

    /**
     * Initialise a lookup on output indexes of any fields being copied from input to output. User may re-order
     * output fields in the dialog for this step, so we need the lookup.
     */
    public void initNonScriptOutputIndexLookup() {
        for (ValueMetaInterface v : m_incomingFieldsIncludedInOutputRowMeta.getValueMetaList()) {
            int outIndex = m_outputRowMeta.indexOfValue(v.getName());
            m_nonScriptOutputMetaIndexLookup.put(v.getName(), outIndex);
        }
    }

    /**
     * Constructs outgoing rows when a single pandas data frame is being retrieved from python
     *
     * @param session         the session to use
     * @param frameName       the name of the frame to get
     * @param includeRowIndex true if the frame's row index is to be an output field (this can be useful in some cases - e.g.
     *                        using pandas routine to compute quantiles of columns stores the quantile value in the index of
     *                        the resulting data frame)
     * @param log             the log to use
     * @return output rows holding the values from the data frame
     * @throws KettleException if a problem occurs
     */
    public Object[][] constructOutputRowsFromFrame(PythonSession session, String frameName, boolean includeRowIndex,
            LogChannelInterface log) throws KettleException {

        PythonSession.RowMetaAndRows fromPy = session.rowsFromPythonDataFrame(frameName, includeRowIndex);
        RowMetaInterface frameMeta = fromPy.m_rowMeta;
        Object[][] frameRows = fromPy.m_rows;
        Object[][] outputRows = new Object[frameRows.length][];

        if (log.isDetailed()) {
            StringBuilder colsNotDefinedInOutputMeta = new StringBuilder();
            StringBuilder colsInOutputMetaNotPresentInFrame = new StringBuilder();

            for (ValueMetaInterface vm : frameMeta.getValueMetaList()) {
                if (m_outputRowMeta.indexOfValue(vm.getName()) < 0) {
                    colsNotDefinedInOutputMeta.append(vm.getName()).append(" ");
                }
            }

            for (ValueMetaInterface vm : m_outputRowMeta.getValueMetaList()) {
                if (!m_nonScriptOutputMetaIndexLookup.containsKey(vm.getName())
                        && frameMeta.indexOfValue(vm.getName()) < 0) {
                    colsInOutputMetaNotPresentInFrame.append(vm.getName()).append(" ");
                }
            }

            if (colsNotDefinedInOutputMeta.length() > 0) {
                log.logDetailed(BaseMessages.getString(PKG,
                        "CPythonScriptExecutor.Message.VarsOrColsNotDefinedInOutputMeta",
                        colsNotDefinedInOutputMeta.toString()));
            }
            if (colsInOutputMetaNotPresentInFrame.length() > 0) {
                log.logDebug(
                        BaseMessages.getString(PKG, "CPythonScriptExecutor.Message.OutputFieldsNotPresentOrSet",
                                colsInOutputMetaNotPresentInFrame.toString()));
            }
        }

        for (int i = 0; i < frameRows.length; i++) {
            Object[] frameRow = frameRows[i];
            outputRows[i] = RowDataUtil.allocateRowData(m_outputRowMeta.size());
            for (int j = 0; j < frameMeta.size(); j++) {
                ValueMetaInterface vmF = frameMeta.getValueMeta(j);
                int outputIndex = m_outputRowMeta.indexOfValue(vmF.getName());
                if (outputIndex >= 0) {
                    outputRows[i][outputIndex] = frameRow[j];
                }
            }
        }

        return outputRows;
    }

    /**
     * Constructs an outgoing row in the case where more than one variable is being extracted from python. In this
     * case there is just one output row after executing the script. Each outgoing field holds the value (either a
     * string or a serializable) of one of the requested python variables.
     *
     * @param session             the session to use
     * @param varsToGet           the list of variables to extract from python
     * @param continueOnUnsetVars true if we should not complain if a requested variable is not set in python after
     *                            the script executes
     * @param log                 the log to use
     * @return an output row
     * @throws KettleException if a problem occurs
     */
    public Object[] constructOutputRowNonFrame(PythonSession session, List<String> varsToGet,
            boolean continueOnUnsetVars, LogChannelInterface log) throws KettleException {
        Object[] outputRow = RowDataUtil.allocateRowData(m_outputRowMeta.size());

        // check var names against output meta
        if (m_first) {
            // check for vars that are not defined in output meta
            for (String v : varsToGet) {
                // if ( !m_outputMetaIndexLookup.containsKey( v ) ) {
                if (m_outputRowMeta.indexOfValue(v) < 0) {
                    m_varsOrColsNotDefinedInOutputMeta.add(v);
                }
            }

            if (m_varsOrColsNotDefinedInOutputMeta.size() > 0 && log != null) {
                StringBuilder b = new StringBuilder();
                for (String v : m_varsOrColsNotDefinedInOutputMeta) {
                    b.append(v).append(" ");
                }
                log.logDetailed(BaseMessages.getString(PKG,
                        "CPythonScriptExecutor.Message.VarsOrColsNotDefinedInOutputMeta", b.toString()));
            }
            m_first = false;
        }

        m_tmpSet.clear();
        m_unsetVars.clear();

        // get the values of the variables
        for (String v : varsToGet) {
            if (session.checkIfPythonVariableIsSet(v)) {
                // add this to the ok list, so we can check to see if there
                // are vars defined in output meta that are not set or not
                // present in the list of vars to get
                m_tmpSet.add(v);

                int outputIndex = m_outputRowMeta.indexOfValue(v);
                if (outputIndex >= 0) {
                    PythonSession.PythonVariableType varType = session.getPythonVariableType(v);
                    if (varType == PythonSession.PythonVariableType.Image) {
                        if (m_outputRowMeta.getValueMeta(outputIndex)
                                .getType() != ValueMetaInterface.TYPE_SERIALIZABLE) {
                            throw new KettleException(BaseMessages.getString(PKG,
                                    "CPythonScriptExecutor.Error.ImageDataMustBeStoredInSerializable"));
                        }
                        outputRow[outputIndex] = session.getImageFromPython(v);
                    } else {
                        Object varVal = session.getVariableValueFromPythonAsPlainString(v);
                        if (m_outputRowMeta.getValueMeta(outputIndex).getType() != ValueMetaInterface.TYPE_STRING) {
                            varVal = m_outputRowMeta.getValueMeta(outputIndex)
                                    //.convertData( new ValueMeta( v, ValueMetaInterface.TYPE_STRING ), varVal );
                                    .convertData(
                                            ValueMetaFactory.createValueMeta(v, ValueMetaInterface.TYPE_STRING),
                                            varVal);
                        }
                        outputRow[outputIndex] = varVal;
                    }
                }
            } else {
                if (!continueOnUnsetVars) {
                    m_unsetVars.add(v);
                }
            }
        }

        if (m_unsetVars.size() > 0) {
            StringBuilder b = new StringBuilder();
            for (String v : m_unsetVars) {
                b.append(v).append(" ");
            }
            throw new KettleException(
                    BaseMessages.getString(PKG, "CPythonScriptExecutor.Error.PythonVariableNotSet", b.toString()));
        }

        if (m_tmpSet.size() != m_outputRowMeta.size()) {
            StringBuilder b = new StringBuilder();
            for (ValueMetaInterface outV : m_scriptOnlyOutputRowMeta.getValueMetaList()) {
                if (!m_tmpSet.contains(outV.getName())) {
                    b.append(outV.getName()).append(" ");
                }
            }
            if (log != null) {
                log.logDetailed(BaseMessages.getString(PKG,
                        "CPythonScriptExecutor.Message.OutputFieldsNotPresentOrSet", b.toString()));
            }
        }

        return outputRow;
    }

    /**
     * Remove any null entries from a sample
     *
     * @param sample the sample to check.
     */
    public static void pruneNullRowsFromSample(List<Object[]> sample) {
        int pos = sample.size() - 1;

        // remove from the end to avoid internal shifting.
        while (pos > 0 && sample.get(pos) == null) {
            sample.remove(pos);
            pos--;
        }
    }

    /**
     * Generate some random rows to send to python in the case where a single variable (data frame) is being extracted
     * and we want to try and determine the types of the output fields
     *
     * @param inputMeta incoming row meta
     * @param r         Random instance to use
     * @return a list of randomly generated rows with types matching the incoming row types.
     * @throws KettleException if a problem occurs
     */
    protected static List<Object[]> generateRandomRows(RowMetaInterface inputMeta, Random r)
            throws KettleException {
        List<Object[]> rows = new ArrayList<Object[]>(NUM_RANDOM_ROWS);
        // ValueMetaInterface numericVM = new ValueMeta( "num", ValueMetaInterface.TYPE_NUMBER ); //$NON-NLS-1$
        ValueMetaInterface numericVM = ValueMetaFactory.createValueMeta("num", ValueMetaInterface.TYPE_NUMBER); //$NON-NLS-1$

        for (int i = 0; i < NUM_RANDOM_ROWS; i++) {
            Object[] currentRow = new Object[inputMeta.size()];
            for (int j = 0; j < inputMeta.size(); j++) {
                ValueMetaInterface vm = inputMeta.getValueMeta(j);

                ValueMetaInterface tempVM = vm.clone();
                tempVM.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL);

                Object newVal;
                double d = r.nextDouble();
                switch (vm.getType()) {
                case ValueMetaInterface.TYPE_NUMBER:
                case ValueMetaInterface.TYPE_INTEGER:
                case ValueMetaInterface.TYPE_BIGNUMBER:
                    d *= 100.0;
                    newVal = d;
                    if (vm.getStorageType() == ValueMetaInterface.STORAGE_TYPE_BINARY_STRING) {
                        newVal = tempVM.convertData(numericVM, newVal);
                    }
                    currentRow[j] = vm.getStorageType() == ValueMetaInterface.STORAGE_TYPE_NORMAL
                            ? vm.convertData(numericVM, newVal)
                            : tempVM.convertToBinaryStringStorageType(newVal);
                    break;
                case ValueMetaInterface.TYPE_DATE:
                    newVal = new Date(new Date().getTime() + (long) (d * 100000));
                    currentRow[j] = vm.getStorageType() == ValueMetaInterface.STORAGE_TYPE_NORMAL ? newVal
                            : tempVM.convertToBinaryStringStorageType(newVal);
                    break;
                case ValueMetaInterface.TYPE_TIMESTAMP:
                    newVal = new Timestamp(new Date().getTime() + (long) (d * 100000));
                    currentRow[j] = vm.getStorageType() == ValueMetaInterface.STORAGE_TYPE_NORMAL ? newVal
                            : tempVM.convertToBinaryStringStorageType(newVal);
                    break;
                case ValueMetaInterface.TYPE_BOOLEAN:
                    newVal = r.nextBoolean();
                    currentRow[j] = vm.getStorageType() == ValueMetaInterface.STORAGE_TYPE_NORMAL ? newVal
                            : tempVM.convertToBinaryStringStorageType(newVal);
                    break;
                default:
                    newVal = d < 0.5 ? "value1" : "value2";
                    currentRow[j] = vm.getStorageType() == ValueMetaInterface.STORAGE_TYPE_NORMAL ? newVal
                            : tempVM.convertToBinaryStringStorageType(newVal);
                }
            }
            rows.add(currentRow);
        }
        return rows;
    }

    public static PythonSession.RowMetaAndRows determineOutputMetaSingleVariable(Object requester,
            List<RowMetaInterface> inputMetas, CPythonScriptExecutorMeta cPythonScriptExecutorMeta,
            LogChannelInterface log, VariableSpace vars) throws KettleException {

        synchronized (requester) {
            PythonSession.RowMetaAndRows outputMeta = null;
            PythonSession session = null;
            try {
                String varToGet = vars
                        .environmentSubstitute(cPythonScriptExecutorMeta.getPythonVariablesToGet().get(0));

                String script = cPythonScriptExecutorMeta.getScript();
                if (cPythonScriptExecutorMeta.getLoadScriptAtRuntime()) {
                    String fileName = vars.environmentSubstitute(cPythonScriptExecutorMeta.getScriptToLoad());
                    if (Const.isEmpty(fileName)) {
                        throw new KettleException(BaseMessages.getString(PKG,
                                "CPythonScriptExecutorData.Error.ScriptFileDoesNotExist", fileName));
                    }

                    script = loadScriptFromFile(fileName);
                }

                List<String> frameNames = cPythonScriptExecutorMeta.getFrameNames();
                if (Const.isEmpty(script)) {
                    throw new KettleException(
                            BaseMessages.getString(PKG, "CPythonScriptExecutorData.Error.CantDetermineOutputMeta"));
                }

                if (inputMetas != null && inputMetas.size() != frameNames.size()) {
                    throw new KettleException(
                            BaseMessages.getString(PKG, "CPythonScriptExecutorData.Error.WrongNumberOfFrameNames"));
                }

                log.logDetailed(
                        BaseMessages.getString(PKG, "CPythonScriptExecutorData.Message.DeterminingOutputFormat"));

                Random r = new Random(1);

                session = acquirePySession(requester, log);

                List<List<Object[]>> randomRows = new ArrayList<List<Object[]>>();
                if (inputMetas != null) {
                    for (int i = 0; i < inputMetas.size(); i++) {
                        RowMetaInterface currentMeta = inputMetas.get(i);
                        String currentFrameName = vars.environmentSubstitute(frameNames.get(i));
                        List<Object[]> randomRow = generateRandomRows(currentMeta, r);
                        randomRows.add(randomRow);
                        session.rowsToPythonDataFrame(currentMeta, randomRow, currentFrameName);
                    }
                }

                List<String> outAndErrors = session.executeScript(script);
                if (!Const.isEmpty(outAndErrors.get(1))) {
                    throw new KettleException(outAndErrors.get(1));
                }
                if (!session.checkIfPythonVariableIsSet(varToGet)) {
                    throw new KettleException(BaseMessages.getString(PKG,
                            "CPythonScriptExecutor.Error.PythonVariableNotSet", varToGet));
                }

                // determine type...
                PythonSession.PythonVariableType type = session.getPythonVariableType(varToGet);
                if (type == PythonSession.PythonVariableType.DataFrame) {
                    PythonSession.RowMetaAndRows result = session.rowsFromPythonDataFrame(varToGet,
                            cPythonScriptExecutorMeta.getIncludeFrameRowIndexAsOutputField());
                    return result;
                } else {
                    // this variable is some other type
                    ValueMetaInterface vm = type == PythonSession.PythonVariableType.Image ?
                    //new ValueMeta( varToGet, ValueMetaInterface.TYPE_SERIALIZABLE ) :
                            ValueMetaFactory.createValueMeta(varToGet, ValueMetaInterface.TYPE_SERIALIZABLE) :
                            //new ValueMeta( varToGet, ValueMetaInterface.TYPE_STRING );
                            ValueMetaFactory.createValueMeta(varToGet, ValueMetaInterface.TYPE_STRING);
                    PythonSession.RowMetaAndRows result = new PythonSession.RowMetaAndRows();
                    result.m_rowMeta = new RowMeta();
                    result.m_rowMeta.addValueMeta(vm);
                    return result;
                }
            } catch (Exception ex) {
                throw new KettleException(ex);
            } finally {
                releasePySession(requester);
            }
        }
    }

    public static void initPython() throws KettleException {
        // check python availability
        if (!PythonSession.pythonAvailable()) {
            // initialize...
            PythonSession.initSession("python");
        } else {
            return;
        }
        if (!PythonSession.pythonAvailable()) {
            String pyCheckResults = PythonSession.getPythonEnvCheckResults();
            if (!Const.isEmpty(pyCheckResults)) {
                throw new KettleException(
                        BaseMessages.getString(PKG, "CPythonScriptExecutor.Error.PythonInitializationProblem")
                                + ":\n\n" + pyCheckResults);
            } else {
                throw new KettleException(
                        BaseMessages.getString(PKG, "CPythonScriptExecutor.Error.PythonInitializationProblem"));
            }
        }
    }

    public static PythonSession acquirePySession(Object requester, LogChannelInterface log) throws KettleException {
        // check availability first...
        initPython();

        PythonSession session;
        try {
            session = PythonSession.acquireSession(requester);
        } catch (SessionException ex) {
            throw new KettleException(ex);
        }

        session.setLog(log);
        return session;
    }

    protected static void releasePySession(Object requester) {
        PythonSession.releaseSession(requester);
    }
}