com.twineworks.kettle.ruby.step.execmodels.SimpleExecutionModel.java Source code

Java tutorial

Introduction

Here is the source code for com.twineworks.kettle.ruby.step.execmodels.SimpleExecutionModel.java

Source

/*
 * Ruby for pentaho kettle
 * Copyright (C) 2017 Twineworks GmbH
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

package com.twineworks.kettle.ruby.step.execmodels;

import com.twineworks.kettle.ruby.step.*;
import com.twineworks.kettle.ruby.step.meta.RubyScriptMeta;
import com.twineworks.kettle.ruby.step.meta.RubyVariableMeta;
import com.twineworks.kettle.ruby.step.streams.*;
import org.apache.commons.lang.ArrayUtils;
import org.jruby.*;
import org.jruby.embed.EvalFailedException;
import org.jruby.exceptions.ThreadKill;
import org.jruby.javasupport.JavaEmbedUtils;
import org.jruby.javasupport.JavaUtil;
import org.jruby.runtime.builtin.IRubyObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.*;
import org.pentaho.di.core.row.value.ValueMetaInternetAddress;
import org.pentaho.di.core.row.value.ValueMetaTimestamp;
import org.pentaho.di.trans.step.errorhandling.StreamInterface;

import java.io.File;
import java.io.StringReader;
import java.math.BigDecimal;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.sql.Timestamp;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;

public class SimpleExecutionModel implements ExecutionModel {

    // per-step runtime state: the scripting container, the JRuby runtime and the cached row metadata live here
    private RubyStepData data;
    // step configuration: scripts, ruby variables, gem home and stream definitions are read from it
    private RubyStepMeta meta;
    // the step this model runs for; used for logging, variable substitution and row input/output
    private RubyStep step;

    /**
     * Binds this execution model to a step together with that step's runtime
     * data and configuration. The references are stored as-is.
     *
     * @param step the step this model executes for
     * @param data the step's runtime data holder
     * @param meta the step's configuration
     */
    @Override
    public void setEnvironment(RubyStep step, RubyStepData data, RubyStepMeta meta) {
        this.step = step;
        this.meta = meta;
        this.data = data;
    }

    /**
     * Sets up the scripting environment: creates the container and runtime,
     * registers the gem directory, parses the row script and publishes the
     * step, the dispatcher, the configured ruby variables and the script tabs
     * into ruby scope.
     *
     * @return {@code true} when setup completed, {@code false} when any
     *         exception was raised (it is logged as an error on the step)
     */
    @Override
    public boolean onInit() {

        try {

            data.forcedHalt = false;

            data.container = RubyStepFactory.createScriptingContainer(true);
            data.runtime = data.container.getProvider().getRuntime();

            // register the gem directory, if any is configured
            setGemHome();

            // the row script is only parsed here, it is run once per row later on
            data.container.setScriptFilename(meta.getRowScript().getTitle());
            data.rubyScriptObject = data.container.parse(meta.getRowScript().getScript(), 0);

            // well-known globals
            data.container.put("$step", step);
            data.container.put("$trans", step.getDispatcher());

            // user defined variables, values go through variable substitution first
            for (RubyVariableMeta rubyVar : meta.getRubyVariables()) {
                String substituted = step.environmentSubstitute(rubyVar.getValue());
                data.container.put(rubyVar.getName(), substituted);
            }

            // every script tab is reachable by its title through $tabs
            RubyHash tabsByTitle = new RubyHash(data.runtime);
            for (RubyScriptMeta scriptTab : meta.getScripts()) {
                tabsByTitle.put(scriptTab.getTitle(), new ScriptTab(scriptTab, data));
            }
            data.container.put("$tabs", tabsByTitle);

            // scratch list collecting the rows produced by a single script run
            data.rowList = new LinkedList<Object[]>();

            // give each java stream writer a << operator that aliases write
            final String[] writerClasses = { "StdStreamWriter", "ErrorStreamWriter", "StepStreamWriter" };
            for (String writerClass : writerClasses) {
                data.container.runScriptlet("JavaUtilities.extend_proxy('com.twineworks.kettle.ruby.step.streams."
                        + writerClass + "') {alias << write}\n");
            }

            return true;

        } catch (Exception e) {
            step.logError("Error Initializing Ruby Scripting Step", e);
            return false;
        }
    }

    /**
     * Registers the gem directory with RubyGems inside the scripting container.
     *
     * The directory is resolved in this order: the gem home configured on the
     * step (after variable substitution), the RUBY_GEM_HOME variable, and
     * finally the "gems" folder inside the plugin directory.
     */
    private void setGemHome() {

        // if specified directly, take it
        String gemHomeString = step.environmentSubstitute(meta.getGemHome());

        // if not, fall back to RUBY_GEM_HOME
        if (Const.isEmpty(gemHomeString) && !Const.isEmpty(step.getVariable("RUBY_GEM_HOME"))) {
            gemHomeString = step.environmentSubstitute("${RUBY_GEM_HOME}");
        }

        // if that fails, use the standard one; after this the string is never empty
        if (Const.isEmpty(gemHomeString)) {
            gemHomeString = step.getPluginDir() + Const.FILE_SEPARATOR + "gems";
        }

        String gemHome = new File(gemHomeString).getAbsoluteFile().getAbsolutePath();

        // the path is embedded in a single-quoted ruby string literal: backslashes and single quotes
        // must be escaped, otherwise paths like C:\ , \\server\share or /opt/it's/gems break the scriptlet
        String gemHomeLiteral = gemHome.replace("\\", "\\\\").replace("'", "\\'");

        data.container.runScriptlet("require 'rubygems/defaults/jruby';Gem::Specification.add_dir '"
                + gemHomeLiteral + "' unless Gem::Specification.dirs.member?( '" + gemHomeLiteral + "' )");
    }

    /**
     * Releases the scripting environment: clears the cached ruby class
     * handles, terminates the container if there is one and drops the
     * references to container, parsed script and runtime.
     */
    @Override
    public void onDispose() {

        // cached ruby objects were created by the container that is about to be terminated,
        // all of them must go so nothing keeps pointing into the old runtime
        data.marshal = null;
        data.bigDecimal = null;
        data.ipAddr = null;

        if (data.container != null) {
            data.container.terminate();
        }

        data.container = null;
        data.rubyScriptObject = null;
        data.runtime = null;

    }

    /**
     * Schedules a forced shutdown of the ruby threads. The step gets five
     * seconds to finish normally (i.e. to complete processing of the current
     * row) before {@code forceStopRubyThreads()} is invoked from a background
     * thread.
     *
     * @throws KettleException declared by the interface, not thrown here
     */
    @Override
    public void onStopRunning() throws KettleException {

        Thread watchdog = new Thread("ruby-step-force-stop") {
            @Override
            public void run() {
                try {
                    Thread.sleep(5000);
                    forceStopRubyThreads();
                } catch (InterruptedException e) {
                    // interrupted while waiting: skip the forced stop, but keep the interrupt flag intact
                    Thread.currentThread().interrupt();
                }
            }
        };
        // the watchdog must never be the reason the JVM stays alive
        watchdog.setDaemon(true);
        watchdog.start();

    }

    /**
     * Kills all active ruby threads and tears the runtime down. Does nothing
     * if the container has been disposed already or if a forced halt was
     * performed before. Sets {@code data.forcedHalt} so that script errors
     * caused by the kill can be told apart from genuine failures.
     */
    private void forceStopRubyThreads() {

        // if the container is disposed already, bail out
        if (data.container == null) {
            return;
        }

        // try to kill all threads once
        if (data.forcedHalt) {
            return;
        }
        data.forcedHalt = true;

        // this runs on the watchdog thread while onDispose() may null data.runtime on another one,
        // so work on a snapshot instead of re-reading the field after the null check
        final Ruby runtime = data.runtime;
        if (runtime == null) {
            return;
        }

        for (RubyThread thread : runtime.getThreadService().getActiveRubyThreads()) {
            try {
                thread.kill();
            } catch (ThreadKill ignored) {
                // a kill may surface as ThreadKill in here; keep going so the remaining threads are stopped too
            }
        }

        runtime.tearDown();

    }

    /**
     * Returns the ruby {@code Marshal} module, evaluating it on first use and
     * caching it in the step data afterwards.
     */
    private IRubyObject getMarshal() {
        IRubyObject cached = data.marshal;
        if (cached != null) {
            return cached;
        }
        cached = data.container.parse("Marshal").run();
        data.marshal = cached;
        return cached;
    }

    /**
     * Returns the ruby {@code BigDecimal} class, requiring 'bigdecimal' on
     * first use and caching the result in the step data afterwards.
     */
    private IRubyObject getBigDecimal() {
        IRubyObject cached = data.bigDecimal;
        if (cached != null) {
            return cached;
        }
        cached = data.container.parse("require 'bigdecimal'; BigDecimal").run();
        data.bigDecimal = cached;
        return cached;
    }

    /**
     * Returns the ruby {@code IPAddr} class, requiring 'ipaddr' on first use
     * and caching the result in the step data afterwards.
     */
    private IRubyObject getIPAddr() {
        IRubyObject cached = data.ipAddr;
        if (cached != null) {
            return cached;
        }
        cached = data.container.parse("require 'ipaddr'; IPAddr").run();
        data.ipAddr = cached;
        return cached;
    }

    /**
     * Determines input and output row layouts, caches their field names and
     * publishes the standard stream objects ($output, $input, optionally
     * $error) as well as the $target_steps hash into ruby scope.
     *
     * @throws KettleException if the row metadata cannot be determined
     */
    private void initMainRowStream() throws KettleException {

        // the step's input row meta may be missing: either the previous steps never sent a row,
        // or there are no directly connected input steps at all
        RowMetaInterface incoming = step.getInputRowMeta();
        if (incoming == null) {
            incoming = data.hasDirectInput
                    // connected, but no rows arrived: ask the transformation for the layout
                    ? step.getTransMeta().getPrevStepFields(step.getStepMeta())
                    // not connected: there are no fields
                    : new RowMeta();
        }

        data.inputRowMeta = incoming.clone();
        data.inputFieldNames = data.inputRowMeta.getFieldNames();

        // the output layout starts as a copy of the input and is then adjusted by the step meta
        data.outputRowMeta = incoming.clone();
        meta.getFields(data.outputRowMeta, step.getStepname(), null, null, step, null, null);

        data.cacheFieldNames(data.inputRowMeta);
        data.cacheFieldNames(data.outputRowMeta);

        if (meta.isClearInputFields()) {
            data.baseRowMeta = data.emptyRowMeta;
        } else {
            data.baseRowMeta = data.inputRowMeta;
        }

        // standard streams
        data.container.put("$output", new StdStreamWriter(this));
        data.container.put("$input", new StdStreamReader(this));

        // the error stream only exists when error handling is switched on for the step
        if (meta.getParentStepMeta().isDoingErrorHandling()) {

            data.errorRowMeta = meta.getParentStepMeta().getStepErrorMeta().getErrorFields().clone();
            data.stepErrorMeta = meta.getParentStepMeta().getStepErrorMeta();
            data.cacheFieldNames(data.errorRowMeta);

            data.container.put("$error", new ErrorStreamWriter(this));
        }

        // one writer per target stream, keyed by the role name of the target step at the same position
        RubyHash writersByRole = new RubyHash(data.runtime);

        List<StreamInterface> targetStreams = meta.getStepIOMeta().getTargetStreams();
        for (int idx = 0; idx < targetStreams.size(); idx++) {
            StepStreamWriter writer = new StepStreamWriter(this, targetStreams.get(idx).getStepname());
            writersByRole.put(meta.getTargetSteps().get(idx).getRoleName(), writer);
        }

        data.container.put("$target_steps", writersByRole);

    }

    /**
     * Converts a kettle row into a fresh ruby hash keyed by field name.
     *
     * Null values become nil. Other values are converted according to the
     * field type: booleans, integers, strings and numbers through the value
     * meta, serializable values are un-marshalled when they were produced by
     * a ruby step (otherwise passed as they are), binaries become a ruby
     * array of bytes, big numbers a ruby BigDecimal, dates and timestamps a
     * ruby Time and internet addresses a ruby IPAddr. Fields of any other
     * type are not put into the hash.
     *
     * @param rowMeta layout describing {@code r}
     * @param r       the row data
     * @return a new hash holding the converted values
     * @throws KettleException if a value cannot be read through its value meta
     */
    public RubyHash createRubyInputRow(RowMetaInterface rowMeta, Object[] r) throws KettleException {

        // create a hash for the row, they are not reused on purpose, so the scripting user can safely use them to store entire rows between invocations
        RubyHash rubyRow = new RubyHash(data.runtime);

        String[] fieldNames = rowMeta.getFieldNames();
        for (int i = 0; i < fieldNames.length; i++) {

            String field = fieldNames[i];
            // null values need no special treatment, they'll become nil
            if (r[i] == null) {
                rubyRow.put(field, null);
            } else {

                ValueMetaInterface vm = rowMeta.getValueMeta(i);

                switch (vm.getType()) {
                case ValueMetaInterface.TYPE_BOOLEAN:
                    rubyRow.put(field, vm.getBoolean(r[i]));
                    break;
                case ValueMetaInterface.TYPE_INTEGER:
                    rubyRow.put(field, vm.getInteger(r[i]));
                    break;
                case ValueMetaInterface.TYPE_STRING:
                    rubyRow.put(field, vm.getString(r[i]));
                    break;
                case ValueMetaInterface.TYPE_NUMBER:
                    rubyRow.put(field, vm.getNumber(r[i]));
                    break;
                case ValueMetaInterface.TYPE_NONE:
                    rubyRow.put(field, r[i]);
                    break;
                case ValueMetaInterface.TYPE_SERIALIZABLE:
                    if (r[i] instanceof RubyStepMarshalledObject) {
                        Object restoredObject = getMarshal().callMethod(data.runtime.getCurrentContext(), "restore",
                                data.runtime.newString(r[i].toString()));
                        rubyRow.put(field, restoredObject);
                    } else {
                        // try to put the object in there as it is.. should create a nice adapter for the java object
                        rubyRow.put(field, r[i]);
                    }
                    break;
                case ValueMetaInterface.TYPE_BINARY:
                    // put a ruby array with bytes in there, that is expensive and should probably be avoided
                    rubyRow.put(field,
                            data.runtime.newArrayNoCopy(JavaUtil.convertJavaArrayToRuby(data.runtime,
                                    ArrayUtils.toObject((byte[]) vm.getBinary(r[i])))));

                    break;

                case ValueMetaInterface.TYPE_BIGNUMBER:
                    IRubyObject bigDecimalObject = getBigDecimal().callMethod(data.runtime.getCurrentContext(),
                            "new", data.runtime.newString((vm.getBigNumber(r[i])).toString()));
                    rubyRow.put(field, bigDecimalObject);
                    break;

                case ValueMetaInterface.TYPE_DATE:
                    rubyRow.put(field, data.runtime.newTime((vm.getDate(r[i])).getTime()));
                    break;

                case ValueMetaInterface.TYPE_TIMESTAMP:
                    ValueMetaTimestamp vmTimestamp = (ValueMetaTimestamp) vm;
                    Timestamp ts = vmTimestamp.getTimestamp(r[i]);
                    // getTime() already contains the millisecond part of getNanos(); subtracting it yields the
                    // start of the second the nanos are relative to. Dividing by 1000 instead truncates towards
                    // zero and ends up one second late for timestamps before 1970.
                    long secondStartMillis = ts.getTime() - ts.getNanos() / 1000000;
                    RubyTime rubyTime = data.runtime.newTime(secondStartMillis);
                    rubyTime.setNSec(ts.getNanos());
                    rubyRow.put(field, rubyTime);
                    break;

                case ValueMetaInterface.TYPE_INET:
                    ValueMetaInternetAddress vmInet = (ValueMetaInternetAddress) vm;
                    InetAddress ip = vmInet.getInternetAddress(r[i]);
                    IRubyObject ipObject = getIPAddr().callMethod(data.runtime.getCurrentContext(), "new",
                            data.runtime.newString(ip.getHostAddress()));
                    rubyRow.put(field, ipObject);
                    break;
                }

            }

        }

        return rubyRow;

    }

    /**
     * Copies the values of a ruby result hash into a kettle row.
     *
     * For every field in {@code forFields} the hash entry with the field's
     * name is looked up, converted to the java type matching the field's
     * kettle type and stored at the field's position in {@code r}. Missing or
     * nil entries, and fields of a type not handled here, are stored as
     * {@code null}.
     *
     * @param r         the row to write into, must be large enough for {@code forRow}
     * @param resultRow the hash returned by the script
     * @param forFields the fields to take from the hash
     * @param forRow    the row layout used to locate each field's index
     * @throws KettleException if a binary field holds a non-number or a
     *                         date/timestamp field holds an unsupported ruby value
     */
    private void applyRubyHashToRow(Object[] r, RubyHash resultRow, List<ValueMetaInterface> forFields,
            RowMetaInterface forRow) throws KettleException {

        // set each field's value from the resultRow
        for (ValueMetaInterface outField : forFields) {

            IRubyObject rubyVal = resultRow.fastARef(data.rubyStringCache.get(outField.getName()));

            // convert simple cases automatically
            Object javaValue = null;

            // for nil values just put null into the row
            if (rubyVal != null && !rubyVal.isNil()) {

                // TODO: provide a meaningful error message if conversion fails because the user put non-convertible results in there (like a string saying "true"/"false" for the bool type)
                switch (outField.getType()) {
                case ValueMetaInterface.TYPE_BOOLEAN:
                    javaValue = JavaEmbedUtils.rubyToJava(data.runtime, rubyVal, Boolean.class);
                    break;
                case ValueMetaInterface.TYPE_INTEGER:
                    javaValue = JavaEmbedUtils.rubyToJava(data.runtime, rubyVal, Long.class);
                    break;
                case ValueMetaInterface.TYPE_STRING:
                    javaValue = rubyVal.toString();
                    break;
                case ValueMetaInterface.TYPE_NUMBER:
                    javaValue = JavaEmbedUtils.rubyToJava(data.runtime, rubyVal, Double.class);
                    break;
                case ValueMetaInterface.TYPE_SERIALIZABLE:
                    String marshalled = getMarshal().callMethod(data.runtime.getCurrentContext(), "dump", rubyVal)
                            .toString();
                    javaValue = new RubyStepMarshalledObject(marshalled);
                    break;
                case ValueMetaInterface.TYPE_BINARY:
                    // TODO: provide meaningful error message if this fails
                    RubyArray arr = rubyVal.convertToArray();

                    byte[] bytes = new byte[arr.size()];
                    for (int i = 0; i < bytes.length; i++) {
                        Object rItem = arr.get(i);
                        if (rItem instanceof Number) {
                            bytes[i] = ((Number) rItem).byteValue();
                        } else {
                            throw new KettleException("Found a non-number in Binary field " + outField.getName()
                                    + ": " + rItem.toString());
                        }
                    }
                    javaValue = bytes;
                    break;
                case ValueMetaInterface.TYPE_BIGNUMBER:
                    if (rubyVal instanceof RubyFloat) {
                        // valueOf goes through the canonical decimal representation of the double, so a ruby 0.1
                        // arrives as 0.1 and not as the exact binary expansion 0.1000000000000000055...
                        javaValue = BigDecimal.valueOf((Double) rubyVal.toJava(Double.class));
                    } else {
                        javaValue = new BigDecimal(rubyVal.toString());
                    }

                    break;
                case ValueMetaInterface.TYPE_DATE:
                    if (rubyVal instanceof RubyFixnum) {
                        javaValue = new Date(((RubyFixnum) rubyVal).getLongValue());
                    } else if (rubyVal instanceof RubyTime) {
                        javaValue = ((RubyTime) rubyVal).getJavaDate();
                    } else {
                        throw new KettleException(
                                "cannot convert ruby value " + rubyVal.toString() + " to java Date");
                    }
                    break;

                case ValueMetaInterface.TYPE_TIMESTAMP:
                    if (rubyVal instanceof RubyFixnum) {
                        javaValue = new java.sql.Timestamp(((RubyFixnum) rubyVal).getLongValue());
                    } else if (rubyVal instanceof RubyTime) {
                        RubyTime time = (RubyTime) rubyVal;
                        long millis = time.getDateTime().getMillis();
                        // milliseconds within the second, always in 0..999 even for times before 1970
                        // (a plain % would be negative there and make setNanos() fail)
                        long millisOfSecond = ((millis % 1000) + 1000) % 1000;
                        long nanos = time.getNSec() + millisOfSecond * 1000000L;
                        // carry whole seconds out of the nanos so setNanos() always gets 0..999999999
                        long secondStartMillis = millis - millisOfSecond + (nanos / 1000000000L) * 1000L;
                        Timestamp ts = new java.sql.Timestamp(secondStartMillis);
                        ts.setNanos((int) (nanos % 1000000000L));
                        javaValue = ts;
                    } else {
                        throw new KettleException(
                                "cannot convert ruby value " + rubyVal.toString() + " to java.sql.Timestamp");
                    }
                    break;

                case ValueMetaInterface.TYPE_INET:
                    // only the low 32 bits of to_i are used, i.e. this handles IPv4-sized addresses
                    Long longNum = (Long) data.container.callMethod(rubyVal, "to_i");
                    javaValue = toInetAddress(longNum.intValue());
                    break;

                }

            }

            r[data.fieldIndexCache.get(forRow).get(outField.getName())] = javaValue;
        }

    }

    /**
     * Splits a 32 bit address into its four bytes, most significant byte first.
     */
    private byte[] toIPByteArray(int addr) {
        final byte[] octets = new byte[4];
        for (int pos = 0; pos < octets.length; pos++) {
            // shifts are 24, 16, 8, 0
            octets[pos] = (byte) (addr >>> (24 - 8 * pos));
        }
        return octets;
    }

    /**
     * Builds an {@link InetAddress} from a 32 bit address value.
     *
     * @return the address, or {@code null} if the address could not be created
     */
    private InetAddress toInetAddress(int addr) {
        final byte[] octets = toIPByteArray(addr);
        try {
            return InetAddress.getByAddress(octets);
        } catch (UnknownHostException e) {
            // not expected: the array always has four entries, which is a legal address length
            return null;
        }
    }

    /**
     * Turns the value returned by a script into output rows appended to
     * {@code rowList}.
     *
     * nil produces nothing, a hash produces one row, an array is walked
     * recursively entry by entry. Anything else is ignored with a warning in
     * the step log.
     *
     * @param rubyObject the script result (or an element of it)
     * @param inRow      layout used to clone {@code r} for additional rows
     * @param r          the row the output rows are based on
     * @param rowList    receives the generated rows
     * @param forFields  the fields taken from each result hash
     * @param forRow     layout of the generated rows
     * @throws KettleException if cloning or value conversion fails
     */
    public void fetchRowsFromScriptOutput(IRubyObject rubyObject, RowMetaInterface inRow, Object[] r,
            List<Object[]> rowList, List<ValueMetaInterface> forFields, RowMetaInterface forRow)
            throws KettleException {

        // nil means: no row
        if (rubyObject.isNil()) {
            return;
        }

        if (rubyObject instanceof RubyHash) {
            // the first row may use r directly, every further row needs its own copy
            Object[] base = rowList.isEmpty() ? r : inRow.cloneRow(r);
            Object[] outRow = RowDataUtil.resizeArray(base, forRow.size());
            applyRubyHashToRow(outRow, (RubyHash) rubyObject, forFields, forRow);
            rowList.add(outRow);
        } else if (rubyObject instanceof RubyArray) {
            // each entry is processed like a script result of its own
            RubyArray entries = (RubyArray) rubyObject;
            final int count = entries.getLength();
            for (int idx = 0; idx < count; idx++) {
                fetchRowsFromScriptOutput(entries.entry(idx), inRow, r, rowList, forFields, forRow);
            }
        } else {
            // neither nil, nor hash, nor array: drop the value, but leave a trace in the log
            step.logBasic("WARNING: script returned non-hash value: " + rubyObject.toString() + " as a result ");
        }

    }

    /**
     * Processes one unit of work for the step.
     *
     * With direct input, one row is read, handed to the row script as $row and
     * the rows the script returns are written to the main output. Without
     * direct input the row script is run exactly once with no input row. On
     * the first call the init script (if any) is run and the info and main
     * streams are set up; when processing ends the dispose script (if any) is
     * run before the output is marked as done.
     *
     * @return {@code true} if a row was processed and the method should be
     *         called again, {@code false} when processing is finished or the
     *         script was aborted by a forced halt
     * @throws KettleException if the script fails while no forced halt is in
     *                         progress, or if row handling fails
     */
    @Override
    public boolean onProcessRow() throws KettleException {

        // as calls to getRow() would yield rows from indeterminate sources unless
        // all info streams have been emptied first
        // we opt to enforce to have all info steps or no info steps
        try {

            Object[] r = null;

            if (step.first) {
                data.hasDirectInput = meta.hasDirectInput();
                // call the init script here rather than in the init section. It guarantees that other steps are fully initialized.
                if (meta.getInitScript() != null) {
                    data.container.runScriptlet(new StringReader(meta.getInitScript().getScript()),
                            meta.getInitScript().getTitle());
                }

                // this must be done before the first call to getRow() in case there are info streams present
                initInfoRowStreams();
            }

            // direct input means, there's no info steps and at least one step providing data
            if (data.hasDirectInput) {

                r = step.getRow();

                // only now is the metadata available
                if (step.first) {
                    initMainRowStream();
                    step.first = false;
                }

                // a null row signals the end of the input
                if (r != null) {

                    RubyHash rubyRow = createRubyInputRow(data.inputRowMeta, r);

                    // put the row into the container
                    data.container.put("$row", rubyRow);

                    // run the script, the result is one or more rows
                    IRubyObject scriptResult = data.rubyScriptObject.run();

                    data.rowList.clear();
                    fetchRowsFromScriptOutput(scriptResult, data.baseRowMeta, r, data.rowList,
                            meta.getAffectedFields(), data.outputRowMeta);

                    // now if the script has output rows, write them to the main output stream
                    for (Object[] outrow : data.rowList) {
                        step.putRow(data.outputRowMeta, outrow);
                    }

                    return true;
                } else {

                    // run the end script here rather than on dispose, this ensures that the row streams are still up, so the user can choose to
                    // write "summary" rows and the like
                    if (meta.getDisposeScript() != null) {
                        data.container.runScriptlet(meta.getDisposeScript().getScript());
                    }

                    // no more rows coming in
                    step.setOutputDone();
                    return false;
                }

            }

            // no direct input means the script is not getting an input row and is executed exactly once
            else {
                if (step.first) {
                    initMainRowStream();
                    step.first = false;
                }
                // there is no input row, start from an empty row sized for the output layout
                r = new Object[data.outputRowMeta.size()];

                // run the script, the result is one or more rows
                IRubyObject scriptResult = data.rubyScriptObject.run();

                data.rowList.clear();
                fetchRowsFromScriptOutput(scriptResult, data.baseRowMeta, r, data.rowList, meta.getAffectedFields(),
                        data.outputRowMeta);

                // now if the script has output rows, write them to the main output stream
                for (Object[] outrow : data.rowList) {
                    step.putRow(data.outputRowMeta, outrow);
                }

                // run the end script here rather than on dispose, this ensures that the row streams are still up, so the user can choose to
                // write "summary" rows and the like
                if (meta.getDisposeScript() != null) {
                    data.container.runScriptlet(meta.getDisposeScript().getScript());
                }

                step.setOutputDone();
                return false;
            }

        } catch (EvalFailedException e) {
            // a failure during a forced halt is the expected result of killing the ruby threads, anything else is a real error
            if (!data.forcedHalt) {
                throw new KettleException(e);
            }
            // transformation has been stopped
            return false;
        } catch (ThreadKill e) {
            // same distinction as above for a kill that reaches this thread directly
            if (!data.forcedHalt) {
                throw new KettleException(e);
            }
            // transformation has been stopped
            return false;
        }

    }

    /**
     * Publishes the $info_steps hash into ruby scope. It maps the role name
     * of every info step to a reader for the corresponding info stream.
     *
     * @throws KettleException if reading from an info stream fails
     */
    private void initInfoRowStreams() throws KettleException {

        RubyHash readersByRole = new RubyHash(data.runtime);

        int idx = 0;
        for (StreamInterface infoStream : meta.getStepIOMeta().getInfoStreams()) {

            StepStreamReader liveReader = new StepStreamReader(this, infoStream.getStepname());
            Object exposedReader = liveReader;

            // if there's direct input connected as well as info streams present, the info streams *must* be prefetched as per 4.0 API,
            // so the script gets a reader over the buffered rows instead of the live one
            if (data.hasDirectInput) {
                RubyArray prefetched = liveReader.readAll();
                exposedReader = new BufferStreamReader(this, prefetched);
            }

            readersByRole.put(meta.getInfoSteps().get(idx).getRoleName(), exposedReader);
            idx++;
        }

        data.container.put("$info_steps", readersByRole);

    }

    /**
     * @return the step this execution model was bound to in {@link #setEnvironment}
     */
    public RubyStep getStep() {
        return this.step;
    }

    /**
     * @return the step data this execution model was bound to in {@link #setEnvironment}
     */
    public RubyStepData getData() {
        return this.data;
    }

}