com.graphiq.kettle.steps.uniquelist.UniqueListStep.java Source code

Java tutorial

Introduction

Here is the source code for com.graphiq.kettle.steps.uniquelist.UniqueListStep.java

Source

/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/

package com.graphiq.kettle.steps.uniquelist;

import org.apache.commons.lang.StringUtils;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.regex.Pattern;

/**
 * This class is part of the demo step plug-in implementation.
 * It demonstrates the basics of developing a plug-in step for PDI. 
 * 
 * The demo step adds a new string field to the row stream and sets its
 * value to "Hello World!". The user may select the name of the new field.
 *   
 * This class is the implementation of StepInterface.
 * Classes implementing this interface need to:
 * 
 * - initialize the step
 * - execute the row processing logic
 * - dispose of the step 
 * 
 * Please do not create any local fields in a StepInterface class. Store any
 * information related to the processing logic in the supplied step data interface
 * instead.  
 * 
 */

public class UniqueListStep extends BaseStep implements StepInterface {

    /**
     * The constructor should simply pass on its arguments to the parent class.
     * 
     * @param s                 step description
     * @param stepDataInterface    step data class
     * @param c                    step copy
     * @param t                    transformation description
     * @param dis                transformation executing
     */
    public UniqueListStep(StepMeta s, StepDataInterface stepDataInterface, int c, TransMeta t, Trans dis) {
        super(s, stepDataInterface, c, t, dis);
    }

    /**
     * This method is called by PDI during transformation startup. 
     * 
     * It should initialize required for step execution. 
     * 
     * The meta and data implementations passed in can safely be cast
     * to the step's respective implementations. 
     * 
     * It is mandatory that super.init() is called to ensure correct behavior.
     * 
     * Typical tasks executed here are establishing the connection to a database,
     * as wall as obtaining resources, like file handles.
     * 
     * @param smi     step meta interface implementation, containing the step settings
     * @param sdi    step data interface implementation, used to store runtime information
     * 
     * @return true if initialization completed successfully, false if there was an error preventing the step from working. 
     *  
     */
    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        // Casting to step-specific implementation classes is safe
        UniqueListMeta meta = (UniqueListMeta) smi;
        UniqueListData data = (UniqueListData) sdi;

        data.sourceFields = meta.getSourceFields();
        data.sourceDelims = meta.getSourceDelims();
        data.outputDelims = meta.getOutputDelims();
        data.outputFields = meta.getOutputFields();
        data.resultFields = new LinkedList<String>();

        return super.init(meta, data);
    }

    /**
     * Once the transformation starts executing, the processRow() method is called repeatedly
     * by PDI for as long as it returns true. To indicate that a step has finished processing rows
     * this method must call setOutputDone() and return false;
     * 
     * Steps which process incoming rows typically call getRow() to read a single row from the
     * input stream, change or add row content, call putRow() to pass the changed row on 
     * and return true. If getRow() returns null, no more rows are expected to come in, 
     * and the processRow() implementation calls setOutputDone() and returns false to
     * indicate that it is done too.
     * 
     * Steps which generate rows typically construct a new row Object[] using a call to
     * RowDataUtil.allocateRowData(numberOfFields), add row content, and call putRow() to
     * pass the new row on. Above process may happen in a loop to generate multiple rows,
     * at the end of which processRow() would call setOutputDone() and return false;
     * 
     * @param smi the step meta interface containing the step settings
     * @param sdi the step data interface that should be used to store
     * 
     * @return true to indicate that the function should be called again, false if the step is done
     */
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {

        // safely cast the step settings (meta) and runtime info (data) to specific implementations 
        UniqueListMeta meta = (UniqueListMeta) smi;
        UniqueListData data = (UniqueListData) sdi;

        // get incoming row, getRow() potentially blocks waiting for more rows, returns null if no more rows expected
        Object[] r = getRow();

        // if no more rows are expected, indicate step is finished and processRow() should not be called again
        if (r == null) {
            setOutputDone();
            return false;
        }

        // the "first" flag is inherited from the base step implementation
        // it is used to guard some processing tasks, like figuring out field indexes
        // in the row structure that only need to be done once
        if (first) {
            first = false;
            // clone the input row structure and place it in our data object
            data.outputRowMeta = getInputRowMeta().clone();
            data.originalLength = data.outputRowMeta.size();
            // use meta.getFields() to change it, so it reflects the output row structure 
            meta.getFields(data.outputRowMeta, getStepname(), null, null, this, null, null);
        }

        // safely add the string "Hello World!" at the end of the output row
        // the row array will be resized if necessary
        data.resultFields.clear();
        for (int i = 0; i < data.sourceFields.length; i++) {
            data.fieldIndex = data.outputRowMeta.indexOfValue(data.sourceFields[i]);
            data.joinDelim = Const.NVL(data.outputDelims[i], data.sourceDelims[i]);
            data.result = dedupe((String) r[data.fieldIndex], data.sourceDelims[i], data.joinDelim,
                    meta.isConsecutiveDelimsAsOne());
            if (Const.NVL(data.outputFields[i], null) == null) {
                // this means we want to overwrite the previous value
                r[data.fieldIndex] = data.result;
            } else {
                // else we want to add a new field
                data.resultFields.add(data.result);
            }
        }
        // add the new fields if necessary
        if (data.resultFields.size() > 0) {
            r = RowDataUtil.addRowData(r, data.originalLength,
                    data.resultFields.toArray(new String[data.resultFields.size()]));
        }

        // put the row to the output row stream
        putRow(data.outputRowMeta, r);

        // log progress if it is time to to so
        if (checkFeedback(getLinesRead())) {
            logBasic("Linenr " + getLinesRead()); // Some basic logging
        }

        // indicate that processRow() should be called again
        return true;
    }

    public String dedupe(String source, String sourceDelim, String outputDelim, boolean removeBlanks) {
        if (source == null) {
            return null;
        }
        String[] items = source.split(Pattern.quote(sourceDelim));
        LinkedHashSet<String> uniques = new LinkedHashSet<String>();
        Collections.addAll(uniques, items);
        if (removeBlanks) {
            uniques.remove("");
        }
        return StringUtils.join(uniques, outputDelim);
    }

    /**
     * This method is called by PDI once the step is done processing. 
     * 
     * The dispose() method is the counterpart to init() and should release any resources
     * acquired for step execution like file handles or database connections.
     * 
     * The meta and data implementations passed in can safely be cast
     * to the step's respective implementations. 
     * 
     * It is mandatory that super.dispose() is called to ensure correct behavior.
     * 
     * @param smi     step meta interface implementation, containing the step settings
     * @param sdi    step data interface implementation, used to store runtime information
     */
    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {

        // Casting to step-specific implementation classes is safe
        UniqueListMeta meta = (UniqueListMeta) smi;
        UniqueListData data = (UniqueListData) sdi;

        super.dispose(meta, data);
    }

}