com.panet.imeta.trans.steps.gpbulkloader.GPBulkLoader.java Source code

Java tutorial

Introduction

Here is the source code for com.panet.imeta.trans.steps.gpbulkloader.GPBulkLoader.java

Source

/* Copyright (c) 2007 Pentaho Corporation.  All rights reserved. 
* This software was developed by Pentaho Corporation and is provided under the terms 
* of the GNU Lesser General Public License, Version 2.1. You may not use 
* this file except in compliance with the license. If you need a copy of the license, 
* please go to http://www.gnu.org/licenses/lgpl-2.1.txt. The Original Code is Pentaho 
* Data Integration.  The Initial Developer is Pentaho Corporation.
*
* Software distributed under the GNU Lesser Public License is distributed on an "AS IS" 
* basis, WITHOUT WARRANTY OF ANY KIND, either express or  implied. Please refer to 
* the license for the specific language governing your rights and limitations.*/

package com.panet.imeta.trans.steps.gpbulkloader;

//
// The "designer" notes of the Greenplum bulkloader:
// ----------------------------------------------
//
// - "Enclosed" is used in the loader instead of "optionally enclosed" as optionally
//   encloses kind of destroys the escaping.
// - A Boolean is output as Y and N (as in the text output step e.g.). If people don't
//   like this they can first convert the boolean value to something else before loading
//   it.
// - Filters (besides data and datetime) are not supported as it slows down.
//
// 

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.commons.vfs.FileObject;

import com.panet.imeta.core.Const;
import com.panet.imeta.core.database.DatabaseMeta;
import com.panet.imeta.core.exception.KettleException;
import com.panet.imeta.core.row.RowMetaInterface;
import com.panet.imeta.core.vfs.KettleVFS;
import com.panet.imeta.trans.Trans;
import com.panet.imeta.trans.TransMeta;
import com.panet.imeta.trans.step.BaseStep;
import com.panet.imeta.trans.step.StepDataInterface;
import com.panet.imeta.trans.step.StepInterface;
import com.panet.imeta.trans.step.StepMeta;
import com.panet.imeta.trans.step.StepMetaInterface;

/**
 * Performs a bulk load to an Greenplum table.
 *
 * Based on (copied from) Sven Boden's Oracle Bulk Loader step
 * @author Luke Lonergan
 * @since  28-mar-2008
 */
public class GPBulkLoader extends BaseStep implements StepInterface {
    Process psqlProcess = null;

    private GPBulkLoaderMeta meta;
    private GPBulkLoaderData data;
    private GPBulkDataOutput output = null;

    /*
     * Local copy of the transformation "preview" property. We only forward
     * the rows upon previewing, we don't do any of the real stuff.
     */
    private boolean preview = false;

    //
    // This class continually reads from the stream, and sends it to the log
    // if the logging level is at least basic level.
    //
    final private class StreamLogger extends Thread {
        private InputStream input;
        private String type;

        StreamLogger(InputStream is, String type) {
            this.input = is;
            this.type = type + ">"; //$NON-NLS-1$
        }

        public void run() {
            try {
                final BufferedReader br = new BufferedReader(new InputStreamReader(input));
                String line;
                while ((line = br.readLine()) != null) {
                    // Only perform the concatenation if at basic level. Otherwise,
                    // this just reads from the stream.
                    if (log.isBasic()) {
                        logBasic(type + line);
                    }
                }
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }

        }

    }

    public GPBulkLoader(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    }

    /**
     * Get the contents of the control file as specified in the meta object
     * 
     * @param meta the meta object to model the control file after
     * 
     * @return a string containing the control file contents
     */
    public String getControlFileContents(GPBulkLoaderMeta meta, RowMetaInterface rm, Object[] r)
            throws KettleException {
        DatabaseMeta dm = meta.getDatabaseMeta();
        String inputName = "'" + environmentSubstitute(meta.getDataFile()) + "'";

        //if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
        //{
        //   // if loading is concurrent, the filename has to be a * as sqlldr will
        //   // read from stdin.
        //   inputName = "*";
        //}

        String loadAction = meta.getLoadAction();

        StringBuffer contents = new StringBuffer(500);

        String tableName = dm.getQuotedSchemaTableCombination(environmentSubstitute(meta.getSchemaName()),
                environmentSubstitute(meta.getTableName()));

        // Create a Postgres / Greenplum COPY string for use with a psql client
        if (loadAction.equalsIgnoreCase("truncate")) {
            contents.append(loadAction + " ");
            contents.append(tableName + ";");
            contents.append(Const.CR);
        }
        contents.append("\\COPY ");
        // Table name

        contents.append(tableName);

        // Names of columns

        contents.append(" ( ");

        String streamFields[] = meta.getFieldStream();
        String tableFields[] = meta.getFieldTable();

        if (streamFields == null || streamFields.length == 0) {
            throw new KettleException("No fields defined to load to database");
        }

        for (int i = 0; i < streamFields.length; i++) {
            if (i != 0)
                contents.append(", ");
            contents.append(dm.quoteField(tableFields[i]));
        }

        contents.append(" ) ");

        // The "FROM" filename
        contents.append(" FROM ");
        contents.append(inputName);

        // The "FORMAT" clause
        contents.append(" WITH CSV ");

        // The single row error handling
        contents.append("LOG ERRORS INTO ");
        contents.append(tableName + "_errors ");
        contents.append(" SEGMENT REJECT LIMIT ");
        contents.append(meta.getMaxErrors());

        //      contents.append("  ERRORS=\'").append(meta.getMaxErrors()).append("\'").append(Const.CR);

        //      contents.append("LOAD DATA").append(Const.CR).append(
        //                      "INFILE ").append(inputName).append(Const.CR).append(
        //                      "INTO TABLE ").append(dm.getQuotedSchemaTableCombination(environmentSubstitute(meta.getSchemaName()),
        //                                                                             environmentSubstitute(meta.getTableName()))).append(
        //                      Const.CR).append(loadAction).append(Const.CR).append(
        //                        "FIELDS TERMINATED BY ',' ENCLOSED BY '\"'").append(Const.CR).append(
        //                        "(");
        //
        //      String streamFields[] = meta.getFieldStream();
        //      String tableFields[] = meta.getFieldTable();
        //      String dateMask[] = meta.getDateMask();
        //
        //      if ( streamFields == null || streamFields.length == 0 )
        //      {
        //         throw new KettleException("No fields defined to load to database");
        //      }
        //
        //      for (int i = 0; i < streamFields.length; i++)
        //      {
        //         if ( i!=0 ) contents.append(", ").append(Const.CR);
        //         contents.append(dm.quoteField(tableFields[i]));
        //
        //         int pos = rm.indexOfValue(streamFields[i]);
        //         if (pos<0)
        //         {
        //            throw new KettleException("Could not find field " +
        //                                    streamFields[i] + " in stream");
        //         }
        //         ValueMetaInterface v = rm.getValueMeta(pos);
        //         switch ( v.getType() )
        //         {
        //             case ValueMetaInterface.TYPE_STRING:
        //                if ( v.getLength() > 255 )
        //                {
        //                   contents.append(" CHAR(").append(v.getLength()).append(")");
        //                }
        //                else
        //                {
        //                   contents.append(" CHAR");
        //                }
        //                break;
        //             case ValueMetaInterface.TYPE_INTEGER:
        //             case ValueMetaInterface.TYPE_NUMBER:
        //             case ValueMetaInterface.TYPE_BIGNUMBER:
        //                break;
        //             case ValueMetaInterface.TYPE_DATE:
        //                if ( GPBulkLoaderMeta.DATE_MASK_DATE.equals(dateMask[i]) )
        //                {
        //                    contents.append(" DATE 'yyyy-mm-dd'");
        //                }
        //                else if ( GPBulkLoaderMeta.DATE_MASK_DATETIME.equals(dateMask[i]) )
        //                {
        //                   contents.append(" TIMESTAMP 'yyyy-mm-dd hh24:mi:ss.ff'");
        //                }
        //                else
        //                {
        //                   // If not specified the default is date.
        //                   contents.append(" DATE 'yyyy-mm-dd'");
        //                }
        //                break;
        //             case ValueMetaInterface.TYPE_BINARY:
        //                contents.append(" ENCLOSED BY '<startlob>' AND '<endlob>'");
        //                break;
        //         }
        //      }
        //      contents.append(")");

        //if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
        //{
        //   contents.append(Const.CR).append("BEGINDATA").append(Const.CR);
        //}

        return contents.toString();
    }

    /**
     * Create a control file.
     * 
     * @param filename
     * @param meta
     * @throws KettleException
     */
    public void createControlFile(String filename, Object[] row, GPBulkLoaderMeta meta) throws KettleException {
        File controlFile = new File(filename);
        FileWriter fw = null;

        try {
            controlFile.createNewFile();
            fw = new FileWriter(controlFile);
            fw.write(getControlFileContents(meta, getInputRowMeta(), row));
        } catch (IOException ex) {
            throw new KettleException(ex.getMessage(), ex);
        } finally {
            try {
                if (fw != null)
                    fw.close();
            } catch (Exception ex) {
            }
        }
    }

    /**
     * Create the command line for a psql process depending on the meta
     * information supplied.
     * 
     * @param meta The meta data to create the command line from
     * @param password Use the real password or not
     * 
     * @return The string to execute.
     * 
     * @throws KettleException Upon any exception
     */
    public String createCommandLine(GPBulkLoaderMeta meta, boolean password) throws KettleException {
        StringBuffer sb = new StringBuffer(300);

        if (meta.getPsqlpath() != null) {
            try {
                FileObject fileObject = KettleVFS.getFileObject(environmentSubstitute(meta.getPsqlpath()));
                String psqlexec = KettleVFS.getFilename(fileObject);
                sb.append('\'').append(psqlexec).append('\'');
            } catch (IOException ex) {
                throw new KettleException("Error retrieving sqlldr string", ex);
            }
        } else {
            throw new KettleException("No psql application specified");
        }

        if (meta.getControlFile() != null) {
            try {
                FileObject fileObject = KettleVFS.getFileObject(environmentSubstitute(meta.getControlFile()));

                sb.append(" -n -f ");
                sb.append('\'').append(KettleVFS.getFilename(fileObject)).append('\'');
            } catch (IOException ex) {
                throw new KettleException("Error retrieving controlfile string", ex);
            }
        } else {
            throw new KettleException("No control file specified");
        }

        if (meta.getLogFile() != null) {
            try {
                FileObject fileObject = KettleVFS.getFileObject(environmentSubstitute(meta.getLogFile()));

                sb.append(" -o ");
                sb.append('\'').append(KettleVFS.getFilename(fileObject)).append('\'');
            } catch (IOException ex) {
                throw new KettleException("Error retrieving logfile string", ex);
            }
        }

        DatabaseMeta dm = meta.getDatabaseMeta();
        if (dm != null) {
            String user = Const.NVL(dm.getUsername(), "");

            // Passwords will not work for now because we can't get them to the command line without assuming UNIX and using
            // an environment variable
            String pass = Const.NVL(dm.getPassword(), "");
            if (password && !pass.equalsIgnoreCase("")) {
                throw new KettleException(
                        "Passwords are not supported directly, try configuring your connection for trusted access using pg_hba.conf");
            }
            //           if ( ! password )
            //           {
            //              pass = "******";
            //           }
            //           String dns  = Const.NVL(dm.getDatabaseName(), "");
            //           sb.append(" -U ").append(environmentSubstitute(user)).append("/").append(environmentSubstitute(pass));

            sb.append(" -U ").append(environmentSubstitute(user));

            //Hostname and portname
            String hostname = Const.NVL(dm.getHostname(), "");
            String portnum = Const.NVL(dm.getDatabasePortNumberString(), "");
            sb.append(" -h ");
            sb.append(hostname);
            sb.append(" -p ");
            sb.append(portnum);

            // Database Name
            String dns = Const.NVL(dm.getDatabaseName(), "");
            sb.append(" -d ");
            String overrideName = meta.getDbNameOverride();
            if (Const.isEmpty(Const.rtrim(overrideName))) {
                sb.append(environmentSubstitute(dns));
            } else {
                // if the database name override is filled in, do that one.
                sb.append(environmentSubstitute(overrideName));
            }
        } else {
            throw new KettleException("No connection specified");
        }

        return sb.toString();
    }

    public boolean execute(GPBulkLoaderMeta meta, boolean wait) throws KettleException {
        Runtime rt = Runtime.getRuntime();

        try {
            psqlProcess = rt.exec(createCommandLine(meta, true));
            // any error message?
            StreamLogger errorLogger = new StreamLogger(psqlProcess.getErrorStream(), "ERROR");

            // any output?
            StreamLogger outputLogger = new StreamLogger(psqlProcess.getInputStream(), "OUTPUT");

            // kick them off
            errorLogger.start();
            outputLogger.start();

            if (wait) {
                // any error???
                int exitVal = psqlProcess.waitFor();
                logBasic(Messages.getString("GPBulkLoader.Log.ExitValuePsqlPath", "" + exitVal)); //$NON-NLS-1$
            }
        } catch (Exception ex) {
            // Don't throw the message upwards, the message contains the password.
            throw new KettleException("Error while executing psql \'" + createCommandLine(meta, false) + "\'");
        }

        return true;
    }

    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        meta = (GPBulkLoaderMeta) smi;
        data = (GPBulkLoaderData) sdi;

        try {
            Object[] r = getRow(); // Get row from input rowset & set row busy!
            if (r == null) // no more input to be expected...
            {
                setOutputDone();

                if (!preview) {
                    if (output != null) {
                        // Close the output
                        try {
                            output.close();
                        } catch (IOException e) {
                            throw new KettleException("Error while closing output", e);
                        }

                        output = null;
                    }

                    String loadMethod = meta.getLoadMethod();
                    if (GPBulkLoaderMeta.METHOD_AUTO_END.equals(loadMethod)) {
                        execute(meta, true);
                    }
                    //   else if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
                    //   {
                    //      try 
                    //      {
                    //         if ( psqlProcess != null )
                    //         {
                    //            int exitVal = psqlProcess.waitFor();
                    //            logBasic(Messages.getString("GPBulkLoader.Log.ExitValueSqlldr", "" + exitVal)); //$NON-NLS-1$
                    //         }
                    //         else
                    //         {
                    //            throw new KettleException("Internal error: no sqlldr process running");
                    //         }
                    //      }
                    //      catch ( Exception ex )
                    //      {
                    //         throw new KettleException("Error while executing sqlldr", ex);
                    //      }
                    //   }
                }
                return false;
            }

            if (!preview) {
                if (first) {
                    first = false;
                    createControlFile(environmentSubstitute(meta.getControlFile()), r, meta);
                    output = new GPBulkDataOutput(meta);

                    //   if ( GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(meta.getLoadMethod()) )
                    //   {
                    //      execute(meta, false);
                    //   }
                    output.open(this, psqlProcess);
                }
                output.writeLine(getInputRowMeta(), r);
            }
            putRow(getInputRowMeta(), r);
            incrementLinesOutput();

        } catch (KettleException e) {
            logError(Messages.getString("GPBulkLoader.Log.ErrorInStep") + e.getMessage()); //$NON-NLS-1$
            setErrors(1);
            stopAll();
            setOutputDone(); // signal end to receiver(s)
            return false;
        }

        return true;
    }

    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (GPBulkLoaderMeta) smi;
        data = (GPBulkLoaderData) sdi;

        Trans trans = getTrans();
        preview = trans.isPreview();

        if (super.init(smi, sdi)) {
            return true;
        }
        return false;
    }

    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (GPBulkLoaderMeta) smi;
        data = (GPBulkLoaderData) sdi;

        super.dispose(smi, sdi);

        if (!preview && meta.isEraseFiles()) {
            // Erase the created cfg/dat files if requested. We don't erase
            // the rest of the files because it would be "stupid" to erase them
            // right after creation. If you don't want them, don't fill them in.
            FileObject fileObject = null;

            String method = meta.getLoadMethod();
            if ( // GPBulkLoaderMeta.METHOD_AUTO_CONCURRENT.equals(method) ||
            GPBulkLoaderMeta.METHOD_AUTO_END.equals(method)) {
                if (meta.getControlFile() != null) {
                    try {
                        fileObject = KettleVFS.getFileObject(environmentSubstitute(meta.getControlFile()));
                        fileObject.delete();
                        fileObject.close();
                    } catch (IOException ex) {
                        logError("Error deleting control file \'" + KettleVFS.getFilename(fileObject) + "\': "
                                + ex.getMessage());
                    }
                }
            }

            if (GPBulkLoaderMeta.METHOD_AUTO_END.equals(method)) {
                // In concurrent mode the data is written to the control file.
                if (meta.getDataFile() != null) {
                    try {
                        fileObject = KettleVFS.getFileObject(environmentSubstitute(meta.getDataFile()));
                        fileObject.delete();
                        fileObject.close();
                    } catch (IOException ex) {
                        logError("Error deleting data file \'" + KettleVFS.getFilename(fileObject) + "\': "
                                + ex.getMessage());
                    }
                }
            }

            if (GPBulkLoaderMeta.METHOD_MANUAL.equals(method)) {
                logBasic("Deletion of files is not compatible with \'manual load method\'");
            }
        }
    }

    public String toString() {
        return this.getClass().getName();
    }

    //
    // Run is were the action happens!
    public void run() {
        try {
            logBasic(Messages.getString("System.Log.StartingToRun")); //$NON-NLS-1$

            while (processRow(meta, data) && !isStopped())
                ;
        } catch (Throwable t) {
            logError(Messages.getString("System.Log.UnexpectedError") + " : "); //$NON-NLS-1$ //$NON-NLS-2$
            logError(Const.getStackTracker(t));
            setErrors(1);
            stopAll();
        } finally {
            dispose(meta, data);
            logSummary();
            markStop();
        }
    }
}