Java tutorial
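This tutorial walks through the source of Pentaho Data Integration's PostgreSQL Bulk Loader step (PGBulkLoader). The step bulk-loads rows into a PostgreSQL (or Greenplum) table by spawning a psql client process and writing a COPY ... FROM STDIN command, followed by the row data in CSV form, to the process's standard input.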
/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.pgbulkloader;

//
// The "designer" notes of the PostgreSQL bulkloader:
// --------------------------------------------------
//
// Let's see how fast we can push data down the tube with the use of COPY FROM STDIN
//
//

import java.math.BigDecimal;

import org.apache.commons.vfs.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.database.DatabaseMeta;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.util.StreamLogger;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Performs a bulk load to a PostgreSQL table.
 *
 * Based on (copied from) Sven Boden's Oracle Bulk Loader step.
 *
 * @author matt
 * @since 28-mar-2008
 */
public class PGBulkLoader extends BaseStep implements StepInterface {
  private static Class<?> PKG = PGBulkLoaderMeta.class; // for i18n purposes, needed by Translator2!!

  private PGBulkLoaderMeta meta;
  private PGBulkLoaderData data;

  public PGBulkLoader(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
      Trans trans) {
    super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
  }

  /**
   * Build the COPY command (optionally preceded by a TRUNCATE) that is sent to the psql process.
   *
   * @param rm
   *          the metadata of the input rows
   * @param r
   *          the first input row
   *
   * @return a string containing the SQL to feed to psql
   */
  public String getCopyCommand(RowMetaInterface rm, Object[] r) throws KettleException {
    DatabaseMeta dm = meta.getDatabaseMeta();

    String loadAction = environmentSubstitute(meta.getLoadAction());

    StringBuffer contents = new StringBuffer(500);

    String tableName = dm.getQuotedSchemaTableCombination(environmentSubstitute(meta.getSchemaName()),
        environmentSubstitute(meta.getTableName()));

    // Set the date style...
    //
    // contents.append("SET DATESTYLE ISO;"); // This is the default but we set it anyway...
    // contents.append(Const.CR);

    // Create a Postgres / Greenplum COPY string for use with a psql client
    if (loadAction.equalsIgnoreCase("truncate")) {
      contents.append("TRUNCATE TABLE ");
      contents.append(tableName + ";");
      contents.append(Const.CR);
    }
    contents.append("COPY ");

    // Table name
    contents.append(tableName);

    // Names of the columns
    contents.append(" ( ");

    String[] streamFields = meta.getFieldStream();
    String[] tableFields = meta.getFieldTable();

    if (streamFields == null || streamFields.length == 0) {
      throw new KettleException("No fields defined to load to database");
    }

    for (int i = 0; i < streamFields.length; i++) {
      if (i != 0) {
        contents.append(", ");
      }
      contents.append(dm.quoteField(tableFields[i]));
    }

    contents.append(" ) ");

    // The "FROM" source: STDIN of the psql process, which acts as the FIFO
    contents.append(" FROM STDIN");

    // The "FORMAT" clause
    contents.append(" WITH CSV DELIMITER AS '").append(environmentSubstitute(meta.getDelimiter()))
        .append("' QUOTE AS '").append(environmentSubstitute(meta.getEnclosure())).append("'");
    contents.append(";").append(Const.CR);

    return contents.toString();
  }
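  // For example, with load action "truncate", a hypothetical target table "public"."sales"
  // with columns id, name and amount, a "," delimiter and a '"' enclosure, getCopyCommand()
  // produces:
  //
  //   TRUNCATE TABLE "public"."sales";
  //   COPY "public"."sales" ( "id", "name", "amount" )  FROM STDIN WITH CSV DELIMITER AS ',' QUOTE AS '"';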
  /**
   * Create the command line for a psql process depending on the meta information supplied.
   *
   * @param meta
   *          The metadata to create the command line from
   * @param password
   *          Whether to include the real password (passwords are currently not passed on the command line at all)
   *
   * @return The string to execute.
   *
   * @throws KettleException
   *           Upon any exception
   */
  public String createCommandLine(PGBulkLoaderMeta meta, boolean password) throws KettleException {
    StringBuffer sb = new StringBuffer(300);

    if (meta.getPsqlpath() != null) {
      try {
        FileObject fileObject = KettleVFS.getFileObject(environmentSubstitute(meta.getPsqlpath()), getTransMeta());
        String psqlexec = Const.optionallyQuoteStringByOS(KettleVFS.getFilename(fileObject));
        sb.append(psqlexec);
      } catch (KettleFileException ex) {
        throw new KettleException("Error retrieving the psql executable path", ex);
      }
    } else {
      if (isDetailed()) {
        logDetailed("psql defaults to system path");
      }
      sb.append("psql");
    }

    DatabaseMeta dm = meta.getDatabaseMeta();
    if (dm != null) {
      // Note: passwords are not supported directly; try configuring your connection for trusted
      // access using pg_hba.conf
      //

      // The username
      //
      String user = Const.NVL(dm.getUsername(), "");
      sb.append(" -U ").append(environmentSubstitute(user));

      // Hostname and port number
      //
      String hostname = environmentSubstitute(Const.NVL(dm.getHostname(), ""));
      String portnum = environmentSubstitute(Const.NVL(dm.getDatabasePortNumberString(), ""));
      sb.append(" -h ");
      sb.append(hostname);
      sb.append(" -p ");
      sb.append(portnum);

      if (meta.isStopOnError()) {
        sb.append(" -v ON_ERROR_STOP=1");
      }

      // Database name
      //
      String databaseName = environmentSubstitute(Const.NVL(dm.getDatabaseName(), ""));
      sb.append(" ");

      String overrideName = environmentSubstitute(meta.getDbNameOverride());
      if (Const.isEmpty(Const.rtrim(overrideName))) {
        sb.append(databaseName);
      } else {
        // If the database name override is filled in, use that one.
        sb.append(overrideName);
      }
    } else {
      throw new KettleException("No connection specified");
    }

    return sb.toString();
  }

  public boolean execute(PGBulkLoaderMeta meta, boolean wait) throws KettleException {
    Runtime rt = Runtime.getRuntime();

    try {
      String cmd = createCommandLine(meta, true);
      logBasic("Executing command: " + cmd);
      data.psqlProcess = rt.exec(cmd);

      // Any error message?
      //
      data.errorLogger = new StreamLogger(log, data.psqlProcess.getErrorStream(), "ERROR {0}");

      // Any output?
      //
      data.outputLogger = new StreamLogger(log, data.psqlProcess.getInputStream(), "OUTPUT {0}");

      // Where do we send the data to? --> To STDIN of the psql process
      //
      data.pgOutputStream = data.psqlProcess.getOutputStream();

      // Kick off the logger threads
      new Thread(data.errorLogger).start();
      new Thread(data.outputLogger).start();

      // OK, from here on we need to feed the COPY command, followed by the data, into the pgOutputStream
      //
    } catch (Exception ex) {
      throw new KettleException("Error while executing psql : " + createCommandLine(meta, false), ex);
    }

    return true;
  }
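  // For example, with a hypothetical user "pentaho", host "localhost", port "5432",
  // database "mydb" and "stop on error" enabled, createCommandLine() produces:
  //
  //   psql -U pentaho -h localhost -p 5432 -v ON_ERROR_STOP=1 mydb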
  public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    meta = (PGBulkLoaderMeta) smi;
    data = (PGBulkLoaderData) sdi;

    try {
      Object[] r = getRow(); // Get row from input rowset & set row busy!

      if (r == null) { // no more input to be expected...
        setOutputDone();

        // Close the output stream...
        // (it will be null if there were no input rows, i.e. an empty stream)
        //
        if (data != null && data.psqlProcess != null) {
          data.pgOutputStream.flush();
          data.pgOutputStream.close();

          // Wait for the psql process to finish and check for any error...
          //
          int exitVal = data.psqlProcess.waitFor();
          logBasic(BaseMessages.getString(PKG, "GPBulkLoader.Log.ExitValuePsqlPath", "" + exitVal));
          if (meta.isStopOnError() && exitVal != 0) {
            // If we're supposed to stop on error, this is where we do it.
            throw new KettleException(BaseMessages.getString(PKG, "PGBulkLoader.Exception.ExitValueNotZero", exitVal));
          }
        } else {
          logBasic(BaseMessages.getString(PKG, "PGBulkLoader.Log.NullInputAndOrPSQLProcess"));
        }

        return false;
      }

      if (first) {
        first = false;

        // Cache field indexes.
        //
        data.keynrs = new int[meta.getFieldStream().length];
        for (int i = 0; i < data.keynrs.length; i++) {
          data.keynrs[i] = getInputRowMeta().indexOfValue(meta.getFieldStream()[i]);
        }

        // Launch the psql process...
        //
        execute(meta, true);

        String copyCmd = getCopyCommand(getInputRowMeta(), r);
        logBasic("Launching command: " + copyCmd);
        data.pgOutputStream.write(copyCmd.getBytes());

        // Write rows of data hereafter...
        //
      }

      writeRowToPostgres(getInputRowMeta(), r);

      putRow(getInputRowMeta(), r);
      incrementLinesOutput();

      return true;
    } catch (Exception e) {
      logError(BaseMessages.getString(PKG, "GPBulkLoader.Log.ErrorInStep"), e);
      setErrors(1);
      stopAll();
      setOutputDone(); // signal end to receiver(s)
      return false;
    }
  }

  private void writeRowToPostgres(RowMetaInterface rowMeta, Object[] r) throws KettleException {

    try {
      // We have an output stream to which we can write CSV data.
      // Serialize every field of the row to it, separated by the delimiter,
      // and assume the data is already in the correct format.
      //
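      // For example, with a "," separator and a '"' quote, a hypothetical row holding
      // the values (4, O'Brien "Bob", 12.5) is serialized as:
      //
      //   4,"O'Brien ""Bob""",12.5
      //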
      for (int i = 0; i < data.keynrs.length; i++) {
        if (i > 0) {
          // Write a separator
          //
          data.pgOutputStream.write(data.separator);
        }

        int index = data.keynrs[i];
        ValueMetaInterface valueMeta = rowMeta.getValueMeta(index);
        Object valueData = r[index];

        if (valueData != null) {
          switch (valueMeta.getType()) {
            case ValueMetaInterface.TYPE_STRING:
              data.pgOutputStream.write(data.quote);

              // No longer dump the bytes for a lazy conversion:
              // we need to escape the quote characters in every string
              String quoteStr = new String(data.quote);
              String escapedString = valueMeta.getString(valueData).replace(quoteStr, quoteStr + quoteStr);
              data.pgOutputStream.write(escapedString.getBytes());

              data.pgOutputStream.write(data.quote);
              break;
            case ValueMetaInterface.TYPE_INTEGER:
              if (valueMeta.isStorageBinaryString()) {
                data.pgOutputStream.write((byte[]) valueData);
              } else {
                data.pgOutputStream.write(Long.toString(valueMeta.getInteger(valueData)).getBytes());
              }
              break;
            case ValueMetaInterface.TYPE_DATE:
              // Format the date in the right format.
              //
              switch (data.dateFormatChoices[i]) {
                // Pass the data along in the format chosen by the user OR in binary format...
                //
                case PGBulkLoaderMeta.NR_DATE_MASK_PASS_THROUGH:
                  if (valueMeta.isStorageBinaryString()) {
                    data.pgOutputStream.write((byte[]) valueData);
                  } else {
                    String dateString = valueMeta.getString(valueData);
                    if (dateString != null) {
                      data.pgOutputStream.write(dateString.getBytes());
                    }
                  }
                  break;

                // Convert to a "YYYY/MM/DD" format
                //
                case PGBulkLoaderMeta.NR_DATE_MASK_DATE:
                  String dateString = data.dateMeta.getString(valueMeta.getDate(valueData));
                  if (dateString != null) {
                    data.pgOutputStream.write(dateString.getBytes());
                  }
                  break;

                // Convert to a "YYYY/MM/DD HH:MM:SS" (ISO) format
                //
                case PGBulkLoaderMeta.NR_DATE_MASK_DATETIME:
                  String dateTimeString = data.dateTimeMeta.getString(valueMeta.getDate(valueData));
                  if (dateTimeString != null) {
                    data.pgOutputStream.write(dateTimeString.getBytes());
                  }
                  break;

                default:
                  break;
              }
              break;
            case ValueMetaInterface.TYPE_BOOLEAN:
              if (valueMeta.isStorageBinaryString()) {
                data.pgOutputStream.write((byte[]) valueData);
              } else {
                // Write the boolean in a form accepted by PostgreSQL's boolean input syntax;
                // formatting it through getNumber() would yield "1.0"/"0.0", which a boolean
                // column rejects.
                data.pgOutputStream.write(valueMeta.getBoolean(valueData) ? "Y".getBytes() : "N".getBytes());
              }
              break;
            case ValueMetaInterface.TYPE_NUMBER:
              if (valueMeta.isStorageBinaryString()) {
                data.pgOutputStream.write((byte[]) valueData);
              } else {
                data.pgOutputStream.write(Double.toString(valueMeta.getNumber(valueData)).getBytes());
              }
              break;
            case ValueMetaInterface.TYPE_BIGNUMBER:
              if (valueMeta.isStorageBinaryString()) {
                data.pgOutputStream.write((byte[]) valueData);
              } else {
                BigDecimal big = valueMeta.getBigNumber(valueData);
                if (big != null) {
                  data.pgOutputStream.write(big.toString().getBytes());
                }
              }
              break;
            default:
              break;
          }
        }
      }

      // Now write a newline
      //
      data.pgOutputStream.write(data.newline);
    } catch (Exception e) {
      throw new KettleException("Error serializing rows of data to the psql command", e);
    }
  }

  public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
    meta = (PGBulkLoaderMeta) smi;
    data = (PGBulkLoaderData) sdi;

    String enclosure = environmentSubstitute(meta.getEnclosure());
    String separator = environmentSubstitute(meta.getDelimiter());

    if (super.init(smi, sdi)) {

      if (enclosure != null) {
        data.quote = enclosure.getBytes();
      } else {
        data.quote = new byte[] {};
      }
      if (separator != null) {
        data.separator = separator.getBytes();
      } else {
        data.separator = new byte[] {};
      }
      data.newline = Const.CR.getBytes();
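      // Map each field's date mask onto one of the NR_DATE_MASK_* choices used by
      // writeRowToPostgres(); an empty or unrecognized mask means pass-through.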
      data.dateFormatChoices = new int[meta.getFieldStream().length];
      for (int i = 0; i < data.dateFormatChoices.length; i++) {
        if (Const.isEmpty(meta.getDateMask()[i])) {
          data.dateFormatChoices[i] = PGBulkLoaderMeta.NR_DATE_MASK_PASS_THROUGH;
        } else if (meta.getDateMask()[i].equalsIgnoreCase(PGBulkLoaderMeta.DATE_MASK_DATE)) {
          data.dateFormatChoices[i] = PGBulkLoaderMeta.NR_DATE_MASK_DATE;
        } else if (meta.getDateMask()[i].equalsIgnoreCase(PGBulkLoaderMeta.DATE_MASK_DATETIME)) {
          data.dateFormatChoices[i] = PGBulkLoaderMeta.NR_DATE_MASK_DATETIME;
        } else { // The default: just pass it along...
          data.dateFormatChoices[i] = PGBulkLoaderMeta.NR_DATE_MASK_PASS_THROUGH;
        }
      }

      return true;
    }
    return false;
  }
}
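The step drives an external psql client because COPY FROM STDIN needs a data stream alongside the command. If you want to experiment with the same technique without spawning a process, the PostgreSQL JDBC driver exposes COPY through org.postgresql.copy.CopyManager. The sketch below is illustrative only and is not part of the Kettle code: the connection URL, the credentials and the "sales" table are made-up stand-ins, and it assumes the PostgreSQL JDBC driver is on the classpath.

import java.io.StringReader;
import java.sql.Connection;
import java.sql.DriverManager;

import org.postgresql.PGConnection;
import org.postgresql.copy.CopyManager;

public class CopyFromStdinDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical connection details; adjust to your own environment.
    Connection conn = DriverManager.getConnection("jdbc:postgresql://localhost:5432/mydb", "pentaho", "secret");

    // The same CSV shape the step writes to the psql pipe: quoted strings with doubled quotes.
    String csv = "4,\"O'Brien \"\"Bob\"\"\",12.5\n5,\"Plain\",7.25\n";

    // CopyManager streams the reader straight into COPY ... FROM STDIN,
    // much like the step streams bytes into the psql process.
    CopyManager copyManager = conn.unwrap(PGConnection.class).getCopyAPI();
    long rowsLoaded = copyManager.copyIn(
        "COPY sales ( id, name, amount ) FROM STDIN WITH CSV DELIMITER AS ',' QUOTE AS '\"'",
        new StringReader(csv));

    System.out.println("Loaded " + rowsLoaded + " rows");
    conn.close();
  }
}

Either route ends up in the same server-side COPY code path; the step's psql approach simply keeps the data transfer outside the JVM, which fits the "how fast can we push data down the tube" goal stated in the designer notes.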