org.pentaho.di.trans.steps.mongodboutput.MongoDbOutput.java Source code

Introduction

Here is the source code for org.pentaho.di.trans.steps.mongodboutput.MongoDbOutput.java, the Pentaho Data Integration (Kettle) step that writes rows to a MongoDB collection.
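
Before the full listing, a minimal sketch (not part of the Pentaho source) may help orient the reader: the step ultimately drives two kinds of writes through the legacy 2.x MongoDB Java driver, batched inserts and upsert/modifier updates. The host, port, database, collection, and field names below are illustrative assumptions.

import java.util.ArrayList;
import java.util.List;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;

public class MongoWriteModesSketch {
    public static void main(String[] args) throws Exception {
        // assumed host/port and names; adjust for your environment
        MongoClient client = new MongoClient("localhost", 27017);
        DBCollection coll = client.getDB("testDb").getCollection("testColl");

        // mode 1: straight insert, buffered into batches (see m_batch/doBatch below)
        List<DBObject> batch = new ArrayList<DBObject>();
        batch.add(new BasicDBObject("name", "alice").append("age", 30));
        batch.add(new BasicDBObject("name", "bob").append("age", 25));
        coll.insert(batch);

        // mode 2: modifier update with upsert - update the matching document,
        // or insert one when nothing matches (see commitUpdate below)
        DBObject query = new BasicDBObject("name", "alice");
        DBObject modifier = new BasicDBObject("$set", new BasicDBObject("age", 31));
        coll.update(query, modifier, true /* upsert */, false /* multi */);

        client.close();
    }
}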

Source

/*!
 * Copyright 2010 - 2013 Pentaho Corporation.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.pentaho.di.trans.steps.mongodboutput;

import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.mongo.MongoDbException;

import com.mongodb.CommandResult;
import com.mongodb.DBObject;
import com.mongodb.MongoException;
import com.mongodb.ServerAddress;
import com.mongodb.WriteResult;
import org.pentaho.mongo.wrapper.MongoWrapperUtil;

/**
 * Class providing an output step for writing data to a MongoDB collection. Supports insert, truncate, upsert,
 * multi-update (update all matching docs) and modifier update (update only certain fields) operations. Can also create
 * and drop indexes based on one or more fields.
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 */
public class MongoDbOutput extends BaseStep implements StepInterface {
    private static Class<?> PKG = MongoDbOutputMeta.class;

    protected MongoDbOutputMeta m_meta;
    protected MongoDbOutputData m_data;

    protected MongoDbOutputData.MongoTopLevel m_mongoTopLevelStructure = MongoDbOutputData.MongoTopLevel.INCONSISTENT;

    /**
     * The batch size to use for insert operations
     */
    protected int m_batchInsertSize = 100;

    /**
     * Holds a batch of rows converted to documents
     */
    protected List<DBObject> m_batch;

    /**
     * Holds the original batch of rows (corresponding to the converted documents)
     */
    protected List<Object[]> m_batchRows;

    protected int m_writeRetries = MongoDbOutputMeta.RETRIES;
    protected int m_writeRetryDelay = MongoDbOutputMeta.RETRY_DELAY;

    public MongoDbOutput(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    }

    @Override
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {

        Object[] row = getRow();

        if (row == null) {
            // no more input rows

            // flush any remaining buffered documents
            if (m_batch != null && m_batch.size() > 0) {
                try {
                    doBatch();
                } catch (MongoDbException e) {
                    throw new KettleException(e);
                }
            }

            // INDEXING - http://www.mongodb.org/display/DOCS/Indexes
            // Indexing is computationally expensive - it needs to be
            // done after all data is inserted and done in the BACKGROUND.

            // UNIQUE indexes (prevent duplicates on the
            // keys in the index) and SPARSE indexes (don't index docs that
            // don't have the key field) - current limitation is that SPARSE
            // indexes can only have a single field

            List<MongoDbOutputMeta.MongoIndex> indexes = m_meta.getMongoIndexes();
            if (indexes != null && indexes.size() > 0) {
                logBasic(BaseMessages.getString(PKG, "MongoDbOutput.Messages.ApplyingIndexOpps")); //$NON-NLS-1$
                try {
                    m_data.applyIndexes(indexes, log, m_meta.getTruncate());
                } catch (MongoDbException e) {
                    throw new KettleException(e);
                }
            }

            disconnect();
            setOutputDone();
            return false;
        }

        if (first) {
            first = false;

            m_batchInsertSize = 100;

            String batchInsert = environmentSubstitute(m_meta.getBatchInsertSize());
            if (!Const.isEmpty(batchInsert)) {
                m_batchInsertSize = Integer.parseInt(batchInsert);
            }
            m_batch = new ArrayList<DBObject>(m_batchInsertSize);
            m_batchRows = new ArrayList<Object[]>();

            // output the same as the input
            m_data.setOutputRowMeta(getInputRowMeta());

            // scan for top-level JSON document insert and validate
            // field specification in this case.
            m_data.m_hasTopLevelJSONDocInsert = MongoDbOutputData
                    .scanForInsertTopLevelJSONDoc(m_meta.getMongoFields());

            // first, check our incoming fields against our metadata for the
            // fields to insert
            // these are the fields arriving at the step input
            RowMetaInterface rmi = getInputRowMeta();
            // these are the fields we are going to write to MongoDB
            List<MongoDbOutputMeta.MongoField> mongoFields = m_meta.getMongoFields();
            checkInputFieldsMatch(rmi, mongoFields);

            // copy and initialize mongo fields
            m_data.setMongoFields(m_meta.getMongoFields());
            m_data.init(MongoDbOutput.this);

            // check truncate
            if (m_meta.getTruncate()) {
                try {
                    logBasic(BaseMessages.getString(PKG, "MongoDbOutput.Messages.TruncatingCollection")); //$NON-NLS-1$
                    m_data.getCollection().remove();
                } catch (Exception m) {
                    disconnect();
                    throw new KettleException(m.getMessage(), m);
                }
            }
        }

        if (!isStopped()) {

            if (m_meta.getUpdate()) {
                DBObject updateQuery = MongoDbOutputData.getQueryObject(m_data.getMongoFields(), getInputRowMeta(),
                        row, MongoDbOutput.this, m_mongoTopLevelStructure);

                if (log.isDebug()) {
                    logDebug(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Debug.QueryForUpsert", //$NON-NLS-1$
                            updateQuery));
                }

                if (updateQuery != null) {
                    // i.e. we have some non-null incoming query field values
                    DBObject insertUpdate = null;

                    // get the record to update the match with
                    if (!m_meta.getModifierUpdate()) {
                        // complete record replace or insert

                        insertUpdate = MongoDbOutputData.kettleRowToMongo(m_data.getMongoFields(),
                                getInputRowMeta(), row, MongoDbOutput.this, m_mongoTopLevelStructure,
                                m_data.m_hasTopLevelJSONDocInsert);
                        if (log.isDebug()) {
                            logDebug(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Debug.InsertUpsertObject", //$NON-NLS-1$
                                    insertUpdate));
                        }

                    } else {

                        // specific field update (or insert)
                        try {
                            insertUpdate = m_data.getModifierUpdateObject(m_data.getMongoFields(),
                                    getInputRowMeta(), row, MongoDbOutput.this, m_mongoTopLevelStructure);
                        } catch (MongoDbException e) {
                            throw new KettleException(e);
                        }
                        if (log.isDebug()) {
                            logDebug(
                                    BaseMessages.getString(PKG, "MongoDbOutput.Messages.Debug.ModifierUpdateObject", //$NON-NLS-1$
                                            insertUpdate));
                        }
                    }

                    if (insertUpdate != null) {
                        commitUpdate(updateQuery, insertUpdate, row);
                    }
                }
            } else {
                // straight insert

                DBObject mongoInsert = MongoDbOutputData.kettleRowToMongo(m_data.getMongoFields(),
                        getInputRowMeta(), row, MongoDbOutput.this, m_mongoTopLevelStructure,
                        m_data.m_hasTopLevelJSONDocInsert);

                if (mongoInsert != null) {
                    m_batch.add(mongoInsert);
                    m_batchRows.add(row);
                }
                if (m_batch.size() == m_batchInsertSize) {
                    logDetailed(BaseMessages.getString(PKG, "MongoDbOutput.Messages.CommitingABatch")); //$NON-NLS-1$
                    try {
                        doBatch();
                    } catch (MongoDbException e) {
                        throw new KettleException(e);
                    }
                }
            }
        }

        return true;
    }

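    /**
     * Performs an update/upsert for a single row, retrying up to m_writeRetries
     * times with a delay of m_writeRetryDelay seconds between attempts. On final
     * failure the row is sent to the error stream when error handling is enabled;
     * otherwise a KettleException is thrown.
     */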
    protected void commitUpdate(DBObject updateQuery, DBObject insertUpdate, Object[] row) throws KettleException {

        int retries = 0;
        MongoException lastEx = null;

        while (retries <= m_writeRetries && !isStopped()) {
            WriteResult result = null;
            CommandResult cmd = null;
            try {
                // TODO It seems that doing an update() via a secondary node does not
                // generate any sort of exception or error result (at least with
                // driver version 2.11.1). The transformation completes successfully,
                // but no updates are made to the collection. This is unlike
                // insert(), which throws a MongoException if you are not talking to
                // the primary. So we need some logic to check whether or not the
                // connection configuration contains the primary of the replica set
                // and give feedback if it doesn't.
                try {
                    result = m_data.getCollection().update(updateQuery, insertUpdate, m_meta.getUpsert(),
                            m_meta.getMulti());
                } catch (MongoDbException e) {
                    throw new MongoException(e.getMessage(), e);
                }

                cmd = result.getLastError();
                if (cmd != null && !cmd.ok()) {
                    String message = cmd.getErrorMessage();
                    logError(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.MongoReported", message)); //$NON-NLS-1$

                    cmd.throwOnError();
                }
            } catch (MongoException me) {
                lastEx = me;
                retries++;
                if (retries <= m_writeRetries) {
                    logError(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.ErrorWritingToMongo", //$NON-NLS-1$
                            me.toString()));
                    logBasic(
                            BaseMessages.getString(PKG, "MongoDbOutput.Messages.Message.Retry", m_writeRetryDelay)); //$NON-NLS-1$
                    try {
                        Thread.sleep(m_writeRetryDelay * 1000);
                        // CHECKSTYLE:OFF
                    } catch (InterruptedException e) {
                        // CHECKSTYLE:ON
                    }
                }
            }

            if (cmd != null && cmd.ok()) {
                break;
            }
        }

        if ((retries > m_writeRetries || isStopped()) && lastEx != null) {

            // Send this one to the error stream if doing error handling
            if (getStepMeta().isDoingErrorHandling()) {
                putError(getInputRowMeta(), row, 1, lastEx.getMessage(), "", "MongoDbOutput");
            } else {
                throw new KettleException(lastEx);
            }
        }
    }

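    /**
     * Fallback used when a bulk insert fails: saves the current batch one document
     * at a time. On all but the last retry a failure re-throws after trimming the
     * already-saved documents from the batch; on the last retry a failing document
     * is routed to the error stream when error handling is enabled, otherwise the
     * unsaved remainder of the batch is preserved and the exception is re-thrown.
     */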
    protected CommandResult batchRetryUsingSave(boolean lastRetry)
            throws MongoException, KettleException, MongoDbException {
        WriteResult result = null;
        CommandResult cmd = null;
        int count = 0;
        logBasic(BaseMessages.getString(PKG, "MongoDbOutput.Messages.CurrentBatchSize", m_batch.size()));
        for (int i = 0, len = m_batch.size(); i < len; i++) {
            DBObject toTry = m_batch.get(i);
            Object[] correspondingRow = m_batchRows.get(i);
            try {
                result = m_data.getCollection().save(toTry);
                cmd = result.getLastError();

                if (cmd != null && !cmd.ok()) {
                    String message = cmd.getErrorMessage();
                    logError(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.MongoReported", message)); //$NON-NLS-1$

                    cmd.throwOnError();
                }

                count++;
            } catch (MongoException ex) {
                if (!lastRetry) {
                    logBasic(BaseMessages.getString(PKG, "MongoDbOutput.Messages.SuccessfullySavedXDocuments",
                            count));
                    m_batch = copyExceptFirst(count, m_batch);
                    m_batchRows = copyExceptFirst(count, m_batchRows);
                    throw ex;
                }

                // Send this one to the error stream if doing error handling
                if (getStepMeta().isDoingErrorHandling()) {
                    putError(getInputRowMeta(), correspondingRow, 1, ex.getMessage(), "", "MongoDbOutput");
                } else {
                    m_batch = copyExceptFirst(i + 1, m_batch);
                    m_batchRows = copyExceptFirst(i + 1, m_batchRows);
                    throw ex;
                }
            }
        }

        m_batch.clear();
        m_batchRows.clear();

        logBasic(BaseMessages.getString(PKG, "MongoDbOutput.Messages.SuccessfullySavedXDocuments", count));

        return cmd;
    }

    private static <T> List<T> copyExceptFirst(int amount, List<T> list) {
        return new ArrayList<T>(list.subList(amount, list.size()));
    }

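    /**
     * Flushes the current batch of documents: the first attempt is a single bulk
     * insert; subsequent attempts fall back to saving documents individually via
     * batchRetryUsingSave(), sleeping m_writeRetryDelay seconds between attempts,
     * for at most m_writeRetries retries.
     */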
    protected void doBatch() throws KettleException, MongoDbException {
        int retries = 0;
        MongoException lastEx = null;

        while (retries <= m_writeRetries && !isStopped()) {
            WriteResult result = null;
            CommandResult cmd = null;
            try {
                if (retries == 0) {
                    result = m_data.getCollection().insert(m_batch);
                    cmd = result.getLastError();

                    if (cmd != null && !cmd.ok()) {
                        String message = cmd.getErrorMessage();
                        logError(
                                BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.MongoReported", message)); //$NON-NLS-1$

                        cmd.throwOnError();
                    }
                } else {
                    // fall back to save
                    logBasic(BaseMessages.getString(PKG,
                            "MongoDbOutput.Messages.SavingIndividualDocsInCurrentBatch"));
                    cmd = batchRetryUsingSave(retries == m_writeRetries);
                }
            } catch (MongoException me) {
                // record and log the failure only when it is a non-timeout error on
                // the first attempt; a first-attempt timeout is retried silently
                boolean shouldNotBeAvoided = !isTimeoutException(me) && (retries == 0);
                if (shouldNotBeAvoided) {
                    lastEx = me;
                }
                retries++;
                if (retries <= m_writeRetries) {
                    if (shouldNotBeAvoided) {
                        // only a recordable failure is logged; either way the next
                        // attempt saves the batch elements individually so that no
                        // data is lost
                        logError(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.ErrorWritingToMongo", //$NON-NLS-1$
                                me.toString()));
                        logBasic(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Message.Retry", //$NON-NLS-1$
                                m_writeRetryDelay));
                    }
                    try {
                        Thread.sleep(m_writeRetryDelay * 1000);
                        // CHECKSTYLE:OFF
                    } catch (InterruptedException e) {
                        // CHECKSTYLE:ON
                    }
                }
                // throw new KettleException(me.getMessage(), me);
            }

            if (cmd != null) {
                ServerAddress s = cmd.getServerUsed();
                if (s != null) {
                    logDetailed(
                            BaseMessages.getString(PKG, "MongoDbOutput.Messages.WroteBatchToServer", s.toString())); //$NON-NLS-1$
                }
            }

            if (cmd != null && cmd.ok()) {
                break;
            }
        }

        if ((retries > m_writeRetries || isStopped()) && lastEx != null) {
            throw new KettleException(lastEx);
        }

        m_batch.clear();
        m_batchRows.clear();
    }

    private static boolean isTimeoutException(MongoException me) {
        return (me instanceof MongoException.Network) && (me.getCause() instanceof SocketTimeoutException);
    }

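    /**
     * Reads the write-retry settings, connects to MongoDB using the step
     * configuration, verifies that a database and collection are specified
     * (creating the collection if necessary), and validates the consistency of
     * the top-level document structure implied by the configured field paths.
     */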
    @Override
    public boolean init(StepMetaInterface stepMetaInterface, StepDataInterface stepDataInterface) {
        if (super.init(stepMetaInterface, stepDataInterface)) {
            m_meta = (MongoDbOutputMeta) stepMetaInterface;
            m_data = (MongoDbOutputData) stepDataInterface;

            if (!Const.isEmpty(m_meta.getWriteRetries())) {
                m_writeRetries = Const.toInt(m_meta.getWriteRetries(), MongoDbOutputMeta.RETRIES);
            }

            if (!Const.isEmpty(m_meta.getWriteRetryDelay())) {
                m_writeRetryDelay = Const.toInt(m_meta.getWriteRetryDelay(), MongoDbOutputMeta.RETRY_DELAY);
            }

            String hostname = environmentSubstitute(m_meta.getHostnames());
            int port = Const.toInt(environmentSubstitute(m_meta.getPort()), 27017);
            String db = environmentSubstitute(m_meta.getDbName());
            String collection = environmentSubstitute(m_meta.getCollection());

            try {
                if (Const.isEmpty(db)) {
                    throw new Exception(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.NoDBSpecified")); //$NON-NLS-1$
                }

                if (Const.isEmpty(collection)) {
                    throw new Exception(
                            BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.NoCollectionSpecified")); //$NON-NLS-1$
                }

                if (!Const.isEmpty(m_meta.getAuthenticationUser())) {
                    String authInfo = (m_meta.getUseKerberosAuthentication()
                            ? BaseMessages.getString(PKG, "MongoDbOutput.Message.KerberosAuthentication",
                                    environmentSubstitute(m_meta.getAuthenticationUser()))
                            : BaseMessages.getString(PKG, "MongoDbOutput.Message.NormalAuthentication",
                                    environmentSubstitute(m_meta.getAuthenticationUser())));

                    logBasic(authInfo);
                }
                m_data.setConnection(MongoWrapperUtil.createMongoClientWrapper(m_meta, this, log));

                if (Const.isEmpty(collection)) {
                    throw new KettleException(
                            BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.NoCollectionSpecified")); //$NON-NLS-1$
                }
                m_data.createCollection(db, collection);
                m_data.setCollection(m_data.getConnection().getCollection(db, collection));

                try {
                    m_mongoTopLevelStructure = MongoDbOutputData.checkTopLevelConsistency(m_meta.getMongoFields(),
                            MongoDbOutput.this);

                    if (m_mongoTopLevelStructure == MongoDbOutputData.MongoTopLevel.INCONSISTENT) {
                        logError(BaseMessages.getString(PKG,
                                "MongoDbOutput.Messages.Error.InconsistentMongoTopLevel")); //$NON-NLS-1$
                        return false;
                    }
                } catch (KettleException e) {
                    logError(e.getMessage());
                    return false;
                }

                return true;
            } catch (UnknownHostException ex) {
                logError(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.UnknownHost", hostname), ex); //$NON-NLS-1$
                return false;
            } catch (Exception e) {
                logError(BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.ProblemConnecting", hostname, "" //$NON-NLS-1$ //$NON-NLS-2$
                        + port), e);
                return false;
            }
        }

        return false;
    }

    protected void disconnect() {
        if (m_data != null) {
            try {
                m_data.getConnection().dispose();
            } catch (MongoDbException e) {
                log.logError(e.getMessage());
            }
        }
    }

    @Override
    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {
        disconnect();
        super.dispose(smi, sdi);
    }

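    /**
     * Verifies that every field named in the step metadata is present in the
     * incoming row meta, throwing a KettleException when any are missing or when
     * no incoming fields at all would be inserted; incoming fields that will not
     * be written to MongoDB are only logged.
     */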
    final void checkInputFieldsMatch(RowMetaInterface rmi, List<MongoDbOutputMeta.MongoField> mongoFields)
            throws KettleException {
        Set<String> expected = new HashSet<String>(mongoFields.size(), 1);
        Set<String> actual = new HashSet<String>(rmi.getFieldNames().length, 1);
        for (MongoDbOutputMeta.MongoField field : mongoFields) {
            String mongoMatch = environmentSubstitute(field.m_incomingFieldName);
            expected.add(mongoMatch);
        }
        for (int i = 0; i < rmi.size(); i++) {
            String metaFieldName = rmi.getValueMeta(i).getName();
            actual.add(metaFieldName);
        }

        // check that all expected fields are available in the step input meta
        if (!actual.containsAll(expected)) {
            // some expected fields were not found in the input step meta
            expected.removeAll(actual);
            StringBuilder b = new StringBuilder();
            for (String name : expected) {
                b.append("'").append(name).append("', ");
            }
            throw new KettleException(BaseMessages.getString(PKG,
                    "MongoDbOutput.Messages.MongoField.Error.FieldsNotFoundInMetadata", b.toString()));
        }

        boolean found = actual.removeAll(expected);
        if (!found) {
            throw new KettleException(
                    BaseMessages.getString(PKG, "MongoDbOutput.Messages.Error.NotInsertingAnyFields")); //$NON-NLS-1$
        }

        if (!actual.isEmpty()) {
            // we have some fields that will not be inserted.
            StringBuilder b = new StringBuilder();
            for (String name : actual) {
                b.append("'").append(name).append("', ");
            }
            // just put a log record on it
            logBasic(BaseMessages.getString(PKG, "MongoDbOutput.Messages.FieldsNotToBeInserted", b.toString()));
        }
    }
}
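
The most intricate part of the listing is the write-retry strategy shared by doBatch() and batchRetryUsingSave(): attempt one bulk insert, and on failure back off and fall back to saving documents individually, so a single bad document does not discard the whole batch. Here is a minimal, self-contained sketch of that pattern (not part of the Pentaho source) against the legacy 2.x MongoDB Java driver; the host, port, names, and retry delay are illustrative assumptions, not the step's actual configuration.

import java.util.ArrayList;
import java.util.List;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoException;

public class BatchRetrySketch {

    static final int RETRY_DELAY_SECONDS = 10; // plays the role of m_writeRetryDelay

    public static void main(String[] args) throws Exception {
        MongoClient client = new MongoClient("localhost", 27017); // assumed host/port
        DBCollection coll = client.getDB("testDb").getCollection("testColl"); // assumed names

        // build a batch of documents, as processRow() does into m_batch
        List<DBObject> batch = new ArrayList<DBObject>();
        for (int i = 0; i < 100; i++) {
            batch.add(new BasicDBObject("n", i));
        }

        try {
            coll.insert(batch); // first attempt: one bulk insert for the whole batch
        } catch (MongoException bulkFailure) {
            Thread.sleep(RETRY_DELAY_SECONDS * 1000L); // back off before retrying
            saveIndividually(coll, batch); // fall back to per-document saves
        }
        client.close();
    }

    // Save each document on its own so one failure does not discard the
    // documents that follow it in the batch.
    static void saveIndividually(DBCollection coll, List<DBObject> batch) {
        for (DBObject doc : batch) {
            try {
                // the 2.x driver assigns _id values client-side before the bulk
                // attempt, so save() upserts by _id rather than duplicating
                coll.save(doc);
            } catch (MongoException singleFailure) {
                System.err.println("Skipping document: " + singleFailure.getMessage());
            }
        }
    }
}

The real step layers two more concerns on top of this: a configurable retry count around the whole sequence, and Kettle error handling that can route individual failing rows to an error stream instead of aborting.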