cascading.jdbc.db.DBOutputFormat.java Source code

Java tutorial

Introduction

Here is the source code for cascading.jdbc.db.DBOutputFormat.java

Source

/*
 * Copyright (c) 2009 Concurrent, Inc.
 *
 * This work has been released into the public domain
 * by the copyright holder. This applies worldwide.
 *
 * In case this is not legally possible:
 * The copyright holder grants any entity the right
 * to use this work for any purpose, without any
 * conditions, unless such conditions are required by law.
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.jdbc.db;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;

/**
 * A OutputFormat that sends the reduce output to a SQL table.
 * <p/>
 * {@link DBOutputFormat} accepts &lt;key,value&gt; pairs, where
 * key has a type extending DBWritable. Returned {@link RecordWriter}
 * writes <b>only the key</b> to the database with a batch SQL query.
 */
public class DBOutputFormat<K extends DBWritable, V> implements OutputFormat<K, V> {
    private static final Log LOG = LogFactory.getLog(DBOutputFormat.class);

    /** A RecordWriter that writes the reduce output to a SQL table */
    protected class DBRecordWriter implements RecordWriter<K, V> {
        private Connection connection;
        private PreparedStatement insertStatement;
        private PreparedStatement updateStatement;
        private final int statementsBeforeExecute;

        private long statementsAdded = 0;
        private long insertStatementsCurrent = 0;
        private long updateStatementsCurrent = 0;

        protected DBRecordWriter(Connection connection, PreparedStatement insertStatement,
                PreparedStatement updateStatement, int statementsBeforeExecute) {
            this.connection = connection;
            this.insertStatement = insertStatement;
            this.updateStatement = updateStatement;
            this.statementsBeforeExecute = statementsBeforeExecute;
        }

        /** {@inheritDoc} */
        public void close(Reporter reporter) throws IOException {
            executeBatch();

            try {
                if (insertStatement != null)
                    insertStatement.close();

                if (updateStatement != null)
                    updateStatement.close();

                connection.commit();
            } catch (SQLException exception) {
                rollBack();

                createThrowMessage("unable to commit batch", 0, exception);
            } finally {
                try {
                    connection.close();
                } catch (SQLException exception) {
                    throw new IOException("unable to close connection", exception);
                }
            }
        }

        private void executeBatch() throws IOException {
            try {
                if (insertStatementsCurrent != 0) {
                    LOG.info("executing insert batch " + createBatchMessage(insertStatementsCurrent));

                    insertStatement.executeBatch();
                }

                insertStatementsCurrent = 0;
            } catch (SQLException exception) {
                rollBack();

                createThrowMessage("unable to execute insert batch", insertStatementsCurrent, exception);
            }

            try {
                if (updateStatementsCurrent != 0) {
                    LOG.info("executing update batch " + createBatchMessage(updateStatementsCurrent));

                    int[] result = updateStatement.executeBatch();

                    int count = 0;

                    for (int value : result)
                        count += value;

                    if (count != updateStatementsCurrent)
                        throw new IOException(
                                "update did not update same number of statements executed in batch, batch: "
                                        + updateStatementsCurrent + " updated: " + count);
                }

                updateStatementsCurrent = 0;
            } catch (SQLException exception) {
                rollBack();

                createThrowMessage("unable to execute update batch", updateStatementsCurrent, exception);
            }
        }

        private void rollBack() {
            try {
                connection.rollback();
            } catch (SQLException sqlException) {
                LOG.warn(StringUtils.stringifyException(sqlException));
            }
        }

        private String createBatchMessage(long currentStatements) {
            return String.format("[totstmts: %d][crntstmts: %d][batch: %d]", statementsAdded, currentStatements,
                    statementsBeforeExecute);
        }

        private void createThrowMessage(String stateMessage, long currentStatements, SQLException exception)
                throws IOException {
            String message = exception.getMessage();

            message = message.substring(0, Math.min(75, message.length()));

            int messageLength = exception.getMessage().length();
            String batchMessage = createBatchMessage(currentStatements);
            String template = "%s [msglength: %d]%s %s";
            String errorMessage = String.format(template, stateMessage, messageLength, batchMessage, message);

            LOG.error(errorMessage, exception.getNextException());

            throw new IOException(errorMessage, exception.getNextException());
        }

        /** {@inheritDoc} */
        public synchronized void write(K key, V value) throws IOException {
            try {
                if (value == null) {
                    key.write(insertStatement);
                    insertStatement.addBatch();
                    insertStatementsCurrent++;
                } else {
                    key.write(updateStatement);
                    updateStatement.addBatch();
                    updateStatementsCurrent++;
                }
            } catch (SQLException exception) {
                throw new IOException("unable to add batch statement", exception);
            }

            statementsAdded++;

            if (statementsAdded % statementsBeforeExecute == 0)
                executeBatch();
        }
    }

    /**
     * Constructs the query used as the prepared statement to insert data.
     *
     * @param table      the table to insert into
     * @param fieldNames the fields to insert into. If field names are unknown, supply an
     *                   array of nulls.
     */
    protected String constructInsertQuery(String table, String[] fieldNames) {
        if (fieldNames == null)
            throw new IllegalArgumentException("Field names may not be null");

        StringBuilder query = new StringBuilder();

        query.append("INSERT INTO ").append(table);

        if (fieldNames.length > 0 && fieldNames[0] != null) {
            query.append(" (");

            for (int i = 0; i < fieldNames.length; i++) {
                query.append(fieldNames[i]);

                if (i != fieldNames.length - 1)
                    query.append(",");
            }

            query.append(")");

        }

        query.append(" VALUES (");

        for (int i = 0; i < fieldNames.length; i++) {
            query.append("?");

            if (i != fieldNames.length - 1)
                query.append(",");
        }

        query.append(");");

        return query.toString();
    }

    protected String constructUpdateQuery(String table, String[] fieldNames, String[] updateNames) {
        if (fieldNames == null)
            throw new IllegalArgumentException("field names may not be null");

        Set<String> updateNamesSet = new HashSet<String>();
        Collections.addAll(updateNamesSet, updateNames);

        StringBuilder query = new StringBuilder();

        query.append("UPDATE ").append(table);

        query.append(" SET ");

        if (fieldNames.length > 0 && fieldNames[0] != null) {
            int count = 0;

            for (int i = 0; i < fieldNames.length; i++) {
                if (updateNamesSet.contains(fieldNames[i]))
                    continue;

                if (count != 0)
                    query.append(",");

                query.append(fieldNames[i]);
                query.append(" = ?");

                count++;
            }
        }

        query.append(" WHERE ");

        if (updateNames.length > 0 && updateNames[0] != null) {
            for (int i = 0; i < updateNames.length; i++) {
                query.append(updateNames[i]);
                query.append(" = ?");

                if (i != updateNames.length - 1)
                    query.append(" and ");
            }
        }

        query.append(";");

        return query.toString();
    }

    /** {@inheritDoc} */
    public void checkOutputSpecs(FileSystem filesystem, JobConf job) throws IOException {
    }

    /** {@inheritDoc} */
    public RecordWriter<K, V> getRecordWriter(FileSystem filesystem, JobConf job, String name,
            Progressable progress) throws IOException {
        DBConfiguration dbConf = new DBConfiguration(job);

        String tableName = dbConf.getOutputTableName();
        String[] fieldNames = dbConf.getOutputFieldNames();
        String[] updateNames = dbConf.getOutputUpdateFieldNames();
        int batchStatements = dbConf.getBatchStatementsNum();

        Connection connection = dbConf.getConnection();

        configureConnection(connection);

        String sqlInsert = constructInsertQuery(tableName, fieldNames);
        PreparedStatement insertPreparedStatement;

        try {
            insertPreparedStatement = connection.prepareStatement(sqlInsert);
            insertPreparedStatement.setEscapeProcessing(true); // should be on by default
        } catch (SQLException exception) {
            throw new IOException("unable to create statement for: " + sqlInsert, exception);
        }

        String sqlUpdate = updateNames != null ? constructUpdateQuery(tableName, fieldNames, updateNames) : null;
        PreparedStatement updatePreparedStatement = null;

        try {
            updatePreparedStatement = sqlUpdate != null ? connection.prepareStatement(sqlUpdate) : null;
        } catch (SQLException exception) {
            throw new IOException("unable to create statement for: " + sqlUpdate, exception);
        }

        return new DBRecordWriter(connection, insertPreparedStatement, updatePreparedStatement, batchStatements);
    }

    protected void configureConnection(Connection connection) {
        setAutoCommit(connection);
    }

    protected void setAutoCommit(Connection connection) {
        try {
            connection.setAutoCommit(false);
        } catch (Exception exception) {
            throw new RuntimeException("unable to set auto commit", exception);
        }
    }

    /**
     * Initializes the reduce-part of the job with the appropriate output settings
     *
     * @param job                 The job
     * @param dbOutputFormatClass
     * @param tableName           The table to insert data into
     * @param fieldNames          The field names in the table. If unknown, supply the appropriate
     */
    public static void setOutput(JobConf job, Class<? extends DBOutputFormat> dbOutputFormatClass, String tableName,
            String[] fieldNames, String[] updateFields, int batchSize) {
        if (dbOutputFormatClass == null)
            job.setOutputFormat(DBOutputFormat.class);
        else
            job.setOutputFormat(dbOutputFormatClass);

        // writing doesn't always happen in reduce
        job.setReduceSpeculativeExecution(false);
        job.setMapSpeculativeExecution(false);

        DBConfiguration dbConf = new DBConfiguration(job);

        dbConf.setOutputTableName(tableName);
        dbConf.setOutputFieldNames(fieldNames);

        if (updateFields != null)
            dbConf.setOutputUpdateFieldNames(updateFields);

        if (batchSize != -1)
            dbConf.setBatchStatementsNum(batchSize);
    }
}