oracle.kv.hadoop.table.TableInputFormatBase.java Source code

Introduction

Here is the source code for oracle.kv.hadoop.table.TableInputFormatBase.java

Source

/*-
 * Copyright (C) 2011, 2017 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.hadoop.table;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import oracle.kv.Consistency;
import oracle.kv.Direction;
import oracle.kv.KVSecurityConstants;
import oracle.kv.KVStoreException;
import oracle.kv.ParamConstant;
import oracle.kv.PasswordCredentials;
import oracle.kv.impl.security.PasswordManager;
import oracle.kv.impl.security.PasswordStore;
import oracle.kv.impl.security.login.LoginManager;
import oracle.kv.impl.security.util.KVStoreLogin;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.topo.split.SplitBuilder;
import oracle.kv.impl.topo.split.TopoSplit;
import oracle.kv.impl.util.ExternalDataSourceUtils;
import oracle.kv.impl.util.TopologyLocator;
import oracle.kv.impl.util.registry.ClientSocketFactory;
import oracle.kv.impl.util.registry.RegistryUtils;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

/**
 * This is the base class for Oracle NoSQL Database InputFormat classes that
 * can be used to run MapReduce against data stored via the Table API.
 * Keys are of type PrimaryKey. Values are always of type Row.
 * <p>
 * Parameters may be passed using either the static setters on this class or
 * through the Hadoop JobContext configuration parameters. The following
 * parameters are recognized:
 * <ul>
 *
 * <li><code>oracle.kv.kvstore</code> - the KV Store name for this InputFormat
 * to operate on. This is equivalent to the {@link #setKVStoreName} method.
 *
 * <li><code>oracle.kv.hosts</code> - one or more <code>hostname:port</code>
 * pairs separated by commas naming hosts in the KV Store. This is equivalent
 * to the {@link #setKVHelperHosts} method.
 *
 * <li><code>oracle.kv.hadoop.hosts</code> - one or more <code>hostname</code>
 * strings separated by commas naming the Hadoop data node hosts in the
 * Hadoop cluster that will support MapReduce jobs and/or service Hive
 * queries. This is equivalent to the {@link #setKVHadoopHosts} method.
 * The value(s) specified by this property will be returned by the
 * <code>getLocations</code> method of <code>TableInputSplit</code>. If this
 * property is not specified, or if the {@link #setKVHadoopHosts} method is
 * not called, then the values specified via the <code>oracle.kv.hosts</code>
 * property (or the {@link #setKVHelperHosts} method) will be used instead.
 *
 * <li><code>oracle.kv.tableName</code> - the name of the table in the
 * store from which data will be retrieved. This is equivalent
 * to the {@link #setTableName} method.
 *
 * <li><code>oracle.kv.primaryKey</code> - Property whose value consists of
 * the components to use when constructing the key to employ when iterating
 * the table. The format of this property's value must be a list of name:value
 * pairs in JSON FORMAT like the following:
 * <code>
 *   -Doracle.kv.primaryKey="{\"name\":\"stringVal\",\"name\":floatVal}"
 * </code>
 * where the list itself is enclosed in un-escaped double quotes and
 * curly braces, and each field name component -- as well as each STRING
 * type field value component -- is enclosed in ESCAPED double quotes.
 * <p>
 * In addition to the JSON format requirement above, the values referenced by
 * the various fieldValue components of this Property must satisfy the
 * semantics of PrimaryKey for the given table; that is, they must represent
 * a first-to-last subset of the table's primary key fields, and they must be
 * specified in the same order as those primary key fields. If the components
 * of this property do not satisfy these requirements, a full primary key
 * wildcard will be used when iterating the table.
 * <p>
 * This is equivalent to the {@link #setPrimaryKeyProperty} method.
 *
 * <li><code>oracle.kv.fieldRange</code> - Property whose value consists of
 * the components to use when constructing the field range to employ when
 * iterating the table. The format of this property's value must be a list
 * of name:value pairs in JSON FORMAT like the following:
 * <code>
 *   -Doracle.kv.fieldRange="{\"name\":\"fieldName\",
 *      \"start\":\"startVal\",[\"startInclusive\":true|false],
 *      \"end\"\"endVal\",[\"endInclusive\":true|false]}"
 * </code>
 * where, for the given field over which to range, the 'start' and 'end'
 * components are required, and the 'startInclusive' and 'endInclusive'
 * components are optional, defaulting to 'true' if not included. Note
 * that the list itself is enclosed in un-escaped double quotes and
 * curly braces, and each name component and string type value
 * component is enclosed in ESCAPED double quotes.
 * <p>
 * In addition to the JSON format requirement above, the values referenced
 * by the components of this Property's value must also satisfy the semantics
 * of FieldRange; that is,
 * <ul>
 *   <li>the values associated with the target key must correspond to a
 *       valid primary key in the table
 *   <li>the value associated with the fieldName must be the name of a valid
 *       field of the primary key over which iteration will be performed
 *   <li>the values associated with the start and end of the range must
 *       correspond to valid values of the given fieldName
 *   <li>the value associated with either of the inclusive components
 *       must be either 'true' or 'false'
 * </ul>
 * If the components of this property do not satisfy these requirements, then
 * table iteration will be performed over the full range of values of the
 * PrimaryKey rather than a sub-range.
 * <p>
 * This is equivalent to the {@link #setFieldRangeProperty} method.
 *
 * <li><code>oracle.kv.consistency</code> - Specifies the read consistency
 * associated with the lookup of the child KV pairs.  Version- and Time-based
 * consistency may not be used.  If null, the default consistency is used.
 * <p>
 * This is equivalent to the {@link #setConsistency} method.
 *
 * <li><code>oracle.kv.timeout</code> - Specifies an upper bound on the time
 * interval for processing a particular KV retrieval.  A best effort is made to
 * not exceed the specified limit. If zero, the default request timeout is
 * used. This value is always in milliseconds.
 * <p>
 * This is equivalent to the {@link #setTimeout} and {@link #setTimeoutUnit}
 * methods.
 *
 * <li><code>oracle.kv.maxRequests</code> - Specifies the maximum number of
 * client side threads to use when running an iteration; where a value of 1
 * causes the iteration to be performed using only the current thread, and a
 * value of 0 causes the client to base the number of threads to employ on
 * the current store topology.
 * <p>
 * This is equivalent to the {@link #setMaxRequests} method.
 *
 * <li><code>oracle.kv.batchSize</code> - Specifies the suggested number of
 * keys to fetch during each network round trip by the InputFormat.  If 0, an
 * internally determined default is used. This is equivalent to the {@link
 * #setBatchSize} method.
 *
 * <li><code>oracle.kv.maxBatches</code> - Specifies the maximum number of
 * result batches that can be held in memory on the client side before
 * processing on the server side pauses. This parameter can be used to prevent
 * the client side memory from being exceeded if the client cannot consume
 * results as fast as they are generated by the server side.
 * <p>
 * This is equivalent to the {@link #setMaxBatches} method.
 *
 * </ul>
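 *
 * <p>
 * For example, a MapReduce driver might supply these parameters
 * programmatically on the job's <code>Configuration</code> before the job
 * is submitted. The following is only a minimal sketch, where
 * <code>job</code> is the <code>org.apache.hadoop.mapreduce.Job</code>
 * being configured, and the store name, helper host, and table name are
 * hypothetical values:
 * <pre>
 *   Configuration conf = job.getConfiguration();
 *   conf.set("oracle.kv.kvstore", "examplestore");    // hypothetical store name
 *   conf.set("oracle.kv.hosts", "kvhost01:5000");     // hypothetical helper host
 *   conf.set("oracle.kv.tableName", "exampleTable");  // hypothetical table name
 *   conf.set("oracle.kv.batchSize", "0");             // 0 selects the internal default
 * </pre>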
 *
 * <p>
 * Internally, the TableInputFormatBase class utilizes the method
 * <code>
 *  oracle.kv.table.TableIterator<oracle.kv.table.Row> TableAPI.tableIterator
 * </code>
 * to retrieve records. You should refer to the javadoc for that method
 * for information about the various parameters.
 * <p>
 *
 * <code>TableInputFormatBase</code> dynamically generates a number of
 * splits, each encapsulating a list of sets in which the elements of each
 * set are partition ids that can be retrieved in parallel, in order to
 * optimize retrieval performance. The "size" of each split that is
 * generated -- which will be the value returned by the <code>getLength</code>
 * method of <code>TableInputSplit</code> -- is the number of partition id
 * sets in that encapsulated list. If the consistency passed to
 * <code>TableInputFormatBase</code> is {@link Consistency#NONE_REQUIRED
 * NONE_REQUIRED} (the default), then {@link InputSplit#getLocations
 * InputSplit.getLocations()} will return an array of the names of the
 * master and the replica(s) which contain the partition.
 * Alternatively, if the consistency is {@link
 * Consistency#NONE_REQUIRED_NO_MASTER NONE_REQUIRED_NO_MASTER}, then
 * the array returned will contain only the names of the replica(s);
 * not the master.  Finally, if the consistency is {@link
 * Consistency#ABSOLUTE ABSOLUTE}, then the array returned will
 * contain only the name of the master.  This means that if Hadoop job
 * trackers are running on the nodes named in the returned
 * <code>location</code> array, Hadoop will generally attempt to run
 * the subtasks for a particular partition on those nodes where the
 * data is stored and replicated.  Hadoop and Oracle NoSQL DB
 * administrators should be careful about co-location of Oracle NoSQL
 * DB and Hadoop processes since they may compete for resources.
 *
 * <p>
 * {@link InputSplit#getLength InputSplit.getLength()} always returns 1.
 * <p>
 *
 * A simple example demonstrating the Oracle NoSQL DB Hadoop
 * <code>oracle.kv.hadoop.table.TableInputFormat</code> class reading
 * data from Hadoop in a MapReduce job and counting the number of rows
 * in a given table in the store can be found in the
 * <code>KVHOME/examples/hadoop/table</code> directory.  The javadoc
 * for that program describes the simple MapReduce processing as well as
 * how to invoke the program in Hadoop.
 * <p>
 * @since 3.1
 */
abstract class TableInputFormatBase<K, V> extends InputFormat<K, V> {

    private static final Log LOG = LogFactory.getLog("oracle.kv.hadoop.table.TableInputFormatBase");

    private static final String FILE_SEP = System.getProperty("file.separator");
    private static final String USER_SECURITY_DIR = System.getProperty("user.dir") + FILE_SEP
            + "TABLE_INPUT_FORMAT_SECURITY_DIR";

    /*
     * Static fields are used to support the MapReduce programming model;
     * where the MapReduce job is initialized with this class, and the
     * static setter methods of this class are used to initialize the
     * fields below for use in the job.
     */
    private static String kvStoreName = null;
    private static String[] kvHelperHosts = null;
    private static String[] kvHadoopHosts = null;
    private static String tableName = null;

    private static String primaryKeyProperty = null;

    /* For MultiRowOptions */
    private static String fieldRangeProperty = null;

    /* For TableIteratorOptions */
    private static Direction direction = Direction.UNORDERED;
    private static Consistency consistency = null;
    private static long timeout = 0;
    private static TimeUnit timeoutUnit = TimeUnit.MILLISECONDS;
    private static int maxRequests = 0;
    private static int batchSize = 0;
    private static int maxBatches = 0;

    /* Used by getSplits to initialize the splits that are created. */
    private static String loginFlnm = null;
    private static PasswordCredentials passwordCredentials = null;
    private static String trustFlnm = null;
    /* Used by getSplits to contact secure store locally. */
    private static String localLoginFile = null;

    private static int queryBy = TableInputSplit.QUERY_BY_PRIMARY_ALL_PARTITIONS;
    private static String whereClause = null;
    private static Integer shardKeyPartitionId = null;

    /**
     * @hidden
     */
    protected TableInputFormatBase() {
    }

    /**
     * @hidden
     * Logically split the set of input data for the job.
     *
     * @param context job configuration.
     *
     * @return an array of {@link InputSplit}s for the job.
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

        if (context != null) {
            final Configuration conf = context.getConfiguration();
            initializeParameters(conf);
        }

        if (kvStoreName == null) {
            throw new IllegalArgumentException(
                    "No KV Store Name provided. Use either the " + ParamConstant.KVSTORE_NAME.getName()
                            + " parameter or call " + TableInputFormatBase.class.getName() + ".setKVStoreName().");
        }

        if (kvHelperHosts == null) {
            throw new IllegalArgumentException("No KV Helper Hosts were provided. Use either the "
                    + ParamConstant.KVSTORE_NODES.getName() + " parameter or call "
                    + TableInputFormatBase.class.getName() + ".setKVHelperHosts().");
        }

        if (kvHadoopHosts == null) {
            kvHadoopHosts = new String[kvHelperHosts.length];
            for (int i = 0; i < kvHelperHosts.length; i++) {
                /* Strip off the ':port' suffix */
                final String[] hostPort = (kvHelperHosts[i]).trim().split(":");
                kvHadoopHosts[i] = hostPort[0];
            }
        }

        if (tableName == null) {
            throw new IllegalArgumentException(
                    "No Table Name provided. Use either the " + ParamConstant.TABLE_NAME.getName()
                            + " parameter or call " + TableInputFormatBase.class.getName() + ".setTableName().");
        }

        final String userName = (passwordCredentials == null ? null : passwordCredentials.getUsername());
        final KVStoreLogin storeLogin = new KVStoreLogin(userName, localLoginFile);
        storeLogin.loadSecurityProperties();
        storeLogin.prepareRegistryCSF();
        LoginManager loginMgr = null;

        if (storeLogin.foundSSLTransport()) {
            loginMgr = KVStoreLogin.getRepNodeLoginMgr(kvHelperHosts, passwordCredentials, kvStoreName);
        }

        /*
         * Retrieve the topology of the store.
         *
         * Note that the same Hive CLI session may be used to run queries
         * against different KVStores, where one store is non-secure and the
         * other is secure. If the most recent call to this method invoked
         * the code below to retrieve the topology from the secure store,
         * then the security information is stored in the system properties
         * and in the state of the splits, and the client socket factory
         * used to communicate with the RMI registry while retrieving the
         * topology is configured for SSL communication. As
         * a result, if the current call to this method invokes the code below
         * to retrieve the topology of the non-secure store, and if the client
         * socket factory is not reconfigured for non-SSL communication, then
         * a KVServerException (wrapping a java.rmi.ConnectIOException) will
         * be encountered. To address this, KVStoreException is caught, the
         * client socket factory is reconfigured for non-SSL communication,
         * and the attempt to retrieve the topology is retried with no
         * security information.
         *
         * If both secure and non-secure attempts fail, then the stack trace
         * is sent to both the DataNode's stderr log file and the Hive CLI
         * display screen.
         */
        Topology topology;
        try {
            topology = TopologyLocator.get(kvHelperHosts, 0, loginMgr, kvStoreName);
        } catch (KVStoreException e) {

            if (passwordCredentials != null) {

                /* Retry with no security */
                LOG.debug("Failure on topology retrieval: attempt to "
                        + "communicate with RMI registry over SSL unsuccessful. "
                        + "Changing from SSLClientSocketFactory to " + "ClientSocketFactory and retrying ...");

                ClientSocketFactory.setRMIPolicy(null, kvStoreName);
                RegistryUtils.initRegistryCSF();
                try {
                    topology = TopologyLocator.get(kvHelperHosts, 0, null, kvStoreName);
                } catch (KVStoreException e1) {
                    e1.printStackTrace(); /* Send to DataNode's stderr file. */
                    throw new IOException(e1); /* Send to Hive CLI. */
                }

            } else {
                e.printStackTrace(); /* Send to DataNode's stderr file. */
                throw new IOException(e); /* Send to Hive CLI. */
            }
        }

        /* Create splits based on the store's partitions or its shards. */
        final List<TopoSplitWrapper> splits = getSplitInfo(topology, consistency, queryBy, shardKeyPartitionId);

        final List<InputSplit> ret = new ArrayList<InputSplit>(splits.size());
        for (TopoSplitWrapper ts : splits) {

            final TableInputSplit split = new TableInputSplit();

            split.setKVStoreName(kvStoreName);
            split.setKVHelperHosts(kvHelperHosts);
            split.setLocations(kvHadoopHosts);
            split.setTableName(tableName);
            split.setKVStoreSecurity(loginFlnm, passwordCredentials, trustFlnm);
            split.setPrimaryKeyProperty(primaryKeyProperty);

            /* For MultiRowOptions */
            split.setFieldRangeProperty(fieldRangeProperty);

            /* For TableIteratorOptions */
            split.setDirection(direction);
            split.setConsistency(consistency);
            split.setTimeout(timeout);
            split.setTimeoutUnit(timeoutUnit);
            split.setMaxRequests(maxRequests);
            split.setBatchSize(batchSize);
            split.setMaxBatches(maxBatches);

            split.setPartitionSets(ts.getPartitionSets());
            split.setQueryInfo(queryBy, whereClause);
            split.setShardSet(ts.getShardSet());

            ret.add(split);
        }
        return ret;
    }

    /**
     * Set the KV Store name for this InputFormat to operate on. This is
     * equivalent to passing the <code>oracle.kv.kvstore</code> Hadoop
     * property.
     *
     * @param newStoreName the new KV Store name to set
     */
    public static void setKVStoreName(String newStoreName) {
        TableInputFormatBase.kvStoreName = newStoreName;
    }

    /**
     * Set the KV Helper host:port pair(s) for this InputFormat to operate on.
     * This is equivalent to passing the <code>oracle.kv.hosts</code> Hadoop
     * property.
     *
     * @param newHelperHosts array of hostname:port strings of any hosts
     * in the KV Store.
     */
    public static void setKVHelperHosts(String[] newHelperHosts) {
        TableInputFormatBase.kvHelperHosts = newHelperHosts;
    }

    /**
     * Set the KV Hadoop data node host name(s) for this InputFormat
     * to operate on. This is equivalent to passing the
     * <code>oracle.kv.hadoop.hosts</code> property.
     *
     * @param newHadoopHosts array of hostname strings corresponding to the
     * names of the Hadoop data node hosts in the Hadoop cluster that this
     * InputFormat will use to support MapReduce jobs and/or service Hive
     * queries.
     */
    public static void setKVHadoopHosts(String[] newHadoopHosts) {
        TableInputFormatBase.kvHadoopHosts = newHadoopHosts;
    }

    /**
     * Set the name of the table in the KV store that this InputFormat
     * will operate on. This is equivalent to passing the
     * <code>oracle.kv.tableName</code> property.
     *
     * @param newTableName the new table name to set.
     */
    public static void setTableName(String newTableName) {
        TableInputFormatBase.tableName = newTableName;
    }

    /**
     * Sets the String to use for the property value whose contents are used
     * to construct the primary key to employ when iterating the table. The
     * format of the String input to this method must be a comma-separated
     * String of the form:
     * <code>
     *   fieldName,fieldValue,fieldType,fieldName,fieldValue,fieldType,..
     * </code>
     * where the number of elements separated by commas must be a multiple
     * of 3, and each fieldType must be 'STRING', 'INTEGER', 'LONG', 'FLOAT',
     * 'DOUBLE', or 'BOOLEAN'. Additionally, the values referenced by the
     * various fieldType and fieldValue components of this String must
     * satisfy the semantics of PrimaryKey for the given table; that is,
     * they must represent a first-to-last subset of the table's primary
     * key fields, and they must be specified in the same order as those
     * primary key fields. If the String referenced by this property
     * does not satisfy these requirements, a full primary key wildcard
     * will be used when iterating the table.
     * <p>
     * This is equivalent to passing the <code>oracle.kv.primaryKey</code>
     * Hadoop property.
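     * <p>
     * For example, for a hypothetical table whose first primary key field
     * is a STRING field named <code>type</code>, the property might be set
     * as follows, using the comma-separated format described above (the
     * field name and value are illustrative only):
     * <pre>
     *   TableInputFormat.setPrimaryKeyProperty("type,truck,STRING");
     * </pre>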
     *
     * @param newProperty the new primary key property to set
     */
    public static void setPrimaryKeyProperty(String newProperty) {
        TableInputFormatBase.primaryKeyProperty = newProperty;
    }

    /* Methods related to MultiRowOptions */

    /**
     * Sets the String to use for the property value whose contents are used
     * to construct the field range to employ when iterating the table. The
     * format of this property's value must be a list of name:value pairs in
     * JSON FORMAT like the following:
     * <code>
     *   -Doracle.kv.fieldRange="{\"name\":\"fieldName\",
     *      \"start\":\"startVal\",[\"startInclusive\":true|false],
     *      \"end\"\"endVal\",[\"endInclusive\":true|false]}"
     * </code>
     * where, for the given field over which to range, the 'start' and 'end'
     * components are required, and the 'startInclusive' and 'endInclusive'
     * components are optional, defaulting to 'true' if not included. Note
     * that the list itself is enclosed in un-escaped double quotes and
     * curly braces, and each name component and string type value
     * component is enclosed in ESCAPED double quotes.
     * <p>
     * In addition to the JSON format requirement above, the values referenced
     * by the components of this Property's value must also satisfy the
     * semantics of FieldRange; that is,
     * <ul>
     *   <li>the values associated with the target key must correspond to a
     *       valid primary key in the table
     *   <li>the value associated with the fieldName must be the name of a
     *       valid field of the primary key over which iteration will be
     *       performed
     *   <li>the values associated with the start and end of the range must
     *       correspond to valid values of the given fieldName
     *   <li>the value associated with either of the inclusive components
     *       must be either 'true' or 'false'
     * </ul>
     * If the components of this property do not satisfy these requirements,
     * then table iteration will be performed over the full range of values
     * of the PrimaryKey rather than a sub-range.
     * <p>
     * This is equivalent to passing the <code>oracle.kv.fieldRange</code>
     * Hadoop property.
     *
     * @param newProperty the new field range property to set
     */
    public static void setFieldRangeProperty(String newProperty) {
        TableInputFormatBase.fieldRangeProperty = newProperty;
    }

    /* Methods related to TableIteratorOptions */

    /**
     * Specifies the order in which records are returned by the InputFormat.
     * Note that when doing PrimaryKey iteration, only Direction.UNORDERED
     * is allowed.
     *
     * @param newDirection the direction to retrieve data
     */
    public static void setDirection(Direction newDirection) {
        TableInputFormatBase.direction = newDirection;
    }

    /**
     * Specifies the read consistency associated with the lookup of the child
     * KV pairs.  Version- and Time-based consistency may not be used.  If
     * null, the default consistency is used.  This is equivalent to passing
     * the <code>oracle.kv.consistency</code> Hadoop property.
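     * <p>
     * For example, a job whose reads can tolerate eventual consistency
     * might call:
     * <pre>
     *   TableInputFormat.setConsistency(Consistency.NONE_REQUIRED);
     * </pre>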
     *
     * @param consistency the consistency
     */
    @SuppressWarnings("deprecation")
    public static void setConsistency(Consistency consistency) {
        if (consistency == Consistency.ABSOLUTE || consistency == Consistency.NONE_REQUIRED_NO_MASTER
                || consistency == Consistency.NONE_REQUIRED || consistency == null) {
            TableInputFormatBase.consistency = consistency;
        } else {
            throw new IllegalArgumentException(
                    "Consistency may only be ABSOLUTE, " + "NONE_REQUIRED_NO_MASTER, or NONE_REQUIRED");
        }
    }

    /**
     * Specifies an upper bound on the time interval for processing a
     * particular KV retrieval.  A best effort is made to not exceed the
     * specified limit.  If zero, the default request timeout is used.  This is
     * equivalent to passing the <code>oracle.kv.timeout</code> Hadoop
     * property.
     *
     * @param timeout the timeout
     */
    public static void setTimeout(long timeout) {
        TableInputFormatBase.timeout = timeout;
    }

    /**
     * Specifies the unit of the timeout parameter.  It may be null only if
     * timeout is zero.  This is equivalent to passing the
     * <code>oracle.kv.timeout</code> Hadoop property.
     *
     * @param timeoutUnit the timeout unit
     */
    public static void setTimeoutUnit(TimeUnit timeoutUnit) {
        TableInputFormatBase.timeoutUnit = timeoutUnit;
    }

    /**
     * Specifies the maximum number of client side threads to use when running
     * an iteration; where a value of 1 causes the iteration to be performed
     * using only the current thread, and a value of 0 causes the client to
     * base the number of threads to employ on the current store topology.
     * <p>
     * This is equivalent to passing the <code>oracle.kv.maxRequests</code>
     * Hadoop property.
     *
     * @param newMaxRequests the maximum number of client side threads to
     * employ when running an iteration.
     */
    public static void setMaxRequests(int newMaxRequests) {
        TableInputFormatBase.maxRequests = newMaxRequests;
    }

    /**
     * Specifies the suggested number of keys to fetch during each network
     * round trip by the InputFormat.  If 0, an internally determined default
     * is used.  This is equivalent to passing the
     * <code>oracle.kv.batchSize</code> Hadoop property.
     *
     * @param batchSize the suggested number of keys to fetch during each
     * network round trip.
     */
    public static void setBatchSize(int batchSize) {
        TableInputFormatBase.batchSize = batchSize;
    }

    /**
     * Specifies the maximum number of result batches that can be held in
     * memory on the client side before processing on the server side
     * pauses. This parameter can be used to prevent the client side memory
     * from being exceeded if the client cannot consume results as fast as
     * they are generated by the server side.
     * <p>
     * This is equivalent to passing the <code>oracle.kv.maxBatches</code>
     * Hadoop property.
     *
     * @param newMaxBatches the maximum number of result batches that can be
     * held in client side memory.
     */
    public static void setMaxBatches(int newMaxBatches) {
        TableInputFormatBase.maxBatches = newMaxBatches;
    }

    /**
     * Sets the login properties file and the public trust file (keys
     * and/or certificates), as well as the <code>PasswordCredentials</code>
     * for authentication. The value of the <code>loginFile</code> and
     * <code>trustFile</code> parameters must be either a fully qualified
     * path referencing a file located on the local file system, or the
     * name of a file (no path) whose contents can be retrieved as a
     * resource from the current VM's classpath.
     * <p>
     * Note that this class provides the <code>getSplits</code> method;
     * which must be able to contact a secure store, and so will need
     * access to local copies of the login properties and trust files.
     * As a result, if the values input for the <code>loginFile</code> and
     * <code>trustFile</code> parameters are simple file names rather
     * than fully qualified paths, this method will retrieve the contents
     * of each from the classpath and generate private, local copies of
     * the associated file for availability to the <code>getSplits</code>
     * method.
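     * <p>
     * The following is a minimal sketch of configuring security for a job;
     * the file names, user name, and password are hypothetical:
     * <pre>
     *   char[] pwd = "examplePassword".toCharArray(); // normally read from a wallet or password file
     *   TableInputFormat.setKVSecurity(
     *       "client.security",                            // login properties file (classpath resource)
     *       new PasswordCredentials("exampleUser", pwd),
     *       "client.trust");                              // trust file (classpath resource)
     * </pre>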
     */
    public static void setKVSecurity(final String loginFile, final PasswordCredentials userPasswordCredentials,
            final String trustFile) throws IOException {

        setLocalKVSecurity(loginFile, userPasswordCredentials, trustFile);
    }

    private void initializeParameters(Configuration conf) throws IOException {
        /*
         * Must always reinitialize all fields of this class; because there
         * are use cases in which the field values of this class change over
         * time. For example, if this class is employed as the InputFormat
         * for a Hive session, then each Hive query can/may specify a
         * different tableName and/or kvStoreName and/or kvHelperHosts.
         * If these values are not reinitialized on each call to this
         * method, then because the fields are static, the values from the
         * previous query will be incorrectly used for the current query;
         * which can result in errors or incorrect results.
         *
         * On the other hand, for a basic MapReduce job, the values of the
         * static fields are set once (via either command line arguments or
         * a system property), when the job is initiated. In that case, the
         * system property will not survive the serialization/deserialization
         * process and thus will return null below. For the purposes of this
         * method then, a null system property is taken to be an indication
         * that this class is being employed in a MapReduce job initiated
         * from somewhere other than a Hive query (e.g. the command line).
         * And it is then assumed that the static fields processed by this
         * method must have been set during job initiation and that each
         * field's value survived its journey from the job's client side to
         * the MapReduce server side.
         */
        if (conf != null) {

            final String kvStoreNameProp = conf.get(ParamConstant.KVSTORE_NAME.getName());
            if (kvStoreNameProp != null) {
                kvStoreName = kvStoreNameProp;
            }

            final String helperHosts = conf.get(ParamConstant.KVSTORE_NODES.getName());
            if (helperHosts != null) {
                kvHelperHosts = helperHosts.trim().split(",");
            }

            final String hadoopHosts = conf.get(ParamConstant.KVHADOOP_NODES.getName());
            if (hadoopHosts != null) {
                kvHadoopHosts = hadoopHosts.trim().split(",");
            } else {
                if (kvHelperHosts != null) {
                    kvHadoopHosts = new String[kvHelperHosts.length];
                    for (int i = 0; i < kvHelperHosts.length; i++) {
                        /* Strip off the ':port' suffix */
                        final String[] hostPort = (kvHelperHosts[i]).trim().split(":");
                        kvHadoopHosts[i] = hostPort[0];
                    }
                }
            }

            final String tableNameProp = conf.get(ParamConstant.TABLE_NAME.getName());
            if (tableNameProp != null) {
                tableName = tableNameProp;
            }

            final String primaryKeyProp = conf.get(ParamConstant.PRIMARY_KEY.getName());
            if (primaryKeyProp != null) {
                primaryKeyProperty = primaryKeyProp;
            }

            /* For MultiRowOptions. */
            final String fieldRangeProp = conf.get(ParamConstant.FIELD_RANGE.getName());
            if (fieldRangeProp != null) {
                fieldRangeProperty = fieldRangeProp;
            }

            /*
             * For TableIteratorOptions. Note that when doing PrimaryKey
             * iteration, Direction must be UNORDERED.
             */

            final String consistencyStr = conf.get(ParamConstant.CONSISTENCY.getName());
            if (consistencyStr != null) {
                consistency = ExternalDataSourceUtils.parseConsistency(consistencyStr);
            }

            final String timeoutParamName = ParamConstant.TIMEOUT.getName();
            final String timeoutStr = conf.get(timeoutParamName);
            if (timeoutStr != null) {
                timeout = ExternalDataSourceUtils.parseTimeout(timeoutStr);
                timeoutUnit = TimeUnit.MILLISECONDS;
            }

            final String maxRequestsStr = conf.get(ParamConstant.MAX_REQUESTS.getName());
            if (maxRequestsStr != null) {
                try {
                    maxRequests = Integer.parseInt(maxRequestsStr);
                } catch (NumberFormatException NFE) {
                    throw new IllegalArgumentException(
                            "Invalid value for " + ParamConstant.MAX_REQUESTS.getName() + ": " + maxRequestsStr);
                }
            }

            final String batchSizeStr = conf.get(ParamConstant.BATCH_SIZE.getName());
            if (batchSizeStr != null) {
                try {
                    batchSize = Integer.parseInt(batchSizeStr);
                } catch (NumberFormatException NFE) {
                    throw new IllegalArgumentException(
                            "Invalid value for " + ParamConstant.BATCH_SIZE.getName() + ": " + batchSizeStr);
                }
            }

            final String maxBatchesStr = conf.get(ParamConstant.MAX_BATCHES.getName());
            if (maxBatchesStr != null) {
                try {
                    maxBatches = Integer.parseInt(maxBatchesStr);
                } catch (NumberFormatException NFE) {
                    throw new IllegalArgumentException(
                            "Invalid value for " + ParamConstant.MAX_BATCHES.getName() + ": " + maxBatchesStr);
                }
            }

            /* Handle the properties related to security. */
            final String loginFile = conf.get(KVSecurityConstants.SECURITY_FILE_PROPERTY);
            final String trustFile = conf.get(KVSecurityConstants.SSL_TRUSTSTORE_FILE_PROPERTY);
            final String username = conf.get(KVSecurityConstants.AUTH_USERNAME_PROPERTY);
            final String passwordStr = conf.get(ParamConstant.AUTH_USER_PWD_PROPERTY.getName());

            /* Create the PasswordCredentials needed to contact the store. */
            PasswordCredentials passwordCreds = null;
            if (username != null && passwordStr != null) {
                final char[] userPassword = passwordStr.toCharArray();
                passwordCreds = new PasswordCredentials(username, userPassword);
            }

            if (passwordCreds == null) {
                String passwordLoc = conf.get(KVSecurityConstants.AUTH_WALLET_PROPERTY);
                PasswordManager storeMgr = null;
                if (passwordLoc != null) {

                    /* Retrieve the password from the given wallet. */
                    final File walletDirFd = new File(passwordLoc);
                    try {
                        storeMgr = PasswordManager.load(PasswordManager.WALLET_MANAGER_CLASS);
                    } catch (Exception e) {
                        e.printStackTrace(); /* Send to DataNode stderr file */
                        throw new IOException(e); /* Send to Hive CLI. */
                    }
                    final PasswordStore fileStore = storeMgr.getStoreHandle(walletDirFd);
                    fileStore.open(null);
                    final Collection<String> secretAliases = fileStore.getSecretAliases();
                    final Iterator<String> aliasItr = secretAliases.iterator();
                    final char[] userPassword = (aliasItr.hasNext() ? fileStore.getSecret(aliasItr.next()) : null);
                    if (username != null) {
                        passwordCreds = new PasswordCredentials(username, userPassword);
                    }
                    fileStore.discard();
                } else {
                    passwordLoc = conf.get(KVSecurityConstants.AUTH_PWDFILE_PROPERTY);
                    if (passwordLoc != null) {

                        /* Retrieve password from the given password file. */
                        final File passwordFileFd = new File(passwordLoc);
                        try {
                            storeMgr = PasswordManager.load(PasswordManager.FILE_STORE_MANAGER_CLASS);
                        } catch (Exception e) {
                            e.printStackTrace(); /* Send to DataNode stderr. */
                            throw new IOException(e); /* Send to Hive CLI. */
                        }
                        final PasswordStore fileStore = storeMgr.getStoreHandle(passwordFileFd);
                        fileStore.open(null);
                        final Collection<String> secretAliases = fileStore.getSecretAliases();
                        final Iterator<String> aliasItr = secretAliases.iterator();
                        final char[] userPassword = (aliasItr.hasNext() ? fileStore.getSecret(aliasItr.next())
                                : null);
                        if (username != null) {
                            passwordCreds = new PasswordCredentials(username, userPassword);
                        }
                        fileStore.discard();
                    }
                }
            }
            setLocalKVSecurity(loginFile, passwordCreds, trustFile);
        }
    }

    /**
     * Set/create the artifacts required to connect to and interact
     * with a secure store; specifically, a login properties file,
     * a trust file containing public keys and/or certificates, and
     * <code>PasswordCredentials</code>. If the value input for the
     * <code>loginFile</code> and <code>trustFile</code> parameter
     * is a fully-qualified path, then this method initializes the
     * corresponding static variables to those values so that the
     * <code>getSplits</code> method can contact a secure store, extracts
     * the filenames from those paths, uses those values to initialize
     * the corresponding static filename variables used to initialize
     * the splits that are created, and returns.
     * <p>
     * If the value input for the <code>loginFile</code> and
     * <code>trustFile</code> parameter is not a fully-qualified path,
     * then this method uses the given file names to retrieve the contents
     * of the associated login file and trust file as resources from
     * the classpath, and writes that information to corresponding files
     * on the local file system (in a directory owned by the user under
     * which the application is executed). After generating the local
     * files, the fully-qualified paths to those files are used to
     * initialize the corresponding static variables so that the
     * <code>getSplits</code> method can contact a secure store.
     */
    private static void setLocalKVSecurity(final String loginFile,
            final PasswordCredentials userPasswordCredentials, final String trustFile) throws IOException {

        if (loginFile == null) {
            return;
        }

        if (userPasswordCredentials == null) {
            return;
        }

        if (trustFile == null) {
            return;
        }

        final File loginFd = new File(loginFile);
        boolean loginIsAbsolute = false;
        if (loginFd.isAbsolute()) {
            loginIsAbsolute = true;
            TableInputFormatBase.localLoginFile = loginFile;
            TableInputFormatBase.loginFlnm = loginFd.getName();
        } else {
            TableInputFormatBase.loginFlnm = loginFile;
        }

        TableInputFormatBase.passwordCredentials = userPasswordCredentials;

        final File trustFd = new File(trustFile);
        boolean trustIsAbsolute = false;
        if (trustFd.isAbsolute()) {
            trustIsAbsolute = true;
            TableInputFormatBase.trustFlnm = trustFd.getName();
        } else {
            TableInputFormatBase.trustFlnm = trustFile;
        }

        if (loginIsAbsolute && trustIsAbsolute) {
            return;
        }

        /*
         * If loginFile and/or trustFile is a filename and not an absolute
         * path, then generate local versions of the file.
         */
        final File userSecurityDirFd = new File(USER_SECURITY_DIR);
        if (!userSecurityDirFd.exists()) {
            if (!userSecurityDirFd.mkdirs()) {
                throw new IOException("failed to create " + userSecurityDirFd);
            }
        }

        final ClassLoader cl = TableInputFormatBase.class.getClassLoader();

        if (!loginIsAbsolute) {

            InputStream loginStream = null;
            if (cl != null) {
                loginStream = cl.getResourceAsStream(loginFlnm);
            } else {
                loginStream = ClassLoader.getSystemResourceAsStream(loginFlnm);
            }

            /*
             * Retrieve the login configuration as a resource from the
             * classpath, and write that information to the user's local
             * file system. But exclude any properties related to user
             * authorization; that is, exclude all properties of the form
             * 'oracle.kv.auth.*", to prevent those property values from
             * being sent and cached on the DataNodes.
             */
            final Properties loginProps = new Properties();
            if (loginStream != null) {
                loginProps.load(loginStream);
            }

            /* Exclude 'oracle.kv.auth.*" properties. */
            loginProps.remove(KVSecurityConstants.AUTH_USERNAME_PROPERTY);
            loginProps.remove(KVSecurityConstants.AUTH_WALLET_PROPERTY);
            loginProps.remove(KVSecurityConstants.AUTH_PWDFILE_PROPERTY);

            /* Strip off the path of the trust file. */
            final String trustProp = loginProps.getProperty(KVSecurityConstants.SSL_TRUSTSTORE_FILE_PROPERTY);
            if (trustProp != null) {
                final File trustPropFd = new File(trustProp);
                if (!trustPropFd.exists()) {
                    loginProps.setProperty(KVSecurityConstants.SSL_TRUSTSTORE_FILE_PROPERTY, trustPropFd.getName());
                }
            }

            final File absoluteLoginFd = new File(USER_SECURITY_DIR + FILE_SEP + loginFlnm);
            final FileOutputStream loginFos = new FileOutputStream(absoluteLoginFd);
            try {
                loginProps.store(loginFos, null);
            } finally {
                loginFos.close();
            }

            TableInputFormatBase.localLoginFile = absoluteLoginFd.toString();
        }

        if (!trustIsAbsolute) {

            InputStream trustStream = null;
            if (cl != null) {
                trustStream = cl.getResourceAsStream(trustFlnm);
            } else {
                trustStream = ClassLoader.getSystemResourceAsStream(trustFlnm);
            }

            /*
             * Retrieve the trust credentials as a resource from the classpath,
             * and write that information to the user's local file system.
             */
            final File absoluteTrustFd = new File(USER_SECURITY_DIR + FILE_SEP + trustFlnm);
            final FileOutputStream trustFlnmFos = new FileOutputStream(absoluteTrustFd);

            try {
                int nextByte = trustStream.read();
                while (nextByte != -1) {
                    trustFlnmFos.write(nextByte);
                    nextByte = trustStream.read();
                }
            } finally {
                trustFlnmFos.close();
            }
        }
    }

    public void setQueryInfo(final int newQueryBy, final String newWhereClause, final Integer newPartitionId) {

        queryBy = newQueryBy;
        whereClause = newWhereClause;
        shardKeyPartitionId = newPartitionId;
    }

    /**
     * Convenience method that returns a list whose elements encapsulate
     * the information needed to create the necessary splits; based on
     * whether a TableScan (partition based) or an IndexScan (shard based)
     * will be used to satisfy the read request (query).
     */
    private List<TopoSplitWrapper> getSplitInfo(final Topology topology, final Consistency readConsistency,
            final int whereQueryBy, final Integer singlePartitionId) {

        final List<TopoSplitWrapper> retList = new ArrayList<TopoSplitWrapper>();

        if (topology == null) {
            return retList;
        }

        /* Determine how the splits should be generated. */
        boolean buildSplits = false;
        boolean singleSplit = false;
        int singleId = 1;

        switch (whereQueryBy) {

        case TableInputSplit.QUERY_BY_PRIMARY_ALL_PARTITIONS:
        case TableInputSplit.QUERY_BY_ONQL_ALL_PARTITIONS:

            buildSplits = true;
            break;

        case TableInputSplit.QUERY_BY_PRIMARY_SINGLE_PARTITION:
        case TableInputSplit.QUERY_BY_ONQL_SINGLE_PARTITION:

            if (singlePartitionId == null) {
                buildSplits = true;
            } else {
                singleSplit = true;
                singleId = singlePartitionId.intValue();
            }
            break;

        default:

            /* Skip partition based splits for shard based splits. */
            break;
        }

        /*
         * If a table scan (ALL_PARTITIONS) will be used when pushing the
         * predicate, then the SplitBuilder is first used to compute disjoint
         * subsets of the store's partitions, wrapped in a TopoSplit class.
         * Then a TopoSplitWrapper is used to map each of those partition
         * subsets to the shards corresponding to the partitions of the
         * subset; so that one split per partition subset is created.
         */
        if (buildSplits) {

            final SplitBuilder sb = new SplitBuilder(topology);
            final List<TopoSplit> topoSplits = sb.createShardSplits(readConsistency);

            for (TopoSplit topoSplit : topoSplits) {
                final Set<RepGroupId> shardSet = new HashSet<RepGroupId>();
                final List<Set<Integer>> partitionSets = topoSplit.getPartitionSets();
                for (Set<Integer> partitionIds : partitionSets) {
                    for (Integer pId : partitionIds) {
                        final PartitionId partitionId = new PartitionId(pId);
                        final RepGroupId repGroupId = topology.getRepGroupId(partitionId);
                        shardSet.add(repGroupId);
                    }
                }
                retList.add(new TopoSplitWrapper(topoSplit, shardSet));
            }
            return retList;
        }

        /* For executing the query against a SINGLE_PARTITION. */
        if (singleSplit) {

            final Set<Integer> partitionSet = new HashSet<Integer>();
            partitionSet.add(singlePartitionId);

            final Set<RepGroupId> shardSet = new HashSet<RepGroupId>();
            shardSet.add(topology.getRepGroupId(new PartitionId(singleId)));

            retList.add(new TopoSplitWrapper(new TopoSplit(0, partitionSet), shardSet));

            return retList;
        }

        /*
         * Either QUERY_BY_PRIMARY_ALL_SHARDS or QUERY_BY_ONQL_ALL_SHARDS
         * remains. For either case, since an index is involved, simply
         * return the store's shards; so that one split per shard is
         * created.
         */
        final Set<RepGroupId> shardIds = topology.getRepGroupIds();
        for (RepGroupId shardId : shardIds) {
            final Set<RepGroupId> shardSet = new HashSet<RepGroupId>();
            shardSet.add(shardId);
            retList.add(new TopoSplitWrapper(null, shardSet));
        }
        return retList;
    }

    /**
     * Convenience class that wraps a TopoSplit containing a subset of the
     * store's partitions and a set containing one of the store's shards.
     */
    private static class TopoSplitWrapper {

        private final TopoSplit topoSplit;
        private final Set<RepGroupId> shardSet;

        TopoSplitWrapper(TopoSplit topoSplit, Set<RepGroupId> shardSet) {
            this.topoSplit = topoSplit;
            this.shardSet = shardSet;
        }

        List<Set<Integer>> getPartitionSets() {
            if (topoSplit != null) {
                return topoSplit.getPartitionSets();
            }
            /* Avoid NPE in write method during split serialization. */
            return Collections.emptyList();
        }

        Set<RepGroupId> getShardSet() {
            if (shardSet != null) {
                return shardSet;
            }
            /* Avoid NPE in write method during split serialization. */
            return Collections.emptySet();
        }
    }
}
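
Example

The class javadoc above mentions a row-counting MapReduce program shipped under KVHOME/examples/hadoop/table. The following is a simplified, hypothetical driver in the same spirit; it is not that shipped example. It assumes that the concrete public subclass oracle.kv.hadoop.table.TableInputFormat delivers oracle.kv.table.PrimaryKey keys and oracle.kv.table.Row values, as described in the javadoc, and the store name, helper host, table name, and output path used below are made-up values.

import java.io.IOException;

import oracle.kv.hadoop.table.TableInputFormat;
import oracle.kv.table.PrimaryKey;
import oracle.kv.table.Row;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/* Hypothetical driver that counts the rows of one table in the store. */
public class CountTableRows {

    /* Emits one ("count", 1) pair for every Row delivered by the InputFormat. */
    public static class RowCountMapper
            extends Mapper<PrimaryKey, Row, Text, LongWritable> {

        private static final Text COUNT_KEY = new Text("count");
        private static final LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(PrimaryKey key, Row value, Context context)
                throws IOException, InterruptedException {
            context.write(COUNT_KEY, ONE);
        }
    }

    /* Sums the per-row counts into a single total. */
    public static class RowCountReducer
            extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                              Context context)
                throws IOException, InterruptedException {
            long total = 0;
            for (LongWritable v : values) {
                total += v.get();
            }
            context.write(key, new LongWritable(total));
        }
    }

    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final Job job = Job.getInstance(conf, "count-table-rows");
        job.setJarByClass(CountTableRows.class);

        /* Hypothetical store, helper host, and table; substitute real values. */
        TableInputFormat.setKVStoreName("examplestore");
        TableInputFormat.setKVHelperHosts(new String[] { "kvhost01:5000" });
        TableInputFormat.setTableName("exampleTable");

        job.setInputFormatClass(TableInputFormat.class);
        job.setMapperClass(RowCountMapper.class);
        job.setReducerClass(RowCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setNumReduceTasks(1);

        /* Hypothetical HDFS output directory; it must not already exist. */
        FileOutputFormat.setOutputPath(job, new Path("/tmp/count-table-rows"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

A driver along these lines is normally packaged into a jar and launched with the hadoop jar command, with the Oracle NoSQL Database client library (typically kvclient.jar) available on the classpath of both the driver and the MapReduce tasks.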