hadoop.table.CountTableRows.java Source code

Java tutorial

Introduction

Here is the source code for hadoop.table.CountTableRows.java

Source

/*-
 *
 *  This file is part of Oracle NoSQL Database
 *  Copyright (C) 2011, 2015 Oracle and/or its affiliates.  All rights reserved.
 *
 *  Oracle NoSQL Database is free software: you can redistribute it and/or
 *  modify it under the terms of the GNU Affero General Public License
 *  as published by the Free Software Foundation, version 3.
 *
 *  Oracle NoSQL Database is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public
 *  License in the LICENSE file along with Oracle NoSQL Database.  If not,
 *  see <http://www.gnu.org/licenses/>.
 *
 *  An active Oracle commercial licensing agreement for this product
 *  supersedes this license.
 *
 *  For more information please contact:
 *
 *  Vice President Legal, Development
 *  Oracle America, Inc.
 *  5OP-10
 *  500 Oracle Parkway
 *  Redwood Shores, CA 94065
 *
 *  or
 *
 *  berkeleydb-info_us@oracle.com
 *
 *
 */

package hadoop.table;

import java.io.IOException;
import java.util.List;
import java.util.logging.Logger;

import oracle.kv.ParamConstant;
import oracle.kv.hadoop.table.TableInputFormat;
import oracle.kv.table.PrimaryKey;
import oracle.kv.table.Row;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * A basic example demonstrating how to use the class
 * <a href="../../../javadoc/oracle/kv/hadoop/table/TableInputFormat.html"><b><code>oracle.kv.hadoop.table.TableInputFormat</code></b></a> to access the
 * rows of a table in an Oracle NoSQL Database from within a Hadoop
 * MapReduce job for the purpose of counting the number of records
 * in the table.
 * <p>
 * The <code>map</code> function is passed the PrimaryKey and Row for each
 * record in the KV Store and outputs key/value pairs containing the component
 * field names of the PrimaryKey as the output key and a value of 1. The
 * reduce phase then uses the output of the map phase to count the number
 * of records in the table. This MapReduce task is similar to the ubiquitous
 * Hadoop MapReduce WordCount example.
 * <p>
 * The <code>TableInputFormat</code> and related classes are located in the
 * <code>lib/kvclient.jar</code> file, so kvclient.jar must be included in
 * the Hadoop <em>libjars</em> classpath at runtime. Additionally, this class
 * and any dependent classes must be placed in a separate JAR for inclusion
 * in the classpath of the application itself. Finally, if this example is to
 * be run against a secure store, then a third JAR file must be generated that
 * contains the user's login file, credentials, and trust store. For the
 * details on how to run this example class, both in a secure and a non-secure
 * mode, refer to the information presented in the
 * <a href="./package-summary.html#count_table_rows_example"><b><code>package summary</code></b></a> for this example.
 */
public class CountTableRows extends Configured implements Tool {

    private static final Class<?> THIS_CLASS = CountTableRows.class;
    private static final String THIS_CLASS_NAME = THIS_CLASS.getName();

    /*
     * The logger employed by the frontend client of the MapReduce job
     * initiated by this example class (the class that encapsulates the
     * MapReduce job's Mapper task and Reducer task). Generally, for
     * example purposes, one would configure this logger to write its
     * output to the console. Compare this logger with the logger defined
     * below, in the Map nested class.
     */
    private static Logger logger;

    /**
     * Nested class used to map input key/value pairs to a set of
     * intermediate key/value pairs in preparation for the reduce phase
     * of the associated MapReduce job. The Hadoop MapReduce framework
     * spawns one task -- running an instance of this class -- for each
 * <a href="../../../javadoc/oracle/kv/hadoop/table/TableInputSplit.html"><b><code>TableInputSplit</code></b></a>
     * generated by the
     * <a href="../../../javadoc/oracle/kv/hadoop/table/TableInputFormat.html"><b><code>TableInputFormat</code></b></a> employed by the job.
     */
    public static class Map extends Mapper<PrimaryKey, Row, Text, IntWritable> {

        /*
         * Logger for logging output from each Mapper task of the MapReduce
         * job that is executed. This logger must be different from the logger
         * employed by the job's frontend client class (the encapsulating
         * class of this Map task); not only because the logger in the
         * encapsulating class is not in the scope of this class, but also
         * because this logger is an Apache Commons logger, whereas the
         * logger employed by the encapsulating class is a java.util.logging
         * logger.
         *
         * Note that because the Mapper task(s) run on the Hadoop cluster's
         * DataNodes, the output of this logger is typically written to the
         * file system of the DataNode on which the associated Mapper task
         * executes. For example, if the Hadoop DataNodes are configured
         * to log to the directory /var/log/hadoop/node_manager/logs, then
         * under that directory, one would see additional directories and
         * files of the form:<br>
         * <code>
         *   /var/log/hadoop/node_manager/logs
         *     /application_1409172332346_0088
         *       /container_1409172332346_0088_01_000005
         *         /stderr
         *         /stdout
         *         /syslog
         * </code><br>
         * where the output of this logger is written to the syslog file;
         * and output for calls to System.err.println and System.out.println
         * is written to the stderr and stdout files respectively.
         */
        private static final Log MAP_TASK_LOGGER = LogFactory.getLog(THIS_CLASS_NAME);

        private Text word = new Text();
        private static final IntWritable one = new IntWritable(1);

        @Override
        public void map(PrimaryKey keyArg, Row valueArg, Context context) throws IOException, InterruptedException {

            /*
             * keyArg is a NoSQL Database PrimaryKey.
             *
             * The output key is a Text object consisting of the component
             * field names of the PrimaryKey, concatenated and delimited
             * by "/"; the output value is the integer 1.
             */
            final List<String> keyFieldComponents = keyArg.getFields();
            MAP_TASK_LOGGER.info("PrimaryKey field components: " + keyFieldComponents);
            final StringBuilder wordBuf = new StringBuilder();
            for (String keyFieldComponent : keyFieldComponents) {
                wordBuf.append("/" + keyFieldComponent);
            }
            word.set(wordBuf.toString());
            MAP_TASK_LOGGER.info("MapReduce key: " + word);
            context.write(word, one);
        }
    }
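
    /*
     * Illustration: because every row of a given table shares the same set
     * of primary key field names, each call to map() above emits the same
     * MapReduce key. For a hypothetical table whose primary key is composed
     * of the fields "type" and "make", a table containing 79 rows would
     * therefore reduce to a single output line of the form:
     *
     *   /type/make   79
     */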

    /**
     * Nested class used to reduce the set of intermediate values sharing a
     * key, produced by the map phase of the job, to a smaller set of values;
     * in this example, the sum of the 1s emitted for each key.
     */
    public static class Reduce extends IntSumReducer<Text> {
    }

    @Override
    @SuppressWarnings("deprecation")
    public int run(String[] args) throws Exception {

        final Configuration conf = getConf();

        /*
         * Instantiate the logger using the configuration info (if any)
         * specified on the command line. The 'hadoop' command line
         * interpreter places the values of the desired system properties
         * in the Configuration object above, not the system properties on
         * the JVM. As a result, because the logger retrieves its config
         * from the JVM's system properties (not the MapReduce Configuration),
         * the desired logger can be instantiated only after the necessary
         * logger config values are retrieved from the MapReduce config
         * and placed in the system properties of the JVM.
         *
         * Note that once the logger configuration system properties are set
         * on the JVM, they are available to all other loggers that may be
         * created by classes instantiated in the JVM; for example, any
         * utility classes employed by this class.
         */
        if (conf != null) {
            final String loggingConfig = conf.get("java.util.logging.config.file");
            if (loggingConfig != null) {
                System.setProperty("java.util.logging.config.file", loggingConfig);
            }

            final String loggingFormat = conf.get("java.util.logging.SimpleFormatter.format");
            if (loggingFormat != null) {
                System.setProperty("java.util.logging.SimpleFormatter.format", loggingFormat);
            }
        }
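
        /*
         * For example, the logging configuration could be supplied as a
         * generic option when the job is submitted (the path below is a
         * placeholder):
         *
         *   -Djava.util.logging.config.file=/path/to/logging.properties
         *
         * ToolRunner's GenericOptionsParser copies such -D settings into the
         * job Configuration rather than into the JVM's system properties,
         * which is why they are copied into the System properties above.
         */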

        /* Since the JVM properties are set, the logger can now be created. */
        logger = Logger.getLogger(THIS_CLASS_NAME);

        final Job job = new Job(conf);
        job.setJarByClass(CountTableRows.class);
        job.setJobName("Count Table Rows");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TableInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        TableInputFormat.setKVStoreName(args[0]);
        TableInputFormat.setKVHelperHosts(new String[] { args[1] });
        TableInputFormat.setTableName(args[2]);

        FileOutputFormat.setOutputPath(job, new Path(args[3]));

        /*
         * Handle the optional client-side security information. If
         * accessing a secure store, then this information must be
         * specified using the last 2 arguments input on the command
         * line; where,
         * args[4] = name of the client side login file
         * args[5] = name of the server side login file
         */
        if (args.length >= 6) {
            logger.info("using a SECURE KVStore [" + args[4] + ", " + args[5] + "]");
            KVSecurityUtil.createLocalKVSecurity(args[4], args[5]);
            TableInputFormat.setKVSecurity(KVSecurityUtil.getServerLoginFlnm(),
                    KVSecurityUtil.getPasswordCredentials(), KVSecurityUtil.getClientTrustFlnm());
        } else {
            logger.info("using a NON-SECURE KVStore");
        }

        if (conf != null) {
            final String primaryKeyProperty = conf.get(ParamConstant.PRIMARY_KEY.getName());
            if (primaryKeyProperty != null) {
                TableInputFormat.setPrimaryKeyProperty(primaryKeyProperty);
            }
        }

        final boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {

        final int ret = ToolRunner.run(new CountTableRows(), args);
        System.exit(ret);
    }
}
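
Example usage

The following is a minimal sketch, not part of the original example, showing
how the job defined above could be launched programmatically instead of
through the 'hadoop' command-line interpreter. The store name, helper host,
table name, and output directory are placeholder values; the argument order
follows the run() method above (an optional fifth and sixth argument would
name the client-side and server-side login files when accessing a secure
store).

package hadoop.table;

import org.apache.hadoop.util.ToolRunner;

public class CountTableRowsLauncher {

    public static void main(String[] args) throws Exception {

        /* Placeholder values; substitute those of the target store. */
        final String[] jobArgs = {
            "kvstore",                     /* KVStore name */
            "kvhost01:5000",               /* helper host:port */
            "vehicleTable",                /* table name */
            "/user/example/count-output"   /* HDFS output directory */
        };

        /* Delegate to the Tool implementation defined in CountTableRows. */
        System.exit(ToolRunner.run(new CountTableRows(), jobArgs));
    }
}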