com.cloudera.accumulo.upgrade.compatibility.DataCompatibilityLoad.java Source code

Introduction

Here is the source code for com.cloudera.accumulo.upgrade.compatibility.DataCompatibilityLoad.java
Source

/*
 * Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.accumulo.upgrade.compatibility;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import static org.apache.accumulo.core.conf.Property.*;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.admin.TableOperations;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.ColumnVisibility;

import com.cloudera.accumulo.upgrade.util.ConnectionCli;
import com.cloudera.accumulo.upgrade.util.MapreduceOutputCli;
import com.cloudera.accumulo.upgrade.util.Cli;
import com.cloudera.accumulo.upgrade.util.VisibilityCli;

import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Longs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Counter;

import org.apache.log4j.Logger;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;

/**
 * MapReduce job to load tables in a variety of Accumulo serializations.
 *
 * If you enable LZO, you must have the codec for Hadoop properly installed.
 *
 * Creates several tables, flexing Accumulo's serialization options:
 *
 * <ul>
 *   <li>Bloom filter on, uncompressed, block cache on
 *   <li>Bloom filter on, snappy compressed, block cache off
 *   <li>Bloom filter off, snappy compressed, block cache on
 *   <li>Bloom filter off, gz compressed, block cache off
 *   <li><em>optionally</em> Bloom filter off, lzo compressed, block cache off
 * </ul>
 *
 * Then writes a user specified amount of data into each of said tables.
 *
 * Data is written in the form
 *
 * |  row            |   CF    |  CQ                   |          Vis                      |   timestamp   | Value      |
 * | reverse task id |  [A-Z]  |  text one up counter  |  user specified (default blank)   |   now()       |  sha1(key) |
 *
 * With an optional visibility specified by the user.
 *
 * @see DataCompatiblityVerify
 * @see TaskID
 */
public class DataCompatibilityLoad extends Configured implements Tool {

    private static final Logger log = Logger.getLogger(DataCompatibilityLoad.class);

    protected static final String OUTPUT_TABLE_NAMES = DataCompatibilityLoad.class.getName()
            + ".output-table-names";
    protected static final String VISIBILITY = DataCompatibilityLoad.class.getName() + ".visibility";
    protected static final String[] FAMILIES = { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
            "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" };

    /**
     * Mapper that outputs directly to all of the configured tables.
     */
    public static class DataLoadMapper extends Mapper<Text, LongWritable, Text, Mutation> {

        private final Text row = new Text();
        private ColumnVisibility visibility;
        private final Text qualifier = new Text();
        private final MessageDigest digest;
        private final Value value = new Value();
        private final Text out = new Text();
        private Collection<String> tables;

        {
            MessageDigest temp = null;
            try {
                temp = MessageDigest.getInstance("SHA-1");
            } catch (NoSuchAlgorithmException exception) {
                throw new RuntimeException("Your JVM doesn't have a SHA1 implementation, which we need to run. :(",
                        exception);
            } finally {
                digest = temp;
            }
        }

        private Counter rowCells;

        @Override
        public void map(Text family, LongWritable numCells, Context context)
                throws IOException, InterruptedException {
            final Counter familyCells = context.getCounter(DataLoadMapper.class.getName(),
                    "family_" + family.toString());
            final Mutation mutation = new Mutation(row);
            for (long i = 0; i < numCells.get(); i++) {
                long timestamp = System.currentTimeMillis();
                qualifier.set(Long.toString(i));
                digest.reset();
                digest.update(mutation.getRow());
                digest.update(family.getBytes(), 0, family.getLength());
                digest.update(qualifier.getBytes(), 0, qualifier.getLength());
                digest.update(visibility.getExpression());
                digest.update(Longs.toByteArray(timestamp));
                value.set(digest.digest());
                mutation.put(family, qualifier, visibility, timestamp, value);
                familyCells.increment(1l);
                rowCells.increment(1l);
            }
            context.progress();
            if (tables.isEmpty()) {
                /* Use the default table */
                context.write(null, mutation);
            } else {
                for (String table : tables) {
                    out.set(table);
                    context.write(out, mutation);
                }
            }
        }

        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            row.set(new StringBuilder(context.getTaskAttemptID().getTaskID().toString()).reverse().toString());
            visibility = new ColumnVisibility(context.getConfiguration().get(VISIBILITY, ""));
            rowCells = context.getCounter(DataLoadMapper.class.getName(), "row_" + row);
            tables = context.getConfiguration().getStringCollection(OUTPUT_TABLE_NAMES);
        }

    }

    /**
     * given a configurable number of rows, generates records for each one of the form:
     *   Key: one letter A through Z, Value: user specified number of qualifiers
     */
    public static class DataLoadInputFormat extends InputFormat<Text, LongWritable> {
        protected static final String NUM_QUALIFIERS = DataLoadInputFormat.class.getName() + ".num_qualifiers";
        protected static final String NUM_ROWS = DataLoadInputFormat.class.getName() + ".num_rows";
        protected static final String ACTIVE_TSERVERS = DataLoadInputFormat.class.getName() + ".active_tservers";

        @Override
        public RecordReader<Text, LongWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            return new DataLoadRecordReader();
        }

        @Override
        public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
            final long numCells = getNumQualifiersPerFamily(context.getConfiguration());
            final int numRows = getNumRows(context.getConfiguration());
            final List<String> tservers = getTabletServers(context.getConfiguration());
            ;
            final List<InputSplit> splits = new ArrayList<InputSplit>(numRows);
            if (tservers.size() == 0) {
                log.warn(
                        "Couldn't get set of active tservers on the cluster, map tasks won't have locality information");
                for (int i = 0; i < numRows; i++) {
                    splits.add(new TabletServerLocalInputSplit(numCells, null));
                }
            } else {
                for (int i = 0; i < numRows;) {
                    for (int j = 0; j < tservers.size() && i < numRows; j++, i++) {
                        splits.add(new TabletServerLocalInputSplit(numCells, tservers.get(j)));
                    }
                }
            }
            return splits;
        }

        public static void setTabletServers(Job job, List<String> tabletServers) throws IOException {
            job.getConfiguration().setStrings(ACTIVE_TSERVERS, tabletServers.toArray(new String[0]));
        }

        public static List<String> getTabletServers(Configuration conf) {
            return new ArrayList<String>(conf.getStringCollection(ACTIVE_TSERVERS));
        }

        public static void setNumQualifiersPerFamily(Job job, long num) {
            job.getConfiguration().setLong(NUM_QUALIFIERS, num);
        }

        public static long getNumQualifiersPerFamily(Configuration conf) {
            return conf.getLong(NUM_QUALIFIERS, DataCompatibilityTestCli.DEFAULT_NUM_QUALIFIERS);
        }

        public static void setNumRows(Job job, int num) {
            job.getConfiguration().setLong(NUM_ROWS, num);
        }

        public static int getNumRows(Configuration conf) {
            return conf.getInt(NUM_ROWS, DataCompatibilityTestCli.DEFAULT_NUM_ROWS);
        }

        public static class TabletServerLocalInputSplit extends InputSplit
                implements WritableComparable<TabletServerLocalInputSplit> {
            long numCells;
            String[] locations;

            public TabletServerLocalInputSplit() {
                this(0, null);
            }

            public TabletServerLocalInputSplit(long num, String tabletServer) {
                numCells = num;
                locations = null == tabletServer ? new String[] {} : new String[] { tabletServer };
            }

            @Override
            public long getLength() {
                return numCells;
            }

            @Override
            public String[] getLocations() {
                return locations;
            }

            @Override
            public void write(DataOutput out) throws IOException {
                WritableUtils.writeVLong(out, numCells);
            }

            @Override
            public void readFields(DataInput in) throws IOException {
                numCells = WritableUtils.readVLong(in);
            }

            @Override
            public int compareTo(TabletServerLocalInputSplit other) {
                return (int) (numCells - other.numCells);
            }
        }

        protected static class DataLoadRecordReader extends RecordReader<Text, LongWritable> {
            protected boolean valid;
            protected int current;
            protected final Text key = new Text();
            protected final LongWritable value = new LongWritable();

            @Override
            public void initialize(InputSplit split, TaskAttemptContext context)
                    throws IOException, InterruptedException {
                current = -1;
                valid = false;
                key.clear();
                value.set(getNumQualifiersPerFamily(context.getConfiguration()));
            }

            @Override
            public boolean nextKeyValue() throws IOException, InterruptedException {
                current++;
                valid = current >= 0 && current < FAMILIES.length;
                if (valid) {
                    key.set(FAMILIES[current]);
                }
                return current >= 0 && current < FAMILIES.length;
            }

            @Override
            public Text getCurrentKey() throws IOException, InterruptedException {
                return valid ? key : null;
            }

            @Override
            public LongWritable getCurrentValue() throws IOException, InterruptedException {
                return valid ? value : null;
            }

            @Override
            public float getProgress() throws IOException, InterruptedException {
                return current < 0 ? 0.f : current > FAMILIES.length ? 1.f : (current + 1.0f) / FAMILIES.length;
            }

            @Override
            public void close() throws IOException {
                current = -1;
                valid = false;
            }
        }

    }

    protected enum TableOptions {
        uncompressed_bloom_block(
                new ImmutableMap.Builder<String, String>().put(TABLE_BLOOM_ENABLED.getKey(), Boolean.toString(true))
                        .put(TABLE_FILE_COMPRESSION_TYPE.getKey(), "none")
                        .put(TABLE_BLOCKCACHE_ENABLED.getKey(), Boolean.toString(true)).build()),

        snappy_bloom(
                new ImmutableMap.Builder<String, String>().put(TABLE_BLOOM_ENABLED.getKey(), Boolean.toString(true))
                        .put(TABLE_FILE_COMPRESSION_TYPE.getKey(), "snappy")
                        .put(TABLE_BLOCKCACHE_ENABLED.getKey(), Boolean.toString(false)).build()),

        snappy_block(new ImmutableMap.Builder<String, String>()
                .put(TABLE_BLOOM_ENABLED.getKey(), Boolean.toString(false))
                .put(TABLE_FILE_COMPRESSION_TYPE.getKey(), "snappy")
                .put(TABLE_BLOCKCACHE_ENABLED.getKey(), Boolean.toString(true)).build()),

        gz(new ImmutableMap.Builder<String, String>().put(TABLE_BLOOM_ENABLED.getKey(), Boolean.toString(false))
                .put(TABLE_FILE_COMPRESSION_TYPE.getKey(), "gz")
                .put(TABLE_BLOCKCACHE_ENABLED.getKey(), Boolean.toString(false)).build()),

        lzo(new ImmutableMap.Builder<String, String>().put(TABLE_BLOOM_ENABLED.getKey(), Boolean.toString(false))
                .put(TABLE_FILE_COMPRESSION_TYPE.getKey(), "lzo")
                .put(TABLE_BLOCKCACHE_ENABLED.getKey(), Boolean.toString(false)).build());

        final Map<String, String> properties;

        TableOptions(ImmutableMap<String, String> properties) {
            this.properties = properties;
        }

        public Map<String, String> getProperties() {
            return properties;
        }
    }

    /* TODO add an option to drop tables that exist at the beginning of the run. */
    protected static class DataCompatibilityTestCli {

        @Parameter(names = {
                "--num-rows" }, required = false, description = "number of rows to use on given tables.")
        public int numRows = -1;

        public static final int DEFAULT_NUM_ROWS = 2;
        public static final long DEFAULT_NUM_QUALIFIERS = 300000l;

        @Parameter(names = { "-n",
                "--num-qualifiers" }, required = false, description = "number of column qualifiers per column family. effectively sets number of cells per cf. defaults to 300k.")
        public long qualifiers = DEFAULT_NUM_QUALIFIERS;

        @Parameter(names = { "-t",
                "--tables" }, required = false, description = "comma separated list of pre-existing tables to write to, instead of making our own.")
        public String tables = null;

        @Parameter(names = {
                "--prefix" }, required = false, description = "prefix to use on generated table names. Defaults to 'data_compatibility_test_'.")
        public String prefix = "data_compatibility_test_";

        @Parameter(names = {
                "--lzo" }, required = false, description = "elect to include an LZO-compressed table. Requires Hadoop LZO support.")
        public boolean withLzo = false;

        public List<String> getTableNames(TableOperations ops)
                throws ParameterException, AccumuloException, AccumuloSecurityException {
            return getTableNames(false, ops);
        }

        public List<String> getTableNamesAndConfigureThem(TableOperations ops)
                throws ParameterException, AccumuloException, AccumuloSecurityException {
            return getTableNames(true, ops);
        }

        protected List<String> getTableNames(boolean change, TableOperations ops)
                throws ParameterException, AccumuloException, AccumuloSecurityException {
            List<String> names;
            if (null != tables) {
                names = Arrays.asList(tables.split(","));
                for (String name : names) {
                    if (!ops.exists(name)) {
                        String error = "Table specified that does not exist: " + name;
                        log.error(error);
                        throw new ParameterException(error);
                    }
                }
            } else {
                final TableOptions[] options = TableOptions.values();
                names = new ArrayList<String>(options.length);
                for (TableOptions option : options) {
                    if (!withLzo
                            && "lzo".equals(option.getProperties().get(TABLE_FILE_COMPRESSION_TYPE.getKey()))) {
                        log.debug("Skipping table options '" + option
                                + "' because they use LZO compression. pass --lzo to include it.");
                        continue;
                    }
                    final String name = prefix + option.toString();
                    if (!ops.exists(name)) {
                        if (change) {
                            log.info("Creating table for option set '" + option + "'");
                            try {
                                ops.create(name);
                            } catch (TableExistsException exception) {
                                log.debug("Table created concurrently; skipping.", exception);
                            }
                        } else {
                            String error = "Generated table name doesn't exist: " + name;
                            log.error(error);
                            throw new ParameterException(error);
                        }
                    }
                    if (change) {
                        for (Map.Entry<String, String> property : option.getProperties().entrySet()) {
                            ops.setProperty(name, property.getKey(), property.getValue());
                        }
                    }
                    names.add(name);
                }
            }
            return names;
        }
    }

    protected static class LoadCli extends Cli {
        public final ConnectionCli connection = new ConnectionCli();
        public final VisibilityCli visibility = new VisibilityCli();
        public final DataCompatibilityTestCli test = new DataCompatibilityTestCli();
        public final MapreduceOutputCli output = new MapreduceOutputCli(connection);

        protected void parseArgs(String programName, String[] args, Object... others) {
            super.parseArgs(programName, args, connection, visibility, test, output);
        }
    };

    protected LoadCli options = new LoadCli();

    @Override
    public int run(String[] args) throws Exception {
        final String jobName = this.getClass().getName();
        options.parseArgs(jobName, args);
        final Job job = new Job(getConf(), jobName);

        if (-1 == options.test.numRows) {
            options.test.numRows = job.getConfiguration().getInt("mapred.map.tasks",
                    DataCompatibilityTestCli.DEFAULT_NUM_ROWS);
        }

        job.setJarByClass(this.getClass());

        job.setInputFormatClass(DataLoadInputFormat.class);
        DataLoadInputFormat.setTabletServers(job,
                options.connection.getConnector().instanceOperations().getTabletServers());
        DataLoadInputFormat.setNumRows(job, options.test.numRows);
        DataLoadInputFormat.setNumQualifiersPerFamily(job, options.test.qualifiers);

        job.getConfiguration().set(VISIBILITY, new String(options.visibility.visibility.getExpression(), "UTF-8"));

        final TableOperations ops = options.connection.getConnector().tableOperations();

        final List<String> names = options.test.getTableNamesAndConfigureThem(ops);
        for (String name : names) {
            final int numSplits = ops.getSplits(name, options.test.numRows).size();
            if (options.test.numRows > numSplits) {
                log.info("adding splits to table '" + name + "', to bring it from " + numSplits + " to "
                        + options.test.numRows + ".");
                final SortedSet<Text> splits = new TreeSet<Text>();
                // for cases where we're adding way more splits than there are currently possible servers to handle them, do a pre-pre-split
                //   N.B. If we've just created this table, there will be 0 splits because we'll just have the initial tablet.
                if (0 == numSplits || options.test.numRows / numSplits > 10) {
                    log.info("splitting in two waves due to the number of splits we need to add.");
                    // TODO turtles all the way down.
                    final int prepre = options.test.numRows / (0 == numSplits ? 10 : numSplits * 10);
                    for (int i = 0; i < prepre; i++) {
                        splits.add(new Text(new StringBuilder(Long.toString(i)).reverse().toString()));
                    }
                    ops.addSplits(name, splits);
                    log.debug("delay 30s for splits to get assigned off host.");
                    try {
                        Thread.currentThread().sleep(30 * 1000);
                    } catch (InterruptedException exception) {
                        log.warn("interrupted from sleep early.");
                    }
                    splits.clear();
                }
                for (int i = 0; i < options.test.numRows; i++) {
                    splits.add(new Text(new StringBuilder(Long.toString(i)).reverse().toString()));
                }
                ops.addSplits(name, splits);
            }
        }
        log.debug("delay 30s for splits to get assigned off host.");
        try {
            Thread.currentThread().sleep(30 * 1000);
        } catch (InterruptedException exception) {
            log.warn("interrupted from sleep early.");
        }

        job.getConfiguration().setStrings(OUTPUT_TABLE_NAMES, names.toArray(new String[0]));

        job.setMapperClass(DataLoadMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Mutation.class);

        job.setNumReduceTasks(0);

        log.info("launching map-only job to insert " + options.test.numRows + " rows of "
                + (FAMILIES.length * options.test.qualifiers) + " cells each into each of the tables " + names);
        options.output.useAccumuloOutputFormat(job);

        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new DataCompatibilityLoad(), args);
        System.exit(res);
    }

}