Java tutorial: BenchmarkStoreTool.java (Splout SQL benchmark store generator)
package com.splout.db.benchmark;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.MapOnlyJobBuilder;
import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;
import com.datasalt.pangool.utils.HadoopUtils;
import com.splout.db.common.JSONSerDe;
import com.splout.db.common.PartitionEntry;
import com.splout.db.common.PartitionMap;
import com.splout.db.common.SploutHadoopConfiguration;
import com.splout.db.hadoop.TableSpec;
import com.splout.db.hadoop.engine.SQLite4JavaOutputFormat;
import com.splout.db.hadoop.engine.SploutSQLOutputFormat;
import com.splout.db.hadoop.engine.SploutSQLProxyOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Distributed map-only job that creates an arbitrarily big database for being used by {@link BenchmarkTool}. It doesn't
 * deploy it.
 */
@SuppressWarnings("serial")
public class BenchmarkStoreTool implements Tool, Serializable {

  @Parameter(required = true, names = { "-o", "--output" }, description = "Output path where store will be generated. It must be accessible for all Splout nodes for the deploy to succeed.")
  private String output;

  @Parameter(names = { "-t", "--tablename" }, description = "Name of the table that will be used for inserting data.")
  private String tablename = "splout_benchmark";

  @Parameter(required = true, names = { "-k", "--keyspace" }, description = "A representation of a key space used for generating the data store. Format is minKey:maxKey where both are considered integers. A spec of 0:10 means keys range from 0 (minimum value) to 10 (maximum value).")
  private String keySpace;

  @Parameter(required = true, names = { "-p", "--partitions" }, description = "The number of partitions to create for the tablespace.")
  private Integer nPartitions;

  @Parameter(names = { "-s", "--valuesize" }, description = "The value size in bytes that will be associated with each key.")
  private Integer valueSize = 1024;

  @Parameter(names = { "-pad", "--padding" }, description = "The padding size to use for normalizing the integer keys to strings. With padding 3, 1 gets 001. This is needed for benchmark keys. By default, padding is autoadjusted to the size of the maxKey of the key range.")
  private Long padding;

  private transient Configuration conf;

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public int run(String[] args) throws Exception {
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Benchmark-Store Tool");
    try {
      jComm.parse(args);
    } catch(ParameterException e) {
      System.out.println(e.getMessage());
      jComm.usage();
      return -1;
    } catch(Throwable t) {
      t.printStackTrace();
      jComm.usage();
      return -1;
    }

    // Create some input files that will represent the partitions to generate
    Path out = new Path(output);
    FileSystem outFs = out.getFileSystem(getConf());
    HadoopUtils.deleteIfExists(outFs, out);

    Integer min, max, eachPartition;
    int maxKeyDigits;
    try {
      String[] minMax = keySpace.split(":");
      min = Integer.parseInt(minMax[0]);
      max = Integer.parseInt(minMax[1]);
      maxKeyDigits = max.toString().length();
      eachPartition = (max - min) / nPartitions;
    } catch(Exception e) {
      throw new IllegalArgumentException(
          "Key range format is not valid. It must be minKey:maxKey where both minKey and maxKey are integers.");
    }

    FileSystem inFs = FileSystem.get(getConf());
    Path input = new Path("benchmark-store-tool-" + System.currentTimeMillis());
    HadoopUtils.deleteIfExists(inFs, input);
    inFs.mkdirs(input);

    List<PartitionEntry> partitionEntries = new ArrayList<PartitionEntry>();

    // Create as many input files as partitions
    // Each input file will have as value the range that each Mapper will write
    String paddingExp = "%0" + (padding != null ? padding : maxKeyDigits) + "d";
    for(int i = 0; i < nPartitions; i++) {
      int thisMin = (i * eachPartition);
      int thisMax = (i + 1) * eachPartition;
      HadoopUtils.stringToFile(inFs, new Path(input, i + ".txt"), i + "\t" + thisMin + ":" + thisMax);
      PartitionEntry entry = new PartitionEntry();
      entry.setMin(String.format(paddingExp, thisMin));
      entry.setMax(String.format(paddingExp, thisMax));
      entry.setShard(i);
      partitionEntries.add(entry);
    }

    partitionEntries.get(0).setMin(null);
    partitionEntries.get(partitionEntries.size() - 1).setMax(null);

    PartitionMap partitionMap = new PartitionMap(partitionEntries);
    HadoopUtils.stringToFile(outFs, new Path(out, "partition-map"), JSONSerDe.ser(partitionMap));

    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create(SploutSQLOutputFormat.PARTITION_TUPLE_FIELD, Type.INT));
    fields.addAll(Fields.parse("key:int, value:string"));
    final Schema schema = new Schema(tablename, fields);

    byte[] valueArray = new byte[valueSize];
    for(int i = 0; i < valueSize; i++) {
      valueArray[i] = 'A';
    }
    final String theValue = new String(valueArray);

    if(!FileSystem.getLocal(conf).equals(FileSystem.get(conf))) {
      File nativeLibs = new File("native");
      if(nativeLibs.exists()) {
        SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
      }
    }

    MapOnlyJobBuilder job = new MapOnlyJobBuilder(conf);
    TableSpec tableSpec = new TableSpec(schema, schema.getFields().get(1));
    job.setOutput(new Path(out, "store"),
        new SploutSQLProxyOutputFormat(new SQLite4JavaOutputFormat(1000000, tableSpec)), ITuple.class,
        NullWritable.class);
    job.addInput(input, new HadoopInputFormat(TextInputFormat.class),
        new MapOnlyMapper<LongWritable, Text, ITuple, NullWritable>() {

          ITuple metaTuple = new Tuple(schema);

          protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] partitionRange = value.toString().split("\t");
            Integer partition = Integer.parseInt(partitionRange[0]);
            metaTuple.set(SploutSQLOutputFormat.PARTITION_TUPLE_FIELD, partition);
            String[] minMax = partitionRange[1].split(":");
            Integer min = Integer.parseInt(minMax[0]);
            Integer max = Integer.parseInt(minMax[1]);
            for(int i = min; i < max; i++) {
              metaTuple.set("key", i);
              metaTuple.set("value", theValue);
              context.write(metaTuple, NullWritable.get());
            }
          }
        });

    job.createJob().waitForCompletion(true);
    HadoopUtils.deleteIfExists(inFs, input);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new BenchmarkStoreTool(), args);
  }
}
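The core idea in run() is splitting the minKey:maxKey key space into nPartitions equal ranges, zero-padding the boundary keys to strings, and leaving the first minimum and last maximum open-ended (null). Below is a minimal standalone sketch of that logic only; the class name PartitioningSketch and the example values (key space 0:1000, 4 partitions) are illustrative assumptions, not part of the tool.

import java.util.ArrayList;
import java.util.List;

public class PartitioningSketch {
  public static void main(String[] args) {
    // Assumed example values: a 0:1000 key space split into 4 partitions.
    int min = 0, max = 1000, nPartitions = 4;
    int maxKeyDigits = String.valueOf(max).length();   // 4 digits -> keys padded like "0250"
    int eachPartition = (max - min) / nPartitions;     // 250 keys per partition
    String paddingExp = "%0" + maxKeyDigits + "d";     // same format string the tool builds

    List<String[]> entries = new ArrayList<String[]>();
    for (int i = 0; i < nPartitions; i++) {
      String lo = String.format(paddingExp, i * eachPartition);
      String hi = String.format(paddingExp, (i + 1) * eachPartition);
      entries.add(new String[] { lo, hi });
    }
    // As in the tool, the first min and the last max are left open-ended.
    entries.get(0)[0] = null;
    entries.get(entries.size() - 1)[1] = null;

    // Prints: null -> 0250, 0250 -> 0500, 0500 -> 0750, 0750 -> null
    for (String[] e : entries) {
      System.out.println(e[0] + " -> " + e[1]);
    }
  }
}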
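Since the tool implements Hadoop's Tool interface and main() already delegates to ToolRunner, it can also be launched programmatically. The sketch below shows one possible invocation; the output path /tmp/benchmark-store and the argument values are hypothetical examples, and a Hadoop plus Splout SQL classpath is assumed.

import org.apache.hadoop.util.ToolRunner;

public class BenchmarkStoreToolExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical arguments: output path, 0:1000000 key space, 4 partitions, 1 KB values.
    int exitCode = ToolRunner.run(new com.splout.db.benchmark.BenchmarkStoreTool(), new String[] {
        "-o", "/tmp/benchmark-store",
        "-k", "0:1000000",
        "-p", "4",
        "-s", "1024"
    });
    System.exit(exitCode);
  }
}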