com.ibm.crail.terasort.ParseTeraOptions.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.crail.terasort.ParseTeraOptions.java

Source

/*
 * Crail-terasort: An example terasort program for Spark and Crail
 *
 * Author: Animesh Trivedi <atr@zurich.ibm.com>
 *         Jonas Pfefferle <jpf@zurich.ibm.com>
 *
 * Copyright (C) 2016, IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.ibm.crail.terasort;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;

import org.apache.commons.cli.*;
import org.apache.spark.SparkConf;

/**
 * Parses and holds the command-line configuration for the Crail TeraSort
 * example: which test phase to run, input/output directories, the shuffle
 * serializer, partition size, and extra Spark properties.
 *
 * <p>Typical use: construct, call {@link #parse(String[])} with the program
 * arguments, then query the getters. {@code parse} exits the JVM on bad
 * input after printing the help screen.
 */
public class ParseTeraOptions implements Serializable {
    private static final long serialVersionUID = 1L;

    /** commons-cli option definitions, shared by parse() and show_help(). */
    private final Options options;
    /** Valid values for -n; testIndex is an index into this array. */
    private final String[] testNames;
    private int testIndex;
    /** Valid values for -s; serializerIndex is an index into this array. */
    private final String[] serializer;
    private int serializerIndex;
    private String inputDir;
    private String outputDir;
    /** Kryo buffer size in bytes (-b); only meaningful for the kryo serializer. */
    private int bufferSize;
    /** Requested partition size in bytes (-p); valid only when isPartitionSet. */
    private long partitionSize;
    private boolean isPartitionSet;
    /** Whether the output FS should be sync()ed while writing (-S). */
    private boolean syncOutput;
    /** Extra key/value properties (-O) to apply to the SparkConf. */
    private final Map<String, String> sparkParams;

    ParseTeraOptions() {
        options = new Options();
        sparkParams = new HashMap<String, String>();

        testNames = new String[] { "loadonly", "loadstore", "loadcount", "loadcountstore", "loadsort",
                "loadsortstore" };
        testIndex = 5; /* loadsortstore is the default */

        serializer = new String[] { "none", "kryo", "byte", "f22" };
        serializerIndex = 0; /* default is none */

        inputDir = null;
        outputDir = null;
        bufferSize = 4096; //1048576;

        partitionSize = -1;
        isPartitionSet = false;
        syncOutput = false;

        options.addOption("h", "help", false, "show help.");
        options.addOption("n", "testname", true,
                "<string> Name of the test valid tests are :\n" + "1. loadOnly: load and counts the input dataset\n"
                        + "2. loadStore: load the input dataset and stores it\n"
                        + "3. loadCount: load, shuffle, and then count the \n" + "   resulting dataset\n"
                        + "4. loadCountStore: load, shuffle, count, and then \n"
                        + "   store the resulting dataset\n" + "5. loadSort: load, shuffle, and then sort on key \n"
                        + "   the resulting dataset\n" + "6. loadSortStore: load, shuffle, sort on key, then \n"
                        + "   store the resulting dataset\n" + "the default is : loadSortStore");
        options.addOption("i", "inputDir", true, "<string> Name of the input directory");
        options.addOption("o", "outputDir", true, "<string> Name of the output directory");
        options.addOption("S", "sync", true,
                "<int> Takes 0 or 1 to pass to the sync call to the output \n" + "FS while writing (default: 0)");
        options.addOption("p", "partitionSize", true, "<long> Partition size, takes k,m,g,t suffixes\n"
                + "(default: input partition size, HDFS 2.6 has 128MB)");
        options.addOption("s", "useSerializer", true,
                "<string> You can use following serializers: \n" + "none: uses the Spark default serializer \n"
                        + "kryo: optimized Kryo for TeraSort \n" + "byte: a simple byte[] serializer \n"
                        + "f22: an optimized crail-specific byte[] serializer\n"
                        + "     f22 requires CrailShuffleNativeRadixSorter for sorting\n");
        options.addOption("O", "options", true, "string,string : Sets properties on the Spark context. The first \n"
                + "string is the key, and the second is the value");
        options.addOption("b", "bufferSize", true, "<int> Buffer size for Kryo (only valid for kryo)");
        //options.addOption("K", "useKryoOptimizations", true, "<int> use kryoOptimizations (NYI)");
    }

    /**
     * Renders the effective configuration as a human-readable, multi-line
     * string (intended for logging after {@link #parse(String[])}).
     *
     * @return a formatted summary of all parsed settings
     */
    public String showOptions() {
        StringBuilder str = new StringBuilder("\n");
        str.append("testName      : ").append(testNames[testIndex]).append(" \n");
        str.append("inputDir      : ").append(inputDir).append("\n");
        str.append("outputDir     : ").append(outputDir).append("\n");
        str.append("bufferSize    : ").append(bufferSize).append("\n");
        str.append("serializer    : ").append(serializer[serializerIndex]).append("\n");
        str.append("partitionSize : ")
                .append(isPartitionSet ? Long.toString(partitionSize) : "sizeNotSet, using the default from HDFS")
                .append("\n");
        str.append("sync output   : ").append(syncOutput).append("\n");
        str.append("spark options : ");
        if (sparkParams.isEmpty()) {
            str.append(" none ").append(" \n");
        } else {
            str.append("\n");
            for (Map.Entry<String, String> e : sparkParams.entrySet()) {
                str.append("                 key: ").append(e.getKey())
                        .append(" value: ").append(e.getValue()).append(" \n");
            }
        }
        str.append("\n");
        return str.toString();
    }

    /**
     * Finds {@code name} (case-insensitively) in {@code candidates}.
     *
     * @param candidates the list of accepted values
     * @param name the user-supplied value
     * @return the index of the match within {@code candidates}
     * @throws IllegalArgumentException if {@code name} matches no candidate
     */
    private int getMatchingIndex(String[] candidates, String name) {
        for (int i = 0; i < candidates.length; i++) {
            if (name.equalsIgnoreCase(candidates[i])) {
                return i;
            }
        }
        /* Arrays.toString shows the actual values instead of the array's
         * default toString() ("[Ljava.lang.String;@..."). */
        throw new IllegalArgumentException(name + " not found in " + Arrays.toString(candidates));
    }

    /** Prints the usage/help screen for all registered options. */
    public void show_help() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("Main", options);
    }

    /** @return true if the output FS should be sync()ed while writing (-S 1). */
    public boolean getSyncOutput() {
        return syncOutput;
    }

    /** @return the Kryo buffer size in bytes (-b). */
    public int getBufferSize() {
        return bufferSize;
    }

    /** @return the input directory (-i), or null if parse() has not run. */
    public String getInputDir() {
        return inputDir;
    }

    /** @return the output directory (-o), or null if parse() has not run. */
    public String getOutputDir() {
        return outputDir;
    }

    /**
     * @return the partition size in bytes (-p), or -1 when unset; check
     *         {@link #isPartitionSet()} first. (Method name keeps its
     *         historical spelling for caller compatibility.)
     */
    public long getParitionSize() {
        return partitionSize;
    }

    /** @return true if -p was given on the command line. */
    public boolean isPartitionSet() {
        return isPartitionSet;
    }

    /** @return true if the selected test is "loadonly". */
    public boolean isTestLoadOnly() {
        return testIndex == 0;
    }

    /** @return true if the selected test is "loadstore". */
    public boolean isTestLoadStore() {
        return testIndex == 1;
    }

    /** @return true if the selected test is "loadcount". */
    public boolean isTestLoadCount() {
        return testIndex == 2;
    }

    /** @return true if the selected test is "loadcountstore". */
    public boolean isTestLoadCountStore() {
        return testIndex == 3;
    }

    /** @return true if the selected test is "loadsort". */
    public boolean isTestLoadSort() {
        return testIndex == 4;
    }

    /** @return true if the selected test is "loadsortstore" (the default). */
    public boolean isTestLoadSortStore() {
        return testIndex == 5;
    }

    /** @return true if the "kryo" serializer was selected. */
    public boolean isSerializerKryo() {
        return serializerIndex == 1;
    }

    /** @return true if the "byte" serializer was selected. */
    public boolean isSerializerByte() {
        return serializerIndex == 2;
    }

    /** @return true if the "f22" serializer was selected. */
    public boolean isSerializerF22() {
        return serializerIndex == 3;
    }

    /**
     * Applies every -O key/value pair to the given Spark configuration,
     * logging each one to stderr.
     *
     * @param conf the Spark configuration to mutate
     */
    public void setSparkOptions(SparkConf conf) {
        for (Map.Entry<String, String> e : sparkParams.entrySet()) {
            System.err.println(" Setting up key: " + e.getKey() + " value: " + e.getValue());
            conf.set(e.getKey(), e.getValue());
        }
    }

    /**
     * Converts a size string with an optional k/m/g/t suffix to bytes.
     * Decimal (SI) multipliers are used: "1k" is 1000 bytes, not 1024.
     * A bare number is taken as bytes.
     *
     * @param str the size string, e.g. "128m"
     * @return the size in bytes
     * @throws NumberFormatException if the numeric part is not a valid long
     */
    public long sizeStrToBytes(String str) {
        String lower = str.toLowerCase();
        long multiplier = 1L;
        int suffixLen = 0;
        if (lower.endsWith("k")) {
            multiplier = 1000L;
            suffixLen = 1;
        } else if (lower.endsWith("m")) {
            multiplier = 1000L * 1000;
            suffixLen = 1;
        } else if (lower.endsWith("g")) {
            multiplier = 1000L * 1000 * 1000;
            suffixLen = 1;
        } else if (lower.endsWith("t")) {
            multiplier = 1000L * 1000 * 1000 * 1000;
            suffixLen = 1;
        }
        return Long.parseLong(lower.substring(0, lower.length() - suffixLen)) * multiplier;
    }

    /**
     * Parses the command-line arguments and fills in this object's fields.
     * On -h, on a malformed option, or when -i/-o are missing, the help
     * screen is printed and the JVM exits.
     *
     * @param args the program's command-line arguments
     */
    public void parse(String[] args) {
        CommandLineParser parser = new GnuParser();
        CommandLine cmd = null;
        int ioset = 0;
        try {
            cmd = parser.parse(options, args);

            if (cmd.hasOption("h")) {
                show_help();
                System.exit(0);
            }
            if (cmd.hasOption("n")) {
                this.testIndex = getMatchingIndex(testNames, cmd.getOptionValue("n").trim());
            }
            if (cmd.hasOption("i")) {
                inputDir = cmd.getOptionValue("i");
                ioset++;
            }
            if (cmd.hasOption("o")) {
                outputDir = cmd.getOptionValue("o");
                ioset++;
            }
            if (cmd.hasOption("s")) {
                this.serializerIndex = getMatchingIndex(serializer, cmd.getOptionValue("s").trim());
            }
            if (cmd.hasOption("b")) {
                bufferSize = Integer.parseInt(cmd.getOptionValue("b"));
            }
            if (cmd.hasOption("p")) {
                partitionSize = sizeStrToBytes(cmd.getOptionValue("p"));
                isPartitionSet = true;
            }
            if (cmd.hasOption("S")) {
                syncOutput = Integer.parseInt(cmd.getOptionValue("S")) != 0;
            }
            if (cmd.hasOption("O")) {
                /* -O may be repeated; each occurrence contributes one pair.
                 * Split on the first comma only, so values may contain commas. */
                for (String pair : cmd.getOptionValues("O")) {
                    String[] vals = pair.split(",", 2);
                    if (vals.length != 2) {
                        System.err.println("Failed to parse " + pair);
                        System.exit(-1);
                    }
                    sparkParams.put(vals[0].trim(), vals[1].trim());
                }
            }
        } catch (ParseException | IllegalArgumentException e) {
            /* IllegalArgumentException covers NumberFormatException from -b/-S/-p
             * and bad -n/-s values, so malformed input shows help instead of a
             * raw stack trace. */
            System.err.println("Failed to parse command line properties" + e);
            show_help();
            System.exit(-1);
        }
        if (ioset != 2) {
            System.err.println(" Please set input and output directories at least ! ");
            show_help();
            System.exit(-1);
        }
    }
}