Source listing: org.apache.giraph.hive.HiveGiraphRunner.java — a Hadoop Tool that
configures and launches a Giraph job with Hive-backed vertex/edge input and output.

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.giraph.hive;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.giraph.conf.GiraphClasses;
import org.apache.giraph.conf.GiraphConfiguration;
import org.apache.giraph.graph.Vertex;
import org.apache.giraph.hive.input.edge.HiveEdgeInputFormat;
import org.apache.giraph.hive.input.edge.HiveToEdge;
import org.apache.giraph.hive.input.vertex.HiveToVertex;
import org.apache.giraph.hive.input.vertex.HiveVertexInputFormat;
import org.apache.giraph.hive.output.HiveVertexOutputFormat;
import org.apache.giraph.hive.output.HiveVertexWriter;
import org.apache.giraph.hive.output.VertexToHive;
import org.apache.giraph.job.GiraphJob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;

import com.facebook.giraph.hive.input.HiveApiInputFormat;
import com.facebook.giraph.hive.input.HiveInputDescription;
import com.facebook.giraph.hive.output.HiveApiOutputFormat;
import com.facebook.giraph.hive.output.HiveOutputDescription;
import com.facebook.giraph.hive.schema.HiveTableSchemas;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_EDGE_SPLITS;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_TO_EDGE_CLASS;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_TO_VERTEX_CLASS;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_SPLITS;
import static org.apache.giraph.hive.common.HiveProfiles.EDGE_INPUT_PROFILE_ID;
import static org.apache.giraph.hive.common.HiveProfiles.VERTEX_INPUT_PROFILE_ID;
import static org.apache.giraph.hive.common.HiveProfiles.VERTEX_OUTPUT_PROFILE_ID;

/**
 * Hive Giraph Runner
 */
public class HiveGiraphRunner implements Tool {
    /** logger */
    private static final Logger LOG = Logger.getLogger(HiveGiraphRunner.class);
    /** Prefix for log statements */
    private static final String LOG_PREFIX = "\t";

    /** Number of workers, set from the -w/--workers option */
    protected int workers;
    /** Whether to run the job verbosely, set from -v/--verbose */
    protected boolean isVerbose;

    /** vertex class. */
    private Class<? extends Vertex> vertexClass;

    /** Vertex creator from hive records. */
    private Class<? extends HiveToVertex> hiveToVertexClass;
    /** hive vertex input information */
    private final HiveInputDescription hiveVertexInputDescription;

    /** Edge creator from hive records. */
    private Class<? extends HiveToEdge> hiveToEdgeClass;
    /** hive edge input information */
    private final HiveInputDescription hiveEdgeInputDescription;

    /** Hive Vertex writer */
    private Class<? extends VertexToHive> vertexToHiveClass;
    /** hive output information */
    private final HiveOutputDescription hiveOutputDescription;
    /** Skip output? (Useful for testing without writing) */
    private boolean skipOutput = false;

    /** Configuration */
    private Configuration conf;

    /** Create a new runner */
    public HiveGiraphRunner() {
        conf = new HiveConf(getClass());
        hiveVertexInputDescription = new HiveInputDescription();
        hiveEdgeInputDescription = new HiveInputDescription();
        hiveOutputDescription = new HiveOutputDescription();
    }

    public Class<? extends Vertex> getVertexClass() {
        return vertexClass;
    }

    public void setVertexClass(Class<? extends Vertex> vertexClass) {
        this.vertexClass = vertexClass;
    }

    public HiveInputDescription getHiveVertexInputDescription() {
        return hiveVertexInputDescription;
    }

    public HiveOutputDescription getHiveOutputDescription() {
        return hiveOutputDescription;
    }

    public HiveInputDescription getHiveEdgeInputDescription() {
        return hiveEdgeInputDescription;
    }

    public Class<? extends HiveToVertex> getHiveToVertexClass() {
        return hiveToVertexClass;
    }

    /**
     * Set HiveToVertex used with HiveVertexInputFormat
     *
     * @param hiveToVertexClass HiveToVertex
     */
    public void setHiveToVertexClass(Class<? extends HiveToVertex> hiveToVertexClass) {
        this.hiveToVertexClass = hiveToVertexClass;
        HIVE_TO_VERTEX_CLASS.set(conf, hiveToVertexClass);
    }

    /**
     * Whether to use vertex input.
     *
     * @return true if vertex input enabled (HiveToVertex is set).
     */
    public boolean hasVertexValueInput() {
        return hiveToVertexClass != null;
    }

    public Class<? extends HiveToEdge> getHiveToEdgeClass() {
        return hiveToEdgeClass;
    }

    /**
     * Whether to use edge input.
     *
     * @return true if edge input enabled (HiveToEdge is set).
     */
    public boolean hasEdgeInput() {
        return hiveToEdgeClass != null;
    }

    /**
     * Set HiveToEdge used with HiveEdgeInputFormat
     *
     * @param hiveToEdgeClass HiveToEdge
     */
    public void setHiveToEdgeClass(Class<? extends HiveToEdge> hiveToEdgeClass) {
        this.hiveToEdgeClass = hiveToEdgeClass;
        HIVE_TO_EDGE_CLASS.set(conf, hiveToEdgeClass);
    }

    public Class<? extends VertexToHive> getVertexToHiveClass() {
        return vertexToHiveClass;
    }

    /**
     * Whether we are writing vertices out.
     *
     * @return true if vertex output enabled
     */
    public boolean hasVertexOutput() {
        return !skipOutput && vertexToHiveClass != null;
    }

    /**
     * Set class used to write vertices to Hive.
     *
     * @param vertexToHiveClass class for writing vertices to Hive.
     */
    public void setVertexToHiveClass(Class<? extends VertexToHive> vertexToHiveClass) {
        this.vertexToHiveClass = vertexToHiveClass;
        conf.setClass(HiveVertexWriter.VERTEX_TO_HIVE_KEY, vertexToHiveClass, VertexToHive.class);
    }

    /**
     * main method
     * @param args system arguments
     * @throws Exception any errors from Hive Giraph Runner
     */
    public static void main(String[] args) throws Exception {
        HiveGiraphRunner runner = new HiveGiraphRunner();
        System.exit(ToolRunner.run(runner, args));
    }

    @Override
    public final int run(String[] args) throws Exception {
        // process args
        try {
            handleCommandLine(args);
        } catch (InterruptedException e) {
            // Thrown by handleCommandLine as a signal that help was printed;
            // not a real interruption, so exit cleanly without re-interrupting.
            return 0;
        } catch (IllegalArgumentException e) {
            System.err.println(e.getMessage());
            return -1;
        }

        // additional configuration for Hive
        adjustConfigurationForHive();

        // setup GiraphJob
        GiraphJob job = new GiraphJob(getConf(), getClass().getName());
        GiraphConfiguration giraphConf = job.getConfiguration();
        giraphConf.setVertexClass(vertexClass);

        setupHiveInputs(giraphConf);
        setupHiveOutput(giraphConf);

        giraphConf.setWorkerConfiguration(workers, workers, 100.0f);
        initGiraphJob(job);

        logOptions(giraphConf);

        return job.run(isVerbose) ? 0 : -1;
    }

    /**
     * Initialize hive input settings
     *
     * @param conf Configuration to write to
     * @throws TException thrift problem
     */
    private void setupHiveInputs(GiraphConfiguration conf) throws TException {
        if (hiveToVertexClass != null) {
            hiveVertexInputDescription.setNumSplits(HIVE_VERTEX_SPLITS.get(conf));
            HiveApiInputFormat.setProfileInputDesc(conf, hiveVertexInputDescription, VERTEX_INPUT_PROFILE_ID);
            conf.setVertexInputFormatClass(HiveVertexInputFormat.class);
            HiveTableSchemas.put(conf, VERTEX_INPUT_PROFILE_ID, hiveVertexInputDescription.hiveTableName());
        }

        if (hiveToEdgeClass != null) {
            hiveEdgeInputDescription.setNumSplits(HIVE_EDGE_SPLITS.get(conf));
            HiveApiInputFormat.setProfileInputDesc(conf, hiveEdgeInputDescription, EDGE_INPUT_PROFILE_ID);
            conf.setEdgeInputFormatClass(HiveEdgeInputFormat.class);
            HiveTableSchemas.put(conf, EDGE_INPUT_PROFILE_ID, hiveEdgeInputDescription.hiveTableName());
        }
    }

    /**
     * Initialize hive output settings
     *
     * @param conf Configuration to write to
     * @throws TException thrift problem
     */
    private void setupHiveOutput(GiraphConfiguration conf) throws TException {
        if (skipOutput) {
            LOG.warn("run: Warning - Output will be skipped!");
        } else if (vertexToHiveClass != null) {
            HiveApiOutputFormat.initProfile(conf, hiveOutputDescription, VERTEX_OUTPUT_PROFILE_ID);
            conf.setVertexOutputFormatClass(HiveVertexOutputFormat.class);
            HiveTableSchemas.put(conf, VERTEX_OUTPUT_PROFILE_ID, hiveOutputDescription.hiveTableName());
        } else {
            // Unreachable from the command line (handleCommandLine validates this),
            // but still possible when the runner is configured programmatically.
            LOG.fatal("output requested but " + VertexToHive.class.getSimpleName() + " not set");
        }
    }

    /**
    * Adjust the Hadoop configuration so that workers have everything Hive
    * needs: ship hive-site.xml (if available) and the client's jars.
    */
    private void adjustConfigurationForHive() {
        // when output partitions are used, workers register them to the
        // metastore at cleanup stage, and on HiveConf's initialization, it
        // looks for hive-site.xml.
        java.net.URL hiveSiteXml = conf.getClassLoader().getResource("hive-site.xml");
        if (hiveSiteXml == null) {
            // getResource returns null when the resource is absent; without this
            // guard the call below throws a NullPointerException.
            LOG.warn("adjustConfigurationForHive: hive-site.xml not found on " +
                    "the classpath, workers will not receive it");
        } else {
            addToStringCollection(conf, "tmpfiles", hiveSiteXml.toString());
        }

        // Or, more effectively, we can provide all the jars client needed to
        // the workers as well
        String hadoopClasspath = System.getenv("HADOOP_CLASSPATH");
        if (hadoopClasspath == null) {
            // System.getenv returns null for an unset variable; nothing to ship.
            LOG.warn("adjustConfigurationForHive: HADOOP_CLASSPATH not set, " +
                    "no extra jars will be shipped to workers");
        } else {
            String[] hadoopJars = hadoopClasspath.split(File.pathSeparator);
            List<String> hadoopJarURLs = Lists.newArrayList();
            for (String jarPath : hadoopJars) {
                File file = new File(jarPath);
                if (file.exists() && file.isFile()) {
                    String jarURL = file.toURI().toString();
                    hadoopJarURLs.add(jarURL);
                }
            }
            addToStringCollection(conf, "tmpjars", hadoopJarURLs);
        }
    }

    /**
    * process arguments
    * @param args to process
    * @return CommandLine instance
    * @throws org.apache.commons.cli.ParseException error parsing arguments
    * @throws InterruptedException interrupted
    */
    private CommandLine handleCommandLine(String[] args) throws ParseException, InterruptedException {
        Options options = new Options();
        addOptions(options);
        addMoreOptions(options);

        CommandLineParser parser = new GnuParser();
        final CommandLine cmdln = parser.parse(options, args);
        if (args.length == 0 || cmdln.hasOption("help")) {
            new HelpFormatter().printHelp(getClass().getName(), options, true);
            throw new InterruptedException();
        }

        // Giraph classes
        String vertexClassStr = cmdln.getOptionValue("vertexClass");
        if (vertexClassStr != null) {
            vertexClass = findClass(vertexClassStr, Vertex.class);
        }
        if (vertexClass == null) {
            throw new IllegalArgumentException(
                    "Need the Giraph " + Vertex.class.getSimpleName() + " class name (-vertexClass) to use");
        }

        String hiveToVertexClassStr = cmdln.getOptionValue("hiveToVertexClass");
        if (hiveToVertexClassStr != null) {
            if (hiveToVertexClassStr.equals("disable")) {
                hiveToVertexClass = null;
            } else {
                setHiveToVertexClass(findClass(hiveToVertexClassStr, HiveToVertex.class));
            }
        }

        String hiveToEdgeClassStr = cmdln.getOptionValue("hiveToEdgeClass");
        if (hiveToEdgeClassStr != null) {
            if (hiveToEdgeClassStr.equals("disable")) {
                hiveToEdgeClass = null;
            } else {
                setHiveToEdgeClass(findClass(hiveToEdgeClassStr, HiveToEdge.class));
            }
        }

        String vertexToHiveClassStr = cmdln.getOptionValue("vertexToHiveClass");
        if (vertexToHiveClassStr != null) {
            setVertexToHiveClass(findClass(vertexToHiveClassStr, VertexToHive.class));
        }

        if (cmdln.hasOption("skipOutput")) {
            skipOutput = true;
        }

        if (hiveToVertexClass == null && hiveToEdgeClass == null) {
            throw new IllegalArgumentException("Need at least one of Giraph " + HiveToVertex.class.getSimpleName()
                    + " class name (-hiveToVertexClass) and " + HiveToEdge.class.getSimpleName()
                    + " class name (-hiveToEdgeClass)");
        }
        if (vertexToHiveClass == null && !skipOutput) {
            throw new IllegalArgumentException("Need the Giraph " + VertexToHive.class.getSimpleName()
                    + " class name (-vertexToHiveClass) to use");
        }
        String workersStr = cmdln.getOptionValue("workers");
        if (workersStr == null) {
            throw new IllegalArgumentException("Need to choose the number of workers (-w)");
        }

        String vertexInputTableStr = cmdln.getOptionValue("vertexInputTable");
        if (vertexInputTableStr == null && hiveToVertexClass != null) {
            throw new IllegalArgumentException("Need to set the vertex input table name (-vi)");
        }

        String edgeInputTableStr = cmdln.getOptionValue("edgeInputTable");
        if (edgeInputTableStr == null && hiveToEdgeClass != null) {
            throw new IllegalArgumentException("Need to set the edge input table name (-ei)");
        }

        String outputTableStr = cmdln.getOptionValue("outputTable");
        if (outputTableStr == null) {
            throw new IllegalArgumentException("Need to set the output table name (-o)");
        }

        String dbName = cmdln.getOptionValue("dbName", "default");
        hiveVertexInputDescription.setDbName(dbName);
        hiveEdgeInputDescription.setDbName(dbName);
        hiveOutputDescription.setDbName(dbName);

        hiveEdgeInputDescription.setPartitionFilter(cmdln.getOptionValue("edgeInputFilter"));
        hiveEdgeInputDescription.setTableName(edgeInputTableStr);

        hiveVertexInputDescription.setPartitionFilter(cmdln.getOptionValue("vertexInputFilter"));
        hiveVertexInputDescription.setTableName(vertexInputTableStr);

        // reuse the already-validated value rather than re-reading the option
        hiveOutputDescription.setTableName(outputTableStr);
        hiveOutputDescription.setPartitionValues(parsePartitionValues(cmdln.getOptionValue("outputPartition")));

        workers = Integer.parseInt(workersStr);

        isVerbose = cmdln.hasOption("verbose");

        // pick up -hiveconf arguments
        processHiveConfOptions(cmdln);

        processMoreArguments(cmdln);

        return cmdln;
    }

    /**
     * Process -hiveconf options from command line
     *
     * @param cmdln Command line options
     */
    private void processHiveConfOptions(CommandLine cmdln) {
        // getOptionValues returns null (not an empty array) when the option
        // was never supplied; guard against the NullPointerException.
        String[] hiveconfArgs = cmdln.getOptionValues("hiveconf");
        if (hiveconfArgs == null) {
            return;
        }
        for (String hiveconf : hiveconfArgs) {
            String[] keyval = hiveconf.split("=", 2);
            if (keyval.length == 2) {
                String name = keyval[0];
                String value = keyval[1];
                if (name.equals("tmpjars") || name.equals("tmpfiles")) {
                    addToStringCollection(conf, name, value);
                } else {
                    conf.set(name, value);
                }
            } else {
                // previously malformed entries were silently dropped
                LOG.warn("processHiveConfOptions: ignoring malformed " +
                        "-hiveconf argument (expected key=value): " + hiveconf);
            }
        }
    }

    /**
     * Parse "key1=value1,key2=value2" style partition specifications.
     *
     * @param outputTablePartitionString table partition string
     * @return Map of partition key to value, or null if no string given
     */
    public static Map<String, String> parsePartitionValues(String outputTablePartitionString) {
        if (outputTablePartitionString == null) {
            return null;
        }
        Splitter commaSplitter = Splitter.on(',').omitEmptyStrings().trimResults();
        Splitter equalSplitter = Splitter.on('=').omitEmptyStrings().trimResults();
        Map<String, String> partitionValues = Maps.newHashMap();
        for (String keyValStr : commaSplitter.split(outputTablePartitionString)) {
            List<String> keyVal = Lists.newArrayList(equalSplitter.split(keyValStr));
            if (keyVal.size() != 2) {
                throw new IllegalArgumentException(
                        "Unrecognized partition value format: " + outputTablePartitionString);
            }
            partitionValues.put(keyVal.get(0), keyVal.get(1));
        }
        return partitionValues;
    }

    /**
     * Add hive-related options to command line parser options
     *
     * @param options Options to use
     */
    private void addOptions(Options options) {
        options.addOption("h", "help", false, "Help");
        options.addOption("v", "verbose", false, "Verbose");
        options.addOption("D", "hiveconf", true, "property=value for Hive/Hadoop configuration");
        options.addOption("w", "workers", true, "Number of workers");

        if (vertexClass == null) {
            options.addOption(null, "vertexClass", true, "Giraph Vertex class to use");
        }

        options.addOption("db", "dbName", true, "Hive database name");

        // Vertex input settings
        options.addOption(null, "hiveToVertexClass", true,
                "Giraph " + HiveToVertex.class.getSimpleName() + " class to use (default - "
                        + (hiveToVertexClass == null ? "not used" : hiveToVertexClass.getSimpleName()) + "), "
                        + "\"disable\" will unset this option");
        options.addOption("vi", "vertexInputTable", true, "Vertex input table name");
        options.addOption("VI", "vertexInputFilter", true,
                "Vertex input table filter expression (e.g., \"a<2 AND b='two'\"");

        // Edge input settings
        options.addOption(null, "hiveToEdgeClass", true,
                "Giraph " + HiveToEdge.class.getSimpleName() + " class to use (default - "
                        + (hiveToEdgeClass == null ? "not used" : hiveToEdgeClass.getSimpleName()) + "), "
                        + "\"disable\" will unset this option");
        options.addOption("ei", "edgeInputTable", true, "Edge input table name");
        options.addOption("EI", "edgeInputFilter", true,
                "Edge input table filter expression (e.g., \"a<2 AND b='two'\"");

        // Vertex output settings
        if (vertexToHiveClass == null) {
            options.addOption(null, "vertexToHiveClass", true,
                    "Giraph " + VertexToHive.class.getSimpleName() + " class to use");
        }

        options.addOption("o", "outputTable", true, "Output table name");
        options.addOption("O", "outputPartition", true, "Output table partition values (e.g., \"a=1,b=two\")");
        options.addOption("s", "skipOutput", false, "Skip output?");
    }

    /**
    * add string to collection
    * @param conf Configuration
    * @param name name to add
    * @param values values for collection
    */
    private static void addToStringCollection(Configuration conf, String name, String... values) {
        addToStringCollection(conf, name, Arrays.asList(values));
    }

    /**
    * add string to collection
    * @param conf Configuration
    * @param name to add
    * @param values values for collection
    */
    private static void addToStringCollection(Configuration conf, String name,
            Collection<? extends String> values) {
        Collection<String> tmpfiles = conf.getStringCollection(name);
        tmpfiles.addAll(values);
        conf.setStrings(name, tmpfiles.toArray(new String[tmpfiles.size()]));
    }

    /**
    * Load a class by name and check it is a subtype of the given base class.
    *
    * @param className to find
    * @param base  base class
    * @param <T> class type found
    * @return type found
    * @throws IllegalArgumentException if the class cannot be loaded or is
    *         not assignable to {@code base}
    */
    private <T> Class<? extends T> findClass(String className, Class<T> base) {
        try {
            Class<?> cls = Class.forName(className);
            if (base.isAssignableFrom(cls)) {
                return cls.asSubclass(base);
            }
            // Previously this silently returned null, producing misleading
            // errors downstream; fail fast with an explicit message instead.
            throw new IllegalArgumentException(
                    className + ": not assignable to " + base.getName());
        } catch (ClassNotFoundException e) {
            // keep the original exception as the cause so the underlying
            // classloading problem is not lost
            throw new IllegalArgumentException(className + ": Invalid class name", e);
        }
    }

    @Override
    public final Configuration getConf() {
        return conf;
    }

    @Override
    public final void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
    * Override this method to add more command-line options. You can process
    * them by also overriding {@link #processMoreArguments(CommandLine)}.
    *
    * @param options Options
    */
    protected void addMoreOptions(Options options) {
    }

    /**
    * Override this method to process additional command-line arguments. You
    * may want to declare additional options by also overriding
    * {@link #addMoreOptions(org.apache.commons.cli.Options)}.
    *
    * @param cmd Command
    */
    protected void processMoreArguments(CommandLine cmd) {
    }

    /**
    * Override this method to do additional setup with the GiraphJob that will
    * run.
    *
    * @param job GiraphJob that is going to run
    */
    protected void initGiraphJob(GiraphJob job) {
    }

    /**
     * Log the options set by user
     *
     * @param giraphConf GiraphConfiguration
     */
    private void logOptions(GiraphConfiguration giraphConf) {
        GiraphClasses classes = new GiraphClasses(giraphConf);

        LOG.info(getClass().getSimpleName() + " with");

        LOG.info(LOG_PREFIX + "-vertexClass=" + vertexClass.getCanonicalName());

        if (hiveToVertexClass != null) {
            LOG.info(LOG_PREFIX + "-hiveToVertexClass=" + hiveToVertexClass.getCanonicalName());
        }
        if (classes.getVertexInputFormatClass() != null) {
            LOG.info(LOG_PREFIX + "-vertexInputFormatClass="
                    + classes.getVertexInputFormatClass().getCanonicalName());
            logInputDesc(hiveVertexInputDescription, "vertex");
        }

        if (hiveToEdgeClass != null) {
            LOG.info(LOG_PREFIX + "-hiveToEdgeClass=" + hiveToEdgeClass.getCanonicalName());
        }
        if (classes.getEdgeInputFormatClass() != null) {
            LOG.info(LOG_PREFIX + "-edgeInputFormatClass=" + classes.getEdgeInputFormatClass().getCanonicalName());
            logInputDesc(hiveEdgeInputDescription, "edge");
        }

        LOG.info(LOG_PREFIX + "-outputTable=" + hiveOutputDescription.getTableName());
        if (hiveOutputDescription.hasPartitionValues()) {
            LOG.info(LOG_PREFIX + "-outputPartition=\"" + hiveOutputDescription.getPartitionValues() + "\"");
        }
        if (classes.getVertexOutputFormatClass() != null) {
            LOG.info(LOG_PREFIX + "-outputFormatClass=" + classes.getVertexOutputFormatClass().getCanonicalName());
        }

        LOG.info(LOG_PREFIX + "-workers=" + workers);
    }

    /**
     * Helper to log input description with a name
     *
     * @param inputDesc input description to log
     * @param name String prefix name
     */
    private void logInputDesc(HiveInputDescription inputDesc, String name) {
        if (inputDesc.hasTableName()) {
            LOG.info(LOG_PREFIX + "-" + name + "InputTable=" + inputDesc.getTableName());
        }
        if (inputDesc.hasPartitionFilter()) {
            LOG.info(LOG_PREFIX + "-" + name + "InputFilter=\"" + inputDesc.getPartitionFilter() + "\"");
        }
    }
}