org.apache.giraph.hive.common.HiveUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.giraph.hive.common.HiveUtils.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.giraph.hive.common;

import org.apache.giraph.conf.ImmutableClassesGiraphConfiguration;
import org.apache.giraph.conf.StrConfOption;
import org.apache.giraph.hive.input.mapping.HiveToMapping;
import org.apache.giraph.hive.input.edge.HiveToEdge;
import org.apache.giraph.hive.input.vertex.HiveToVertex;
import org.apache.giraph.hive.output.VertexToHive;
import org.apache.giraph.utils.ReflectionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.log4j.Logger;

import com.facebook.hiveio.schema.HiveTableSchema;
import com.facebook.hiveio.schema.HiveTableSchemas;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import static java.lang.System.getenv;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_EDGE_INPUT;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_MAPPING_INPUT;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_INPUT;
import static org.apache.giraph.hive.common.GiraphHiveConstants.VERTEX_TO_HIVE_CLASS;

/**
 * Utility methods for Hive IO
 */
@SuppressWarnings("unchecked")
public class HiveUtils {
    /** Logger */
    private static final Logger LOG = Logger.getLogger(HiveUtils.class);

    /** Do not instantiate: every member of this class is static. */
    private HiveUtils() {
    }

    /**
     * Parse a comma-separated list of {@code key=value} entries into a map.
     *
     * Empty entries are skipped, and whitespace around every entry, key and
     * value is trimmed.
     *
     * @param outputTablePartitionString table partition string, may be null
     * @return mutable map from key to value, or null if the input is null
     * @throws IllegalArgumentException if an entry does not split on '=' into
     *                                  exactly two non-empty parts
     */
    public static Map<String, String> parsePartitionValues(String outputTablePartitionString) {
        if (outputTablePartitionString == null) {
            return null;
        }
        Splitter entrySplitter = Splitter.on(',').omitEmptyStrings().trimResults();
        Splitter pairSplitter = Splitter.on('=').omitEmptyStrings().trimResults();
        Map<String, String> result = Maps.newHashMap();
        for (String entry : entrySplitter.split(outputTablePartitionString)) {
            List<String> parts = Lists.newArrayList(pairSplitter.split(entry));
            if (parts.size() == 2) {
                result.put(parts.get(0), parts.get(1));
            } else {
                throw new IllegalArgumentException(
                        "Unrecognized partition value format: " + outputTablePartitionString);
            }
        }
        return result;
    }

    /**
     * Resolve the position of a named column in a {@link HiveTableSchema}.
     *
     * @param schema {@link HiveTableSchema} to search
     * @param columnName name of the column to look up
     * @return position reported by the schema for the column
     * @throws IllegalArgumentException if the schema reports -1 for the column
     */
    public static int columnIndexOrThrow(HiveTableSchema schema, String columnName) {
        final int position = schema.positionOf(columnName);
        if (position != -1) {
            return position;
        }
        throw new IllegalArgumentException(
                "Column " + columnName + " not found in table " + schema.getTableDesc());
    }

    /**
     * Resolve the position of a column whose name is read from the
     * configuration through the given option.
     *
     * @param schema {@link HiveTableSchema} to search
     * @param conf {@link Configuration} holding the column name
     * @param confOption {@link StrConfOption} used to read the column name
     * @return position of the configured column in the schema
     * @throws IllegalArgumentException if the option yields no value, or the
     *                                  column is not found in the schema
     */
    public static int columnIndexOrThrow(HiveTableSchema schema, Configuration conf, StrConfOption confOption) {
        final String configuredName = confOption.get(conf);
        if (configuredName != null) {
            return columnIndexOrThrow(schema, configuredName);
        }
        throw new IllegalArgumentException("Column " + confOption.getKey() + " not set in configuration");
    }

    /**
     * Register the hive-site.xml found through the configuration's class
     * loader in the tmpfiles property of the Configuration.
     *
     * @param conf Configuration to update
     */
    public static void addHiveSiteXmlToTmpFiles(Configuration conf) {
        // Workers register output partitions with the metastore during
        // cleanup, and HiveConf looks for hive-site.xml when it initializes.
        final String hiveSiteResource = "hive-site.xml";
        addToHiveFromClassLoader(conf, hiveSiteResource);
    }

    /**
     * Register additional Hive site files in the tmpfiles property of the
     * Configuration: hive-site-custom.xml looked up through the class loader,
     * and conf/hive-site.xml located under the HIVE_HOME environment variable.
     *
     * Both lookups are attempted; their results are ignored.
     *
     * @param conf Configuration to update
     */
    public static void addHiveSiteCustomXmlToTmpFiles(Configuration conf) {
        final String customSiteResource = "hive-site-custom.xml";
        addToHiveFromClassLoader(conf, customSiteResource);

        final String hiveHomeEnvKey = "HIVE_HOME";
        final String siteFileUnderHome = "conf/hive-site.xml";
        addToHiveFromEnv(conf, hiveHomeEnvKey, siteFileUnderHome);
    }

    /**
     * Add a file to Configuration tmpfiles, locating it relative to the
     * directory named by an environment variable.
     *
     * Nothing is added when the environment variable is unset, when the file
     * does not exist, or when the file cannot be converted to a URL.
     *
     * @param conf Configuration
     * @param envKey environment variable key holding the base directory
     * @param path path of the file relative to that directory
     * @return true if file found and added, false otherwise
     */
    private static boolean addToHiveFromEnv(Configuration conf, String envKey, String path) {
        String envValue = getenv(envKey);
        if (envValue == null) {
            return false;
        }
        File file = new File(envValue, path);
        if (!file.exists()) {
            // Do not register a path that is not there: the entry would be
            // useless at best, and the documented contract is to report
            // "not found" in this case.
            return false;
        }
        String fileUrl;
        try {
            fileUrl = file.toURI().toURL().toString();
        } catch (MalformedURLException e) {
            // Keep the cause in the log and report that nothing was added.
            LOG.error("Failed to get URL for file " + file, e);
            return false;
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("addToHiveFromEnv: Adding " + file.getPath() + " to Configuration tmpfiles");
        }
        addToStringCollection(conf, "tmpfiles", fileUrl);
        return true;
    }

    /**
     * Add a class loader resource to Configuration tmpfiles.
     *
     * The resource is looked up through the class loader of the given
     * Configuration.
     *
     * @param conf Configuration
     * @param name resource name
     * @return true if the resource was found and added, false otherwise
     */
    private static boolean addToHiveFromClassLoader(Configuration conf, String name) {
        final URL resource = conf.getClassLoader().getResource(name);
        if (resource != null) {
            if (LOG.isInfoEnabled()) {
                LOG.info("addToHiveFromClassLoader: Adding " + name + " at " + resource + " to Configuration tmpfiles");
            }
            addToStringCollection(conf, "tmpfiles", resource.toString());
            return true;
        }
        return false;
    }

    /**
     * Add jars from HADOOP_CLASSPATH environment variable to tmpjars property
     * in Configuration.
     *
     * The variable is split on {@link File#pathSeparator}; only entries that
     * name an existing regular file are added, as file URIs. The Configuration
     * is left untouched when the variable is unset or when no entry qualifies.
     *
     * @param conf Configuration
     */
    public static void addHadoopClasspathToTmpJars(Configuration conf) {
        // Or, more effectively, we can provide all the jars client needed to
        // the workers as well
        String hadoopClasspath = getenv("HADOOP_CLASSPATH");
        if (hadoopClasspath == null) {
            return;
        }
        List<String> hadoopJarURLs = Lists.newArrayList();
        for (String jarPath : hadoopClasspath.split(File.pathSeparator)) {
            File file = new File(jarPath);
            // isFile() is false for missing paths, so no separate exists() check.
            if (file.isFile()) {
                hadoopJarURLs.add(file.toURI().toString());
            }
        }
        // Guard on what was actually found rather than on the number of
        // classpath entries: rewriting tmpjars with nothing to add could turn
        // an unset property into an empty string.
        if (!hadoopJarURLs.isEmpty()) {
            addToStringCollection(conf, "tmpjars", hadoopJarURLs);
        }
    }

    /**
     * Apply every -hiveconf argument to the Configuration, in array order.
     *
     * @param hiveconfArgs array of hiveconf args
     * @param conf Configuration to update
     */
    public static void processHiveconfOptions(String[] hiveconfArgs, Configuration conf) {
        for (int i = 0; i < hiveconfArgs.length; i++) {
            processHiveconfOption(conf, hiveconfArgs[i]);
        }
    }

    /**
     * Apply a single -hiveconf option of the form {@code name=value} to the
     * Configuration.
     *
     * The option is split at the first '='. Values for "tmpjars" and
     * "tmpfiles" are appended to the existing collection; any other name is
     * set directly. An option without '=' is ignored.
     *
     * @param conf Configuration to update
     * @param hiveconf option to process
     */
    public static void processHiveconfOption(Configuration conf, String hiveconf) {
        final String[] parts = hiveconf.split("=", 2);
        if (parts.length != 2) {
            return;
        }
        final String name = parts[0];
        final String value = parts[1];
        final boolean appendsToCollection = name.equals("tmpjars") || name.equals("tmpfiles");
        if (!appendsToCollection) {
            conf.set(name, value);
            return;
        }
        addToStringCollection(conf, name, value);
    }

    /**
     * Append the given strings to the string collection stored under a key.
     *
     * @param conf Configuration to update
     * @param key key whose collection is extended
     * @param values values to append
     */
    public static void addToStringCollection(Configuration conf, String key, String... values) {
        final List<String> valueList = Arrays.asList(values);
        addToStringCollection(conf, key, valueList);
    }

    /**
     * Append the given strings to the string collection stored under a key.
     *
     * The current collection is read from the Configuration, the new values
     * are added to it, and the combined result is written back under the
     * same key.
     *
     * @param conf Configuration to update
     * @param key key whose collection is extended
     * @param values values to append
     */
    public static void addToStringCollection(Configuration conf, String key, Collection<String> values) {
        final Collection<String> merged = conf.getStringCollection(key);
        merged.addAll(values);
        final String[] mergedArray = new String[merged.size()];
        conf.setStrings(key, merged.toArray(mergedArray));
    }

    /**
     * Instantiate the {@link VertexToHive} implementation read from the
     * configuration via {@code VERTEX_TO_HIVE_CLASS}.
     *
     * @param <I> Vertex ID
     * @param <V> Vertex Value
     * @param <E> Edge Value
     * @param conf Configuration
     * @param schema Hive table schema
     * @return new VertexToHive instance
     * @throws IOException if no class is set in the configuration
     */
    public static <I extends WritableComparable, V extends Writable, E extends Writable> VertexToHive<I, V, E> newVertexToHive(
            ImmutableClassesGiraphConfiguration<I, V, E> conf, HiveTableSchema schema) throws IOException {
        Class<? extends VertexToHive> writerClass = VERTEX_TO_HIVE_CLASS.get(conf);
        if (writerClass != null) {
            return newInstance(writerClass, conf, schema);
        }
        throw new IOException(VERTEX_TO_HIVE_CLASS.getKey() + " not set in conf");
    }

    /**
     * Instantiate the {@link HiveToEdge} implementation read from the
     * configuration via {@code HIVE_EDGE_INPUT}.
     *
     * @param <I> Vertex ID
     * @param <V> Vertex Value
     * @param <E> Edge Value
     * @param conf Configuration
     * @param schema Hive table schema
     * @return new HiveToEdge instance
     * @throws IllegalArgumentException if no class is set in the configuration
     */
    public static <I extends WritableComparable, V extends Writable, E extends Writable> HiveToEdge<I, E> newHiveToEdge(
            ImmutableClassesGiraphConfiguration<I, V, E> conf, HiveTableSchema schema) {
        Class<? extends HiveToEdge> edgeReaderClass = HIVE_EDGE_INPUT.getClass(conf);
        if (edgeReaderClass != null) {
            return newInstance(edgeReaderClass, conf, schema);
        }
        throw new IllegalArgumentException(HIVE_EDGE_INPUT.getClassOpt().getKey() + " not set in conf");
    }

    /**
     * Instantiate the {@link HiveToVertex} implementation read from the
     * configuration via {@code HIVE_VERTEX_INPUT}.
     *
     * @param <I> Vertex ID
     * @param <V> Vertex Value
     * @param <E> Edge Value
     * @param conf Configuration
     * @param schema Hive table schema
     * @return new HiveToVertex instance
     * @throws IllegalArgumentException if no class is set in the configuration
     */
    public static <I extends WritableComparable, V extends Writable, E extends Writable> HiveToVertex<I, V, E> newHiveToVertex(
            ImmutableClassesGiraphConfiguration<I, V, E> conf, HiveTableSchema schema) {
        Class<? extends HiveToVertex> vertexReaderClass = HIVE_VERTEX_INPUT.getClass(conf);
        if (vertexReaderClass != null) {
            return newInstance(vertexReaderClass, conf, schema);
        }
        throw new IllegalArgumentException(HIVE_VERTEX_INPUT.getClassOpt().getKey() + " not set in conf");
    }

    /**
     * Instantiate the {@link HiveToMapping} implementation read from the
     * configuration via {@code HIVE_MAPPING_INPUT}.
     *
     * @param conf ImmutableClassesGiraphConfiguration
     * @param schema HiveTableSchema
     * @param <I> vertexId type
     * @param <V> vertexValue type
     * @param <E> edgeValue type
     * @param <B> mappingTarget type
     * @return new HiveToMapping instance
     * @throws IllegalArgumentException if no class is set in the configuration
     */
    public static <I extends WritableComparable, V extends Writable, E extends Writable, B extends Writable> HiveToMapping<I, B> newHiveToMapping(
            ImmutableClassesGiraphConfiguration<I, V, E> conf, HiveTableSchema schema) {
        Class<? extends HiveToMapping> mappingReaderClass = HIVE_MAPPING_INPUT.getClass(conf);
        if (mappingReaderClass != null) {
            return newInstance(mappingReaderClass, conf, schema);
        }
        throw new IllegalArgumentException(HIVE_MAPPING_INPUT.getClassOpt().getKey() + " not set in conf");
    }

    /**
     * Create an instance of the given class through {@link ReflectionUtils},
     * passing it the configuration, then hand the instance and the Hive table
     * schema to {@link HiveTableSchemas#configure} so the schema is set if the
     * object supports it.
     *
     * @param klass Class to create
     * @param conf {@link ImmutableClassesGiraphConfiguration} to configure with
     * @param schema {@link HiveTableSchema} from Hive to set
     * @param <I> Vertex ID
     * @param <V> Vertex Value
     * @param <E> Edge Value
     * @param <T> type being created
     * @return new object of type {@code T}
     */
    public static <I extends WritableComparable, V extends Writable, E extends Writable, T> T newInstance(
            Class<T> klass, ImmutableClassesGiraphConfiguration<I, V, E> conf, HiveTableSchema schema) {
        final T instance = ReflectionUtils.<T>newInstance(klass, conf);
        HiveTableSchemas.configure(instance, schema);
        return instance;
    }
}