org.apache.hadoop.hive.ql.exec.spark.HiveSparkClientFactory.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.exec.spark.HiveSparkClientFactory.java, the factory class that builds the Spark configuration for Hive on Spark and returns either a local or a remote HiveSparkClient depending on the configured spark.master.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.spark;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.compress.utils.CharsetNames;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.hive.common.LogUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hive.spark.client.SparkClientUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConfUtil;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hive.spark.client.rpc.RpcConfiguration;
import org.apache.spark.SparkConf;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Sets;

public class HiveSparkClientFactory {
    protected static final transient Logger LOG = LoggerFactory.getLogger(HiveSparkClientFactory.class);

    private static final String SPARK_DEFAULT_CONF_FILE = "spark-defaults.conf";
    private static final String SPARK_DEFAULT_MASTER = "yarn";
    private static final String SPARK_DEFAULT_DEPLOY_MODE = "cluster";
    private static final String SPARK_DEFAULT_APP_NAME = "Hive on Spark";
    private static final String SPARK_DEFAULT_SERIALIZER = "org.apache.spark.serializer.KryoSerializer";
    private static final String SPARK_DEFAULT_REFERENCE_TRACKING = "false";
    private static final String SPARK_WAIT_APP_COMPLETE = "spark.yarn.submit.waitAppCompletion";
    private static final String SPARK_DEPLOY_MODE = "spark.submit.deployMode";

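    /**
     * Creates a {@link HiveSparkClient} for the given Hive configuration. When
     * spark.master is local, the shared {@link LocalHiveSparkClient} instance is
     * returned; otherwise a new {@link RemoteHiveSparkClient} is created.
     */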
    public static HiveSparkClient createHiveSparkClient(HiveConf hiveconf) throws Exception {
        Map<String, String> sparkConf = initiateSparkConf(hiveconf);
        // Submit the Spark job through a local Spark context when the Spark master runs in
        // local mode; otherwise submit it through a remote Spark context.
        String master = sparkConf.get("spark.master");
        if (master.equals("local") || master.startsWith("local[")) {
            // With local spark context, all user sessions share the same spark context.
            return LocalHiveSparkClient.getInstance(generateSparkConf(sparkConf));
        } else {
            return new RemoteHiveSparkClient(hiveconf, sparkConf);
        }
    }

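    /**
     * Builds the Spark configuration map by layering, in order: hard-coded defaults,
     * properties from spark-defaults.conf on the classpath, and relevant entries
     * (spark.*, yarn.*, HBase, Oozie, and remote driver RPC properties) from the
     * Hive configuration.
     */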
    public static Map<String, String> initiateSparkConf(HiveConf hiveConf) {
        Map<String, String> sparkConf = new HashMap<String, String>();
        HBaseConfiguration.addHbaseResources(hiveConf);

        // set default spark configurations.
        sparkConf.put("spark.master", SPARK_DEFAULT_MASTER);
        final String appNameKey = "spark.app.name";
        String appName = hiveConf.get(appNameKey);
        if (appName == null) {
            appName = SPARK_DEFAULT_APP_NAME;
        }
        sparkConf.put(appNameKey, appName);
        sparkConf.put("spark.serializer", SPARK_DEFAULT_SERIALIZER);
        sparkConf.put("spark.kryo.referenceTracking", SPARK_DEFAULT_REFERENCE_TRACKING);

        // load properties from spark-defaults.conf.
        InputStream inputStream = null;
        try {
            inputStream = HiveSparkClientFactory.class.getClassLoader()
                    .getResourceAsStream(SPARK_DEFAULT_CONF_FILE);
            if (inputStream != null) {
                LOG.info("loading spark properties from: " + SPARK_DEFAULT_CONF_FILE);
                Properties properties = new Properties();
                properties.load(new InputStreamReader(inputStream, CharsetNames.UTF_8));
                for (String propertyName : properties.stringPropertyNames()) {
                    if (propertyName.startsWith("spark")) {
                        String value = properties.getProperty(propertyName);
                        sparkConf.put(propertyName, value);
                        LOG.info(String.format("load spark property from %s (%s -> %s).", SPARK_DEFAULT_CONF_FILE,
                                propertyName, LogUtils.maskIfPassword(propertyName, value)));
                    }
                }
            }
        } catch (IOException e) {
            LOG.info("Failed to open spark configuration file: " + SPARK_DEFAULT_CONF_FILE, e);
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    LOG.debug("Failed to close inputstream.", e);
                }
            }
        }

        // Load properties from the Hive configuration, including spark.* properties,
        // properties for the remote driver RPC, and YARN properties for Spark on YARN mode.
        String sparkMaster = hiveConf.get("spark.master");
        if (sparkMaster == null) {
            sparkMaster = sparkConf.get("spark.master");
            hiveConf.set("spark.master", sparkMaster);
        }
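        // Resolve the deploy mode for non-local masters: prefer the Hive configuration,
        // then spark-defaults.conf, then the mode encoded in the master URL, and finally
        // fall back to the default ("cluster").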
        String deployMode = null;
        if (!SparkClientUtilities.isLocalMaster(sparkMaster)) {
            deployMode = hiveConf.get(SPARK_DEPLOY_MODE);
            if (deployMode == null) {
                deployMode = sparkConf.get(SPARK_DEPLOY_MODE);
                if (deployMode == null) {
                    deployMode = SparkClientUtilities.getDeployModeFromMaster(sparkMaster);
                }
                if (deployMode == null) {
                    deployMode = SPARK_DEFAULT_DEPLOY_MODE;
                }
                hiveConf.set(SPARK_DEPLOY_MODE, deployMode);
            }
        }
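        // Propagate the resolved master and deploy mode to the current session's configuration.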
        if (SessionState.get() != null && SessionState.get().getConf() != null) {
            SessionState.get().getConf().set("spark.master", sparkMaster);
            if (deployMode != null) {
                SessionState.get().getConf().set(SPARK_DEPLOY_MODE, deployMode);
            }
        }
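        // In yarn-cluster mode, limit the Spark application to a single YARN attempt.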
        if (SparkClientUtilities.isYarnClusterMode(sparkMaster, deployMode)) {
            sparkConf.put("spark.yarn.maxAppAttempts", "1");
        }
        for (Map.Entry<String, String> entry : hiveConf) {
            String propertyName = entry.getKey();
            if (propertyName.startsWith("spark")) {
                String value = hiveConf.get(propertyName);
                sparkConf.put(propertyName, value);
                LOG.info(String.format("load spark property from hive configuration (%s -> %s).", propertyName,
                        LogUtils.maskIfPassword(propertyName, value)));
            } else if (propertyName.startsWith("yarn") && SparkClientUtilities.isYarnMaster(sparkMaster)) {
                String value = hiveConf.get(propertyName);
                // Add the spark.hadoop prefix to YARN properties, since SparkConf only accepts
                // properties starting with the spark prefix; Spark later strips the spark.hadoop
                // prefix and adds the properties to its Hadoop configuration.
                sparkConf.put("spark.hadoop." + propertyName, value);
                LOG.info(String.format("load yarn property from hive configuration in %s mode (%s -> %s).",
                        sparkMaster, propertyName, LogUtils.maskIfPassword(propertyName, value)));
            } else if (propertyName.equals(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY)) {
                String value = hiveConf.get(propertyName);
                if (value != null && !value.isEmpty()) {
                    sparkConf.put("spark.hadoop." + propertyName, value);
                }
            } else if (propertyName.startsWith("hbase") || propertyName.startsWith("zookeeper.znode")) {
                // Add HBase-related configuration to Spark because, in secure mode, Spark needs
                // it to generate the HBase delegation token. This is a temporary solution to work
                // around a Spark problem.
                String value = hiveConf.get(propertyName);
                sparkConf.put("spark.hadoop." + propertyName, value);
                LOG.info(String.format("load HBase configuration (%s -> %s).", propertyName,
                        LogUtils.maskIfPassword(propertyName, value)));
            } else if (propertyName.startsWith("oozie")) {
                String value = hiveConf.get(propertyName);
                sparkConf.put("spark." + propertyName, value);
                LOG.info(String.format("Pass Oozie configuration (%s -> %s).", propertyName,
                        LogUtils.maskIfPassword(propertyName, value)));
            }

            if (RpcConfiguration.HIVE_SPARK_RSC_CONFIGS.contains(propertyName)) {
                String value = RpcConfiguration.getValue(hiveConf, propertyName);
                sparkConf.put(propertyName, value);
                LOG.info(String.format("load RPC property from hive configuration (%s -> %s).", propertyName,
                        LogUtils.maskIfPassword(propertyName, value)));
            }
        }

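        // Register Hive's key serialization classes with Kryo, preserving any classes
        // the user already listed in spark.kryo.classesToRegister.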
        Set<String> classes = Sets.newHashSet(Splitter.on(",").trimResults().omitEmptyStrings()
                .split(Strings.nullToEmpty(sparkConf.get("spark.kryo.classesToRegister"))));
        classes.add(Writable.class.getName());
        classes.add(VectorizedRowBatch.class.getName());
        classes.add(BytesWritable.class.getName());
        classes.add(HiveKey.class.getName());
        sparkConf.put("spark.kryo.classesToRegister", Joiner.on(",").join(classes));

        // set yarn queue name
        final String sparkQueueNameKey = "spark.yarn.queue";
        if (SparkClientUtilities.isYarnMaster(sparkMaster) && hiveConf.get(sparkQueueNameKey) == null) {
            String queueName = hiveConf.get("mapreduce.job.queuename");
            if (queueName != null) {
                sparkConf.put(sparkQueueNameKey, queueName);
            }
        }

        // Disable waiting for app completion to avoid verbose app state reporting in yarn-cluster mode.
        if (SparkClientUtilities.isYarnClusterMode(sparkMaster, deployMode)
                && sparkConf.get(SPARK_WAIT_APP_COMPLETE) == null) {
            sparkConf.put(SPARK_WAIT_APP_COMPLETE, "false");
        }

        // Set the credential provider password if one is found. If there is a job-specific
        // password, the credential provider location is set directly in the execute method of
        // LocalSparkClient and the submit method of RemoteHiveSparkClient when the job config is created.
        String password = HiveConfUtil.getJobCredentialProviderPassword(hiveConf);
        if (password != null) {
            addCredentialProviderPassword(sparkConf, password);
        }
        return sparkConf;
    }

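    /**
     * Exposes the credential store password to the Spark application master and
     * executors through their HADOOP_CREDSTORE_PASSWORD environment variables.
     */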
    private static void addCredentialProviderPassword(Map<String, String> sparkConf, String jobCredstorePassword) {
        sparkConf.put("spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD", jobCredstorePassword);
        sparkConf.put("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", jobCredstorePassword);
    }

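    /**
     * Copies the given properties into a {@link SparkConf} created with
     * loadDefaults = false, so it does not pick up spark.* JVM system properties.
     */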
    static SparkConf generateSparkConf(Map<String, String> conf) {
        SparkConf sparkConf = new SparkConf(false);
        for (Map.Entry<String, String> entry : conf.entrySet()) {
            sparkConf.set(entry.getKey(), entry.getValue());
        }
        return sparkConf;
    }
}
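
Example

The following is a minimal usage sketch, not part of the original file, showing how the factory might be driven from a HiveConf. The class name HiveSparkClientFactoryExample and the "local[2]" master value are illustrative assumptions; the factory call matches the createHiveSparkClient(HiveConf) signature shown above, and close() is assumed from HiveSparkClient's Closeable contract.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.spark.HiveSparkClient;
import org.apache.hadoop.hive.ql.exec.spark.HiveSparkClientFactory;

public class HiveSparkClientFactoryExample {
    public static void main(String[] args) throws Exception {
        HiveConf conf = new HiveConf();
        // A local master makes createHiveSparkClient return the shared LocalHiveSparkClient;
        // any non-local master produces a RemoteHiveSparkClient instead.
        conf.set("spark.master", "local[2]");

        HiveSparkClient client = HiveSparkClientFactory.createHiveSparkClient(conf);
        try {
            // Submit Hive-on-Spark work through the client here.
        } finally {
            // close() is assumed from HiveSparkClient's Closeable contract.
            client.close();
        }
    }
}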