Java tutorial
/*
 * Copyright 2012 LinkedIn Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package azkaban.jobtype;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Scanner;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.cli.CliDriver;
import org.apache.hadoop.hive.cli.CliSessionState;
import org.apache.hadoop.hive.cli.OptionsProcessor;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.shims.ShimLoader;

import azkaban.reportal.util.BoundedOutputStream;

public class ReportalHiveRunner extends ReportalAbstractRunner {

  public ReportalHiveRunner(String jobName, Properties props) {
    super(props);
  }

  @Override
  protected void runReportal() throws Exception {
    System.out.println("Reportal Hive: Setting up Hive");
    HiveConf conf = new HiveConf(SessionState.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
      conf.set("mapreduce.job.credentials.binary",
          System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    File tempTSVFile = new File("./temp.tsv");
    OutputStream tsvTempOutputStream = new BoundedOutputStream(
        new BufferedOutputStream(new FileOutputStream(tempTSVFile)), outputCapacity);
    PrintStream logOut = System.out;

    // NOTE: It is critical to do this here so that log4j is reinitialized
    // before any of the other core hive classes are loaded
    // criccomini@linkedin.com: I disabled this because it appears to swallow
    // all future logging (even outside of hive).
    // SessionState.initHiveLog4j();

    String orig = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS);

    CliSessionState sessionState = new CliSessionState(conf);
    sessionState.in = System.in;
    sessionState.out = new PrintStream(tsvTempOutputStream, true, "UTF-8");
    sessionState.err = new PrintStream(logOut, true, "UTF-8");

    OptionsProcessor oproc = new OptionsProcessor();

    // Feed in Hive Args
    String[] args = buildHiveArgs();
    if (!oproc.process_stage1(args)) {
      throw new Exception("unable to parse options stage 1");
    }

    if (!oproc.process_stage2(sessionState)) {
      throw new Exception("unable to parse options stage 2");
    }

    // Set all properties specified via command line
    for (Map.Entry<Object, Object> item : sessionState.cmdProperties.entrySet()) {
      conf.set((String) item.getKey(), (String) item.getValue());
    }

    SessionState.start(sessionState);

    String expanded = expandHiveAuxJarsPath(orig);
    if (orig == null || orig.equals(expanded)) {
      System.out.println("Hive aux jars variable not expanded");
    } else {
      System.out.println("Expanded aux jars variable from [" + orig + "] to [" + expanded + "]");
      HiveConf.setVar(conf, HiveConf.ConfVars.HIVEAUXJARS, expanded);
    }

    if (!ShimLoader.getHadoopShims().usesJobShell()) {
      // hadoop-20 and above - we need to augment classpath using hiveconf
      // components
      // see also: code in ExecDriver.java
      ClassLoader loader = conf.getClassLoader();
      String auxJars = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS);
      System.out.println("Got auxJars = " + auxJars);

      if (StringUtils.isNotBlank(auxJars)) {
        loader = Utilities.addToClassPath(loader, StringUtils.split(auxJars, ","));
      }
      conf.setClassLoader(loader);
      Thread.currentThread().setContextClassLoader(loader);
    }

    CliDriver cli = new CliDriver();
    int returnValue = 0;
    String prefix = "";

    returnValue = cli.processLine("set hive.cli.print.header=true;");
    String[] queries = jobQuery.split("\n");
    for (String line : queries) {
      if (!prefix.isEmpty()) {
        prefix += '\n';
      }

      if (line.trim().endsWith(";") && !line.trim().endsWith("\\;")) {
        line = prefix + line;
        line = injectVariables(line);
        System.out.println("Reportal Hive: Running Hive Query: " + line);
        System.out.println("Reportal Hive: HiveConf HIVEAUXJARS: "
            + HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS));
        returnValue = cli.processLine(line);
        prefix = "";
      } else {
        prefix = prefix + line;
        continue;
      }
    }

    tsvTempOutputStream.close();

    // Convert the TSV output to CSV and write it to disk
    System.out.println("Reportal Hive: Converting output");
    InputStream tsvTempInputStream = new BufferedInputStream(new FileInputStream(tempTSVFile));
    Scanner rowScanner = new Scanner(tsvTempInputStream);
    PrintStream csvOutputStream = new PrintStream(outputStream);
    while (rowScanner.hasNextLine()) {
      String tsvLine = rowScanner.nextLine();
      // strip all quotes, and then quote the columns
      csvOutputStream.println("\"" + tsvLine.replace("\"", "").replace("\t", "\",\"") + "\"");
    }
    rowScanner.close();
    csvOutputStream.close();

    // Clean up the temp file
    tempTSVFile.delete();

    if (returnValue != 0) {
      throw new Exception("Hive query finished with a non zero return code");
    }

    System.out.println("Reportal Hive: Ended successfully");
  }

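  /**
   * Builds the command-line arguments handed to Hive's OptionsProcessor.
   * Each configuration entry collected below is emitted as a
   * "--hiveconf key=value" pair.
   */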
  private String[] buildHiveArgs() {
    String hadoopBinDir = props.getString("hadoop.dir.bin");
    String hadoopConfDir = props.getString("hadoop.dir.conf");
    String hiveAuxJarsPath = props.getString("hive.aux.jars.path");

    List<String> confBuilder = new ArrayList<String>();

    if (proxyUser != null) {
      confBuilder.add("hive.exec.scratchdir=/tmp/hive-" + proxyUser);
    }

    if (hadoopBinDir != null) {
      confBuilder.add("hadoop.bin.path=" + hadoopBinDir);
    }

    if (hadoopConfDir != null) {
      confBuilder.add("hadoop.config.dir=" + hadoopConfDir);
    }

    if (hiveAuxJarsPath != null) {
      hiveAuxJarsPath = "file://" + hiveAuxJarsPath.replace(",", ",file://");
      confBuilder.add("hive.aux.jars.path=" + hiveAuxJarsPath);
    }

    if (jobTitle != null) {
      confBuilder.add("mapred.job.name=\"Reportal: " + jobTitle + "\"");
    }

    // if (logDir != null) {
    //   confBuilder.add("hive.log.dir=" + logDir);
    // }
    // if (logFile != null) {
    //   confBuilder.add("hive.log.file=" + logFile);
    // }
    // if (mapredJobQueueName != null) {
    //   confBuilder.add("mapred.job.queue.name=" + mapredJobQueueName);
    // }

    String[] args = new String[confBuilder.size() * 2];
    for (int i = 0; i < confBuilder.size(); i++) {
      args[i * 2] = "--hiveconf";
      args[i * 2 + 1] = confBuilder.get(i);
    }

    return args;
  }

  /**
   * Normally hive.aux.jars.path is expanded from just being a path to the
   * full list of files in the directory by the hive shell script. Since
   * we normally won't be running from the script, it's up to us to do that
   * work here. We use a heuristic that if there is no occurrence of ".jar"
   * in the original, it needs expansion. Otherwise it's already been done
   * for us.
   * Also, surround the files with uri niceties.
   */
  static String expandHiveAuxJarsPath(String original) throws IOException {
    if (original == null || original.contains(".jar")) {
      return original;
    }

    File[] files = new File(original).listFiles();

    if (files == null || files.length == 0) {
      return original;
    }

    return filesToURIString(files);
  }

  static String filesToURIString(File[] files) throws IOException {
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < files.length; i++) {
      sb.append("file:///").append(files[i].getCanonicalPath());
      if (i != files.length - 1) {
        sb.append(",");
      }
    }

    return sb.toString();
  }
}
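The class above does three things: it configures a Hive CliSessionState whose standard output is redirected into a bounded TSV buffer, runs each semicolon-terminated query from the job definition through CliDriver, and then rewrites the TSV output as quoted CSV. For readers who want to try the two small string transformations in isolation (aux-jars path expansion and TSV-to-CSV quoting) without a Hadoop or Hive installation, here is a minimal, self-contained sketch. The class name ReportalTransformsDemo and the /tmp/hive-aux path are illustrative placeholders, not part of the Azkaban source; the logic simply mirrors expandHiveAuxJarsPath, filesToURIString, and the CSV conversion loop above.

// Hypothetical standalone sketch (not part of the Azkaban source above): it
// reproduces the two small transformations the runner performs so they can be
// exercised without a Hive or Hadoop installation.
import java.io.File;
import java.io.IOException;

public class ReportalTransformsDemo {

  // Mirrors expandHiveAuxJarsPath: a directory path is expanded into a
  // comma-separated list of file:/// URIs, one per file in the directory.
  static String expandAuxJars(String original) throws IOException {
    if (original == null || original.contains(".jar")) {
      return original; // already expanded (or unset); leave it alone
    }
    File[] files = new File(original).listFiles();
    if (files == null || files.length == 0) {
      return original;
    }
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < files.length; i++) {
      sb.append("file:///").append(files[i].getCanonicalPath());
      if (i != files.length - 1) {
        sb.append(",");
      }
    }
    return sb.toString();
  }

  // Mirrors the TSV-to-CSV step: strip embedded double quotes, then wrap every
  // tab-separated column in double quotes.
  static String tsvLineToCsv(String tsvLine) {
    return "\"" + tsvLine.replace("\"", "").replace("\t", "\",\"") + "\"";
  }

  public static void main(String[] args) throws IOException {
    // "/tmp/hive-aux" is a placeholder; point it at any local directory.
    System.out.println(expandAuxJars("/tmp/hive-aux"));
    // Prints: "col one","col two","3"
    System.out.println(tsvLineToCsv("col \"one\"\tcol two\t3"));
  }
}

Note that the CSV step strips any embedded double quotes rather than escaping them, which is the same simplification the runner itself makes ("strip all quotes, and then quote the columns").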