Java tutorial
/** * Licensed to Cloudera, Inc. under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. Cloudera, Inc. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.sqoop.hive; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.conf.Configuration; import com.cloudera.sqoop.SqoopOptions; import com.cloudera.sqoop.manager.ConnManager; import com.cloudera.sqoop.util.Executor; import com.cloudera.sqoop.util.ExitSecurityException; import com.cloudera.sqoop.util.LoggingAsyncSink; import com.cloudera.sqoop.util.SubprocessSecurityManager; /** * Utility to import a table into the Hive metastore. Manages the connection * to Hive itself as well as orchestrating the use of the other classes in this * package. */ public class HiveImport { public static final Log LOG = LogFactory.getLog(HiveImport.class.getName()); private SqoopOptions options; private ConnManager connManager; private Configuration configuration; private boolean generateOnly; /** Entry point through which Hive invocation should be attempted. */ private static final String HIVE_MAIN_CLASS = "org.apache.hadoop.hive.cli.CliDriver"; public HiveImport(final SqoopOptions opts, final ConnManager connMgr, final Configuration conf, final boolean generateOnly) { this.options = opts; this.connManager = connMgr; this.configuration = conf; this.generateOnly = generateOnly; } /** * @return the filename of the hive executable to run to do the import */ private String getHiveBinPath() { // If the user has $HIVE_HOME set, then use $HIVE_HOME/bin/hive if it // exists. // Fall back to just plain 'hive' and hope it's in the path. String hiveHome = options.getHiveHome(); if (null == hiveHome) { return "hive"; } Path p = new Path(hiveHome); p = new Path(p, "bin"); p = new Path(p, "hive"); String hiveBinStr = p.toString(); if (new File(hiveBinStr).exists()) { return hiveBinStr; } else { return "hive"; } } /** * If we used a MapReduce-based upload of the data, remove the _logs dir * from where we put it, before running Hive LOAD DATA INPATH. */ private void removeTempLogs(String tableName) throws IOException { FileSystem fs = FileSystem.get(configuration); Path tablePath; if (null != tableName) { String warehouseDir = options.getWarehouseDir(); if (warehouseDir != null) { tablePath = new Path(new Path(warehouseDir), tableName); } else { tablePath = new Path(tableName); } } else { // --table option is not used, so use the target dir instead tablePath = new Path(options.getTargetDir()); } Path logsPath = new Path(tablePath, "_logs"); if (fs.exists(logsPath)) { LOG.info("Removing temporary files from import process: " + logsPath); if (!fs.delete(logsPath, true)) { LOG.warn("Could not delete temporary files; " + "continuing with import, but it may fail."); } } } /** * @return true if we're just generating the DDL for the import, but * not actually running it (i.e., --generate-only mode). If so, don't * do any side-effecting actions in Hive. */ private boolean isGenerateOnly() { return generateOnly; } /** * @return a File object that can be used to write the DDL statement. * If we're in gen-only mode, this should be a file in the outdir, named * after the Hive table we're creating. If we're in import mode, this should * be a one-off temporary file. */ private File getScriptFile(String outputTableName) throws IOException { if (!isGenerateOnly()) { return File.createTempFile("hive-script-", ".txt", new File(options.getTempDir())); } else { return new File(new File(options.getCodeOutputDir()), outputTableName + ".q"); } } /** * Perform the import of data from an HDFS path to a Hive table. * * @param inputTableName the name of the table as loaded into HDFS * @param outputTableName the name of the table to create in Hive. * @param createOnly if true, run the CREATE TABLE statement but not * LOAD DATA. */ public void importTable(String inputTableName, String outputTableName, boolean createOnly) throws IOException { if (!isGenerateOnly()) { removeTempLogs(inputTableName); LOG.info("Loading uploaded data into Hive"); } if (null == outputTableName) { outputTableName = inputTableName; } LOG.debug("Hive.inputTable: " + inputTableName); LOG.debug("Hive.outputTable: " + outputTableName); // For testing purposes against our mock hive implementation, // if the sysproperty "expected.script" is set, we set the EXPECTED_SCRIPT // environment variable for the child hive process. We also disable // timestamp comments so that we have deterministic table creation scripts. String expectedScript = System.getProperty("expected.script"); List<String> env = Executor.getCurEnvpStrings(); boolean debugMode = expectedScript != null; if (debugMode) { env.add("EXPECTED_SCRIPT=" + expectedScript); env.add("TMPDIR=" + options.getTempDir()); } // generate the HQL statements to run. TableDefWriter tableWriter = new TableDefWriter(options, connManager, inputTableName, outputTableName, configuration, !debugMode); String createTableStr = tableWriter.getCreateTableStmt() + ";\n"; String loadDataStmtStr = tableWriter.getLoadDataStmt() + ";\n"; // write them to a script file. File scriptFile = getScriptFile(outputTableName); try { String filename = scriptFile.toString(); BufferedWriter w = null; try { FileOutputStream fos = new FileOutputStream(scriptFile); w = new BufferedWriter(new OutputStreamWriter(fos)); w.write(createTableStr, 0, createTableStr.length()); if (!createOnly) { w.write(loadDataStmtStr, 0, loadDataStmtStr.length()); } } catch (IOException ioe) { LOG.error("Error writing Hive load-in script: " + ioe.toString()); ioe.printStackTrace(); throw ioe; } finally { if (null != w) { try { w.close(); } catch (IOException ioe) { LOG.warn("IOException closing stream to Hive script: " + ioe.toString()); } } } if (!isGenerateOnly()) { executeScript(filename, env); LOG.info("Hive import complete."); } } finally { if (!isGenerateOnly()) { // User isn't interested in saving the DDL. Remove the file. if (!scriptFile.delete()) { LOG.warn("Could not remove temporary file: " + scriptFile.toString()); // try to delete the file later. scriptFile.deleteOnExit(); } } } } @SuppressWarnings("unchecked") /** * Execute the script file via Hive. * If Hive's jars are on the classpath, run it in the same process. * Otherwise, execute the file with 'bin/hive'. * * @param filename The script file to run. * @param env the environment strings to pass to any subprocess. * @throws IOException if Hive did not exit successfully. */ private void executeScript(String filename, List<String> env) throws IOException { SubprocessSecurityManager subprocessSM = null; try { Class cliDriverClass = Class.forName(HIVE_MAIN_CLASS); // We loaded the CLI Driver in this JVM, so we will just // call it in-process. The CliDriver class has a method: // void main(String [] args) throws Exception. // // We'll call that here to invoke 'hive -f scriptfile'. // Because this method will call System.exit(), we use // a SecurityManager to prevent this. LOG.debug("Using in-process Hive instance."); subprocessSM = new SubprocessSecurityManager(); subprocessSM.install(); // Create the argv for the Hive Cli Driver. String[] argArray = new String[2]; argArray[0] = "-f"; argArray[1] = filename; // And invoke the static method on this array. Method mainMethod = cliDriverClass.getMethod("main", argArray.getClass()); mainMethod.invoke(null, (Object) argArray); } catch (ClassNotFoundException cnfe) { // Hive is not on the classpath. Run externally. // This is not an error path. LOG.debug("Using external Hive process."); executeExternalHiveScript(filename, env); } catch (NoSuchMethodException nsme) { // Could not find a handle to the main() method. throw new IOException("Could not access CliDriver.main()", nsme); } catch (IllegalAccessException iae) { // Error getting a handle on the main() method. throw new IOException("Could not access CliDriver.main()", iae); } catch (InvocationTargetException ite) { // We ran CliDriver.main() and an exception was thrown from within Hive. // This may have been the ExitSecurityException triggered by the // SubprocessSecurityManager. If so, handle it. Otherwise, wrap in // an IOException and rethrow. Throwable cause = ite.getCause(); if (cause instanceof ExitSecurityException) { ExitSecurityException ese = (ExitSecurityException) cause; int status = ese.getExitStatus(); if (status != 0) { throw new IOException("Hive CliDriver exited with status=" + status); } } else { throw new IOException("Exception thrown in Hive", ite); } } finally { if (null != subprocessSM) { // Uninstall the SecurityManager used to trap System.exit(). subprocessSM.uninstall(); } } } /** * Execute Hive via an external 'bin/hive' process. * @param filename the Script file to run. * @param env the environment strings to pass to any subprocess. * @throws IOException if Hive did not exit successfully. */ private void executeExternalHiveScript(String filename, List<String> env) throws IOException { // run Hive on the script and note the return code. String hiveExec = getHiveBinPath(); ArrayList<String> args = new ArrayList<String>(); args.add(hiveExec); args.add("-f"); args.add(filename); LoggingAsyncSink logSink = new LoggingAsyncSink(LOG); int ret = Executor.exec(args.toArray(new String[0]), env.toArray(new String[0]), logSink, logSink); if (0 != ret) { throw new IOException("Hive exited with status " + ret); } } }