/*
 * Copyright 2012 LinkedIn Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package azkaban.jobtype;

import static azkaban.security.commons.HadoopSecurityManager.ENABLE_PROXYING;
import static azkaban.security.commons.HadoopSecurityManager.OBTAIN_BINARY_TOKEN;
import static org.apache.hadoop.security.UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION;

import azkaban.flow.CommonJobProperties;
import azkaban.jobExecutor.JavaProcessJob;
import azkaban.security.commons.HadoopSecurityManager;
import azkaban.utils.Props;
import azkaban.utils.StringUtils;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.StringTokenizer;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.log4j.Logger;
import org.apache.tools.ant.DirectoryScanner;

/**
 * <pre>
 * The Azkaban adaptor for running a Spark Submit job.
 * Use this in conjunction with {@link azkaban.jobtype.HadoopSecureSparkWrapper}.
 *
 * This class is used by the Azkaban executor to build the classpath, main args, env and JVM props
 * for HadoopSecureSparkWrapper. The executor then launches the job process, which runs
 * HadoopSecureSparkWrapper. HadoopSecureSparkWrapper is the Spark client wrapper
 * that uses the main args to launch spark-submit.
 *
 * Expects the following jobtype properties:
 *
 * spark.home (client default SPARK_HOME if the user doesn't give a spark-version.
 * Conf will be either SPARK_CONF_DIR (we do not override it) or {spark.home}/conf)
 *
 * spark.1.6.0.home (spark.{version}.home is REQUIRED for each {version} we want to support.
 * e.g. a user can use Spark 1.6.0 by setting spark-version=1.6.0 in their job properties.
 * This class will then look for the plugin property spark.1.6.0.home to get the proper Spark
 * bin/conf to launch the client)
 *
 * spark.1.6.0.conf (OPTIONAL. spark.{version}.conf is the conf used for the {version}.
 * If not specified, the conf of this {version} will be spark.{version}.home/conf)
 *
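 * For illustration only, a user job that selects a specific Spark version might look like
 * this (property names are the azPropName values defined in SparkJobArg; the names and
 * values shown here are illustrative, not authoritative):
 *
 *   type=spark
 *   spark-version=1.6.0
 *   execution-jar=lib/my-spark-app.jar
 *   execution-class=com.example.MySparkApp
 *   params=inputPath outputPath
 *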
 * spark.base.dir (to reduce dependence on azkaban-jobtype plugin properties every time a new
 * Spark binary is available, this property can be set. It specifies the path where Spark
 * binaries are kept. If spark.{sparkVersion}.home is set in
 * commonprivate.properties/private.properties, then that will be returned. If
 * spark.{sparkVersion}.home is not set and spark.base.dir is set, then the Spark directory
 * inside spark.base.dir matching the spark.home.prefix + sparkVersion pattern will be used)
 *
 * spark.dynamic.res.alloc.enforced (set to true if we want to enforce the dynamic resource
 * allocation policy. Enabling the dynamic allocation policy for the Spark jobtype is different
 * from enabling the dynamic allocation feature in Spark. This config inside the Spark jobtype
 * enforces the dynamic allocation feature for all Spark applications submitted via the Azkaban
 * Spark jobtype. If set to true, our client wrapper will make sure the user does not override
 * dynamic allocation related conf. It expects the presence of SPARK-13723 in the version of
 * Spark deployed in the cluster, so a user explicitly setting num-executors will not
 * disable dynamic allocation. If this parameter is enabled, we suggest that the Spark cluster
 * set up dynamic allocation properly and set the related conf in spark-defaults.conf)
 *
 * spark.node.labeling.enforced (set to true if we want to enforce the node labeling policy.
 * Enabling the node labeling policy for the Spark jobtype is different from enabling the node
 * labeling feature in YARN. This config inside the Spark jobtype enforces that node
 * labeling is used for all Spark applications submitted via the Azkaban Spark jobtype.
 * If set to true, our client wrapper will ignore the user specified queue. If this
 * is enabled, we suggest enabling node labeling in the YARN cluster, and also setting the
 * queue param in spark-defaults.conf)
 * </pre>
 *
 * @see azkaban.jobtype.HadoopSecureSparkWrapper
 */
public class HadoopSparkJob extends JavaProcessJob {

  // SPARK_HOME env var for HadoopSecureSparkWrapper (Spark client)
  public static final String SPARK_HOME_ENV_VAR = "SPARK_HOME";

  // SPARK_CONF_DIR env var for HadoopSecureSparkWrapper (Spark client)
  public static final String SPARK_CONF_DIR_ENV_VAR = "SPARK_CONF_DIR";

  // Spark jobtype property spark.dynamic.res.alloc.enforced
  public static final String SPARK_DYNAMIC_RES_JOBTYPE_PROPERTY = "spark.dynamic.res.alloc.enforced";

  // HadoopSecureSparkWrapper env var set if spark.dynamic.res.alloc.enforced is true
  public static final String SPARK_DYNAMIC_RES_ENV_VAR = "SPARK_DYNAMIC_RES_ENFORCED";

  // Spark jobtype property spark.node.labeling.enforced
  public static final String SPARK_NODE_LABELING_JOBTYPE_PROPERTY = "spark.node.labeling.enforced";

  // HadoopSecureSparkWrapper env var set if spark.node.labeling.enforced is true
  public static final String SPARK_NODE_LABELING_ENV_VAR = "SPARK_NODE_LABELING_ENFORCED";

  // Jobtype property for whether to enable auto node labeling for Spark applications
  // submitted via the Spark jobtype.
  public static final String SPARK_AUTO_NODE_LABELING_JOBTYPE_PROPERTY = "spark.auto.node.labeling.enabled";

  // Env var to be passed to HadoopSecureSparkWrapper for whether auto node labeling
  // is enabled
  public static final String SPARK_AUTO_NODE_LABELING_ENV_VAR = "SPARK_AUTO_NODE_LABELING_ENABLED";

  // Jobtype property to configure the desired node label expression when auto node
  // labeling is enabled and the min mem/vcore ratio is met.
  public static final String SPARK_DESIRED_NODE_LABEL_JOBTYPE_PROPERTY = "spark.desired.node.label";

  // Env var to be passed to HadoopSecureSparkWrapper for the desired node label expression
  public static final String SPARK_DESIRED_NODE_LABEL_ENV_VAR = "SPARK_DESIRED_NODE_LABEL";

  // Jobtype property to configure the minimum mem/vcore ratio for a Spark application's
  // executor to be submitted with the desired node label expression.
  // NOTE: "vore" (not "vcore") is the actual property key spelling.
  public static final String SPARK_MIN_MEM_VCORE_RATIO_JOBTYPE_PROPERTY = "spark.min.mem.vore.ratio";

  // Env var to be passed to HadoopSecureSparkWrapper for the value of the minimum
  // mem/vcore ratio
  public static final String SPARK_MIN_MEM_VCORE_RATIO_ENV_VAR = "SPARK_MIN_MEM_VCORE_RATIO";

  // Jobtype property to configure the minimum memory size (in GB) for a Spark application's
  // executor to be submitted with the desired node label expression.
  public static final String SPARK_MIN_MEM_SIZE_JOBTYPE_PROPERTY = "spark.min.memory-gb.size";

  // Env var to be passed to HadoopSecureSparkWrapper for the value of the minimum
  // memory size in GB
  public static final String SPARK_MIN_MEM_SIZE_ENV_VAR = "SPARK_MIN_MEM_GB_SIZE";

  // Jobtype property denoting the base directory where Spark binaries are placed
  public static final String SPARK_BASE_DIR = "spark.base.dir";

  // Jobtype property to configure the prefix of the directory of Spark binaries
  public static final String SPARK_HOME_PREFIX = "spark.home.prefix";

  // Jobtype property to configure the regex which will be replaced by
  // SPARK_VERSION_REGEX_TO_REPLACE_WITH in the Spark version provided by the user
  // as the spark-version parameter
  public static final String SPARK_VERSION_REGEX_TO_REPLACE = "spark.version.regex.to.replace";

  // Jobtype property to configure the regex which will replace SPARK_VERSION_REGEX_TO_REPLACE
  // in the Spark version provided by the user as the spark-version parameter.
  public static final String SPARK_VERSION_REGEX_TO_REPLACE_WITH = "spark.version.regex.to.replace.with";

  // Jobtype property to configure the reference document for available Spark versions, which
  // users can consult in case they don't know which Spark versions are valid
  public static final String SPARK_REFERENCE_DOCUMENT = "spark.reference.document";
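
  // For illustration only, a jobtype plugin deployment might set the properties above in
  // private.properties/commonprivate.properties as follows (all values hypothetical):
  //
  //   spark.base.dir=/export/apps/spark
  //   spark.home.prefix=spark-
  //   spark.node.labeling.enforced=true
  //   spark.auto.node.labeling.enabled=true
  //   spark.desired.node.label=spark_nodes
  //   spark.min.mem.vore.ratio=2.5
  //   spark.min.memory-gb.size=8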

  // Azkaban/Java params
  private static final String HADOOP_SECURE_SPARK_WRAPPER = HadoopSecureSparkWrapper.class.getName();

  // Spark configuration property to specify additional Namenodes to fetch tokens for
  private static final String SPARK_CONF_ADDITIONAL_NAMENODES = "spark.yarn.access.namenodes";

  // security variables
  private String userToProxy = null;
  private boolean shouldProxy = false;
  private boolean obtainTokens = false;
  private File tokenFile = null;
  private HadoopSecurityManager hadoopSecurityManager;

  public HadoopSparkJob(final String jobid, final Props sysProps, final Props jobProps,
      final Logger log) {
    super(jobid, sysProps, jobProps, log);

    getJobProps().put(CommonJobProperties.JOB_ID, jobid);

    this.shouldProxy = getSysProps().getBoolean(ENABLE_PROXYING, false);
    getJobProps().put(ENABLE_PROXYING, Boolean.toString(this.shouldProxy));
    this.obtainTokens = getSysProps().getBoolean(OBTAIN_BINARY_TOKEN, false);

    if (this.shouldProxy) {
      getLog().info("Initiating hadoop security manager.");
      try {
        this.hadoopSecurityManager = HadoopJobUtils.loadHadoopSecurityManager(getSysProps(), log);
      } catch (final RuntimeException e) {
        throw new RuntimeException("Failed to get hadoop security manager!" + e);
      }
    }
  }
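
  // Builds the spark-submit argument list handed to HadoopSecureSparkWrapper. For
  // illustration only (hypothetical property names and values), a job declaring
  //   master=yarn-cluster
  //   execution-jar=lib/my-spark-app.jar
  //   params=inputPath outputPath
  // yields an argument list roughly like
  //   --driver-java-options ... --master yarn-cluster ... lib/my-spark-app.jar inputPath outputPath
  // (entries are joined with SparkJobArg.delimiter rather than spaces).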
  static String testableGetMainArguments(final Props jobProps, final String workingDir,
      final Logger log) {
    // logged in case we ever need to recreate a failure scenario in a test case
    log.debug(jobProps);
    log.debug(workingDir);

    final List<String> argList = new ArrayList<>();

    // special case handling for DRIVER_JAVA_OPTIONS
    argList.add(SparkJobArg.DRIVER_JAVA_OPTIONS.sparkParamName);
    final StringBuilder driverJavaOptions = new StringBuilder();
    // note the default java opts are communicated through the hadoop conf and added in the
    // HadoopSecureSparkWrapper
    if (jobProps.containsKey(SparkJobArg.DRIVER_JAVA_OPTIONS.azPropName)) {
      driverJavaOptions.append(" " + jobProps.getString(SparkJobArg.DRIVER_JAVA_OPTIONS.azPropName));
    }
    argList.add(driverJavaOptions.toString());

    // Note that execution_jar and params must appear in order, and as the last 2 params.
    // Because of the position they are specified in the SparkJobArg class, this should not
    // be an issue.
    for (final SparkJobArg sparkJobArg : SparkJobArg.values()) {
      if (!sparkJobArg.needSpecialTreatment) {
        handleStandardArgument(jobProps, argList, sparkJobArg);
      } else if (sparkJobArg.equals(SparkJobArg.SPARK_JARS)) {
        sparkJarsHelper(jobProps, workingDir, log, argList);
      } else if (sparkJobArg.equals(SparkJobArg.SPARK_CONF_PREFIX)) {
        sparkConfPrefixHelper(jobProps, argList);
      } else if (sparkJobArg.equals(SparkJobArg.DRIVER_JAVA_OPTIONS)) {
        // do nothing because it was already handled above
      } else if (sparkJobArg.equals(SparkJobArg.SPARK_FLAG_PREFIX)) {
        sparkFlagPrefixHelper(jobProps, argList);
      } else if (sparkJobArg.equals(SparkJobArg.EXECUTION_JAR)) {
        executionJarHelper(jobProps, workingDir, log, argList);
      } else if (sparkJobArg.equals(SparkJobArg.PARAMS)) {
        paramsHelper(jobProps, argList);
      } else if (sparkJobArg.equals(SparkJobArg.SPARK_VERSION)) {
        // do nothing since this arg is not a spark-submit argument;
        // it is only used in getClassPaths() below
      }
    }

    return StringUtils.join((Collection<String>) argList, SparkJobArg.delimiter);
  }

  private static void paramsHelper(final Props jobProps, final List<String> argList) {
    if (jobProps.containsKey(SparkJobArg.PARAMS.azPropName)) {
      final String params = jobProps.getString(SparkJobArg.PARAMS.azPropName);
      final String[] paramsList = params.split(" ");
      for (final String s : paramsList) {
        argList.add(s);
      }
    }
  }

  private static void executionJarHelper(final Props jobProps, final String workingDir,
      final Logger log, final List<String> argList) {
    if (jobProps.containsKey(SparkJobArg.EXECUTION_JAR.azPropName)) {
      final String executionJarName = HadoopJobUtils.resolveExecutionJarName(workingDir,
          jobProps.getString(SparkJobArg.EXECUTION_JAR.azPropName), log);
      argList.add(executionJarName);
    }
  }

  private static void sparkFlagPrefixHelper(final Props jobProps, final List<String> argList) {
    for (final Entry<String, String> entry : jobProps
        .getMapByPrefix(SparkJobArg.SPARK_FLAG_PREFIX.azPropName).entrySet()) {
      if ("true".equalsIgnoreCase(entry.getValue())) {
        argList.add(SparkJobArg.SPARK_FLAG_PREFIX.sparkParamName + entry.getKey());
      }
    }
  }

  private static void sparkJarsHelper(final Props jobProps, final String workingDir,
      final Logger log, final List<String> argList) {
    final String propSparkJars = jobProps.getString(SparkJobArg.SPARK_JARS.azPropName, "");
    final String jarList = HadoopJobUtils.resolveWildCardForJarSpec(workingDir, propSparkJars, log);
    if (jarList.length() > 0) {
      argList.add(SparkJobArg.SPARK_JARS.sparkParamName);
      argList.add(jarList);
    }
  }
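
  // Expands prefixed job properties into repeated --conf flags (assuming the prefix defined
  // by SparkJobArg.SPARK_CONF_PREFIX). For illustration, a job property such as
  //   conf.spark.executor.memory=4g
  // would be emitted as
  //   --conf spark.executor.memory=4g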
  private static void sparkConfPrefixHelper(final Props jobProps, final List<String> argList) {
    for (final Entry<String, String> entry : jobProps
        .getMapByPrefix(SparkJobArg.SPARK_CONF_PREFIX.azPropName).entrySet()) {
      argList.add(SparkJobArg.SPARK_CONF_PREFIX.sparkParamName);
      final String sparkConfKeyVal = String.format("%s=%s", entry.getKey(), entry.getValue());
      argList.add(sparkConfKeyVal);
    }
  }

  private static void handleStandardArgument(final Props jobProps, final List<String> argList,
      final SparkJobArg sparkJobArg) {
    if (jobProps.containsKey(sparkJobArg.azPropName)) {
      argList.add(sparkJobArg.sparkParamName);
      argList.add(jobProps.getString(sparkJobArg.azPropName));
    }
  }

  private static String getSourcePathFromClass(final Class<?> containedClass) {
    File file = new File(containedClass.getProtectionDomain().getCodeSource().getLocation().getPath());

    if (!file.isDirectory() && file.getName().endsWith(".class")) {
      final String name = containedClass.getName();
      final StringTokenizer tokenizer = new StringTokenizer(name, ".");
      // walk up one directory per package level to reach the classpath root
      while (tokenizer.hasMoreTokens()) {
        tokenizer.nextElement();
        file = file.getParentFile();
      }
      return file.getPath();
    } else {
      return containedClass.getProtectionDomain().getCodeSource().getLocation().getPath();
    }
  }
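
  // A spark-defaults.conf entry that triggers the additional-token logic below might look
  // like this (illustrative values):
  //   spark.yarn.access.namenodes hdfs://nn-a:8020,hdfs://nn-b:8020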
  /**
   * Add additional namenodes specified in the Spark Configuration
   * ({@link #SPARK_CONF_ADDITIONAL_NAMENODES}) to the Props provided.
   *
   * @param props Props to add additional namenodes to.
   * @see HadoopJobUtils#addAdditionalNamenodesToProps(Props, String)
   */
  void addAdditionalNamenodesFromConf(final Props props) {
    final String sparkConfDir = getSparkLibConf()[1];
    final File sparkConfFile = new File(sparkConfDir, "spark-defaults.conf");
    try {
      final InputStreamReader inReader =
          new InputStreamReader(new FileInputStream(sparkConfFile), StandardCharsets.UTF_8);
      // Use Properties to avoid needing Spark on our classpath
      final Properties sparkProps = new Properties();
      sparkProps.load(inReader);
      inReader.close();
      final String additionalNamenodes = sparkProps.getProperty(SPARK_CONF_ADDITIONAL_NAMENODES);
      if (additionalNamenodes != null && additionalNamenodes.length() > 0) {
        getLog().info("Found property " + SPARK_CONF_ADDITIONAL_NAMENODES
            + " = " + additionalNamenodes + "; setting additional namenodes");
        HadoopJobUtils.addAdditionalNamenodesToProps(props, additionalNamenodes);
      }
    } catch (final IOException e) {
      getLog().warn("Unable to load Spark configuration; not adding any additional "
          + "namenode delegation tokens.", e);
    }
  }

  @Override
  public void run() throws Exception {
    HadoopConfigurationInjector.prepareResourcesToInject(getJobProps(), getWorkingDirectory());

    if (this.shouldProxy && this.obtainTokens) {
      this.userToProxy = getJobProps().getString(HadoopSecurityManager.USER_TO_PROXY);
      getLog().info("Need to proxy. Getting tokens.");
      // get tokens into a file, and put the location in props
      final Props props = new Props();
      props.putAll(getJobProps());
      props.putAll(getSysProps());
      addAdditionalNamenodesFromConf(props);
      this.tokenFile = HadoopJobUtils.getHadoopTokens(this.hadoopSecurityManager, props, getLog());
      getJobProps().put("env." + HADOOP_TOKEN_FILE_LOCATION, this.tokenFile.getAbsolutePath());
    }

    // If dynamic resource allocation or node labeling is enabled in the jobtype properties,
    // set the proper env vars so the client wrapper (HadoopSecureSparkWrapper) can modify the
    // Spark job conf before calling spark-submit, enforcing that every Spark job uses dynamic
    // allocation or node labeling
    if (getSysProps().getBoolean(SPARK_DYNAMIC_RES_JOBTYPE_PROPERTY, Boolean.FALSE)) {
      getJobProps().put("env." + SPARK_DYNAMIC_RES_ENV_VAR, Boolean.TRUE.toString());
    }
    if (getSysProps().getBoolean(SPARK_NODE_LABELING_JOBTYPE_PROPERTY, Boolean.FALSE)) {
      getJobProps().put("env." + SPARK_NODE_LABELING_ENV_VAR, Boolean.TRUE.toString());
    }
    if (getSysProps().getBoolean(SPARK_AUTO_NODE_LABELING_JOBTYPE_PROPERTY, Boolean.FALSE)) {
      final String desiredNodeLabel = getSysProps().get(SPARK_DESIRED_NODE_LABEL_JOBTYPE_PROPERTY);
      if (desiredNodeLabel == null) {
        throw new RuntimeException(SPARK_DESIRED_NODE_LABEL_JOBTYPE_PROPERTY
            + " must be configured when " + SPARK_AUTO_NODE_LABELING_JOBTYPE_PROPERTY
            + " is set to true.");
      }
      getJobProps().put("env." + SPARK_AUTO_NODE_LABELING_ENV_VAR, Boolean.TRUE.toString());
      getJobProps().put("env." + SPARK_DESIRED_NODE_LABEL_ENV_VAR, desiredNodeLabel);
    }
    if (getSysProps().getBoolean(SPARK_DYNAMIC_RES_JOBTYPE_PROPERTY, Boolean.FALSE)
        || getSysProps().getBoolean(SPARK_AUTO_NODE_LABELING_JOBTYPE_PROPERTY, Boolean.FALSE)) {
      final String minMemVcoreRatio = getSysProps().get(SPARK_MIN_MEM_VCORE_RATIO_JOBTYPE_PROPERTY);
      final String minMemSize = getSysProps().get(SPARK_MIN_MEM_SIZE_JOBTYPE_PROPERTY);
      if (minMemVcoreRatio == null || minMemSize == null) {
        throw new RuntimeException(SPARK_MIN_MEM_SIZE_JOBTYPE_PROPERTY + " and "
            + SPARK_MIN_MEM_VCORE_RATIO_JOBTYPE_PROPERTY + " must be configured.");
      }
      if (!NumberUtils.isNumber(minMemVcoreRatio)) {
        throw new RuntimeException(SPARK_MIN_MEM_VCORE_RATIO_JOBTYPE_PROPERTY
            + " is configured as " + minMemVcoreRatio + ", but it must be a number.");
      }
      if (!NumberUtils.isNumber(minMemSize)) {
        throw new RuntimeException(SPARK_MIN_MEM_SIZE_JOBTYPE_PROPERTY
            + " is configured as " + minMemSize + ", but it must be a number.");
      }
      getJobProps().put("env." + SPARK_MIN_MEM_VCORE_RATIO_ENV_VAR, minMemVcoreRatio);
      getJobProps().put("env." + SPARK_MIN_MEM_SIZE_ENV_VAR, minMemSize);
    }
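
    // At this point the wrapper's environment may carry, for example (hypothetical values),
    // SPARK_NODE_LABELING_ENFORCED=true, SPARK_MIN_MEM_VCORE_RATIO=2.5 and
    // SPARK_MIN_MEM_GB_SIZE=8; HadoopSecureSparkWrapper reads these before invoking
    // spark-submit.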
    try {
      super.run();
    } catch (final Throwable t) {
      t.printStackTrace();
      getLog().error("caught error running the job");
      throw new Exception(t);
    } finally {
      if (this.tokenFile != null) {
        HadoopJobUtils.cancelHadoopTokens(this.hadoopSecurityManager, this.userToProxy,
            this.tokenFile, getLog());
        if (this.tokenFile.exists()) {
          this.tokenFile.delete();
        }
      }
    }
  }

  @Override
  protected String getJavaClass() {
    return HADOOP_SECURE_SPARK_WRAPPER;
  }

  @Override
  protected String getJVMArguments() {
    String args = super.getJVMArguments();

    final String typeUserGlobalJVMArgs =
        getJobProps().getString(HadoopJobUtils.JOBTYPE_GLOBAL_JVM_ARGS, null);
    if (typeUserGlobalJVMArgs != null) {
      args += " " + typeUserGlobalJVMArgs;
    }
    final String typeSysGlobalJVMArgs =
        getSysProps().getString(HadoopJobUtils.JOBTYPE_GLOBAL_JVM_ARGS, null);
    if (typeSysGlobalJVMArgs != null) {
      args += " " + typeSysGlobalJVMArgs;
    }
    final String typeUserJVMArgs = getJobProps().getString(HadoopJobUtils.JOBTYPE_JVM_ARGS, null);
    if (typeUserJVMArgs != null) {
      args += " " + typeUserJVMArgs;
    }
    final String typeSysJVMArgs = getSysProps().getString(HadoopJobUtils.JOBTYPE_JVM_ARGS, null);
    if (typeSysJVMArgs != null) {
      args += " " + typeSysJVMArgs;
    }
    final String typeUserJVMArgs2 = getJobProps().getString(HadoopJobUtils.JVM_ARGS, null);
    if (typeUserJVMArgs2 != null) {
      args += " " + typeUserJVMArgs2;
    }
    final String typeSysJVMArgs2 = getSysProps().getString(HadoopJobUtils.JVM_ARGS, null);
    if (typeSysJVMArgs2 != null) {
      args += " " + typeSysJVMArgs2;
    }

    if (this.shouldProxy) {
      info("Setting up secure proxy info for child process");
      String secure;
      secure = " -D" + HadoopSecurityManager.USER_TO_PROXY + "="
          + getJobProps().getString(HadoopSecurityManager.USER_TO_PROXY);
      final String extraToken =
          getSysProps().getString(HadoopSecurityManager.OBTAIN_BINARY_TOKEN, "false");
      if (extraToken != null) {
        secure += " -D" + HadoopSecurityManager.OBTAIN_BINARY_TOKEN + "=" + extraToken;
      }
      info("Secure settings = " + secure);
      args += secure;
    } else {
      info("Not setting up secure proxy info for child process");
    }

    return args;
  }

  @Override
  protected String getMainArguments() {
    // Build the main() arguments for HadoopSecureSparkWrapper, which are then
    // passed to spark-submit
    return testableGetMainArguments(this.jobProps, getWorkingDirectory(), getLog());
  }

  @Override
  protected List<String> getClassPaths() {
    // The classpath for the process that runs HadoopSecureSparkWrapper
    final String pluginDir = getSysProps().get("plugin.dir");
    final List<String> classPath = super.getClassPaths();

    // add the az-core jar to the classpath
    classPath.add(getSourcePathFromClass(Props.class));
    // add the az-common jar to the classpath
    classPath.add(getSourcePathFromClass(JavaProcessJob.class));
    classPath.add(getSourcePathFromClass(HadoopSecureHiveWrapper.class));
    classPath.add(getSourcePathFromClass(HadoopSecurityManager.class));

    classPath.add(HadoopConfigurationInjector.getPath(getJobProps(), getWorkingDirectory()));

    final List<String> typeClassPath = getSysProps().getStringList("jobtype.classpath", null, ",");
    info("Adding jobtype.classpath: " + typeClassPath);
    if (typeClassPath != null) {
      // resolve relative entries against the plugin dir when this jobtype is loaded
      for (final String jar : typeClassPath) {
        File jarFile = new File(jar);
        if (!jarFile.isAbsolute()) {
          jarFile = new File(pluginDir + File.separatorChar + jar);
        }
        if (!classPath.contains(jarFile.getAbsolutePath())) {
          classPath.add(jarFile.getAbsolutePath());
        }
      }
    }

    // Decide the Spark home/conf and append the Spark classpath for the client.
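    // With a Spark 2.x directory layout this typically appends {sparkHome}/jars/* and the
    // conf dir; with a 1.x layout, {sparkHome}/lib/* (see getSparkLibDir below).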
    final String[] sparkHomeConf = getSparkLibConf();

    classPath.add(sparkHomeConf[0] + "/*");
    classPath.add(sparkHomeConf[1]);

    final List<String> typeGlobalClassPath =
        getSysProps().getStringList("jobtype.global.classpath", null, ",");
    info("Adding jobtype.global.classpath: " + typeGlobalClassPath);
    if (typeGlobalClassPath != null) {
      for (final String jar : typeGlobalClassPath) {
        if (!classPath.contains(jar)) {
          classPath.add(jar);
        }
      }
    }

    info("Final classpath: " + classPath);
    return classPath;
  }

  /**
   * Retrieves the Spark home and conf locations. The logic, in detail:
   * a) If the user has specified a Spark version in the job properties, e.g. spark-version=1.6.0, then
   *    i) if spark.{sparkVersion}.home is set in commonprivate.properties/private.properties, it is used;
   *    ii) if spark.{sparkVersion}.home is not set and spark.base.dir is set, the Spark directory inside
   *       spark.base.dir matching the spark.home.prefix + sparkVersion pattern is used.
   * b) If the user has not specified a Spark version in the job properties, the default spark.home
   *    configured in the jobtype plugin's config is used.
   * c) If the Spark home is not found by either of the above, a RuntimeException is thrown.
   *
   * @return a two-element array: the Spark lib dir path and the Spark conf dir path
   */
  protected String[] getSparkLibConf() {
    String sparkHome = null;
    String sparkConf = null;
    // If the user has specified a version in the job properties, e.g. spark-version=1.6.0
    final String jobSparkVer = getJobProps().get(SparkJobArg.SPARK_VERSION.azPropName);
    if (jobSparkVer != null) {
      info("This job sets spark version: " + jobSparkVer);
      // The Spark jobtype supports this version through the plugin's jobtype config
      sparkHome = getSparkHome(jobSparkVer);
      sparkConf = getSysProps().get("spark." + jobSparkVer + ".conf");
      if (sparkConf == null) {
        sparkConf = sparkHome + "/conf";
      }
      info("Using job specific spark: " + sparkHome + " and conf: " + sparkConf);

      // Override the SPARK_HOME/SPARK_CONF_DIR env for the HadoopSecureSparkWrapper process (Spark client)
      getJobProps().put("env." + SPARK_HOME_ENV_VAR, sparkHome);
      getJobProps().put("env." + SPARK_CONF_DIR_ENV_VAR, sparkConf);
    } else {
      // The user job doesn't give a spark-version.
      // Use the default spark.home configured in the jobtype plugin's config
      sparkHome = getSysProps().get("spark.home");
      if (sparkHome == null) {
        // Use the system default SPARK_HOME env
        sparkHome = System.getenv(SPARK_HOME_ENV_VAR);
      }
      sparkConf = (System.getenv(SPARK_CONF_DIR_ENV_VAR) != null)
          ? System.getenv(SPARK_CONF_DIR_ENV_VAR) : (sparkHome + "/conf");
      info("Using system default spark: " + sparkHome + " and conf: " + sparkConf);
    }

    if (sparkHome == null) {
      throw new RuntimeException("SPARK is not available on the azkaban machine.");
    } else {
      final File homeDir = new File(sparkHome);
      if (!homeDir.exists()) {
        throw new RuntimeException("SPARK home dir does not exist.");
      }
      final File confDir = new File(sparkConf);
      if (!confDir.exists()) {
        error("SPARK conf dir does not exist. Will use SPARK_HOME/conf as default.");
        sparkConf = sparkHome + "/conf";
      }
      final File defaultSparkConf = new File(sparkConf + "/spark-defaults.conf");
      if (!defaultSparkConf.exists()) {
        throw new RuntimeException("Default Spark config file spark-defaults.conf cannot"
            + " be found at " + defaultSparkConf);
      }
    }

    return new String[] { getSparkLibDir(sparkHome), sparkConf };
  }

  /**
   * Gets the Spark home from the plugin's jobtype config.
   * If spark.{sparkVersion}.home is set in commonprivate.properties/private.properties, that is
   * returned. If spark.{sparkVersion}.home is not set and spark.base.dir is set, the Spark
   * directory inside spark.base.dir matching the spark.home.prefix + sparkVersion pattern is
   * retrieved. Regex patterns can be passed as properties for version formatting.
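   *
   * For example (hypothetical values): with spark.base.dir=/export/apps/spark and
   * spark.home.prefix=spark-, a job setting spark-version=2.1.0 would match a directory
   * such as /export/apps/spark/spark-2.1.0-bin.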
   * @param sparkVersion the Spark version requested via the spark-version job property
   * @return the Spark home directory path for the requested version
   */
  private String getSparkHome(final String sparkVersion) {
    String sparkHome = getSysProps().get("spark." + sparkVersion + ".home");
    if (sparkHome == null) {
      info("Couldn't find spark." + sparkVersion + ".home property.");
      final String sparkDir = getSysProps().get(SPARK_BASE_DIR);
      final String sparkHomePrefix =
          getSysProps().get(SPARK_HOME_PREFIX) != null ? getSysProps().get(SPARK_HOME_PREFIX) : "*";
      final String replaceTo = getSysProps().get(SPARK_VERSION_REGEX_TO_REPLACE);
      final String replaceWith = getSysProps().get(SPARK_VERSION_REGEX_TO_REPLACE_WITH) != null
          ? getSysProps().get(SPARK_VERSION_REGEX_TO_REPLACE_WITH) : "";
      final String versionPatternToMatch = sparkHomePrefix
          + (replaceTo != null ? sparkVersion.replace(replaceTo, replaceWith) : sparkVersion) + "*";
      info("Looking for spark at " + sparkDir + " directory with " + sparkHomePrefix
          + " prefix for " + sparkVersion + " version.");
      final DirectoryScanner scanner = new DirectoryScanner();
      scanner.setBasedir(sparkDir);
      scanner.setIncludes(new String[] { versionPatternToMatch });
      scanner.scan();
      final String[] directories = scanner.getIncludedDirectories();
      if (directories != null && directories.length > 0) {
        sparkHome = sparkDir + "/" + directories[directories.length - 1];
      } else {
        final String sparkReferenceDoc = getSysProps().get(SPARK_REFERENCE_DOCUMENT);
        final String exceptionMessage = sparkReferenceDoc == null
            ? "The SPARK version specified by the user is not available."
            : "The SPARK version specified by the user is not available. Available versions are"
                + " mentioned at: " + sparkReferenceDoc;
        throw new RuntimeException(exceptionMessage);
      }
    }
    return sparkHome;
  }

  /**
   * Given the dir path of the Spark home, return the dir path of the Spark lib.
   * It is either sparkHome/lib or sparkHome/jars based on the version of
   * Spark chosen by the user.
   *
   * @param sparkHome dir path of the Spark home
   * @return dir path of the Spark lib
   */
  private String getSparkLibDir(final String sparkHome) {
    // sparkHome should already have been checked when this method is invoked
    final File homeDir = new File(sparkHome);
    File libDir = new File(homeDir, "lib");
    if (libDir.exists()) {
      return libDir.getAbsolutePath();
    } else {
      libDir = new File(homeDir, "jars");
      if (libDir.exists()) {
        return libDir.getAbsolutePath();
      } else {
        throw new RuntimeException("SPARK lib dir does not exist.");
      }
    }
  }

  /**
   * This cancel method, in addition to the default canceling behavior, also
   * kills the Spark job on Hadoop.
   */
  @Override
  public void cancel() throws InterruptedException {
    super.cancel();

    info("Cancel called. Killing the Spark job on the cluster");

    final String azExecId = this.jobProps.getString(CommonJobProperties.EXEC_ID);
    final String logFilePath =
        String.format("%s/_job.%s.%s.log", getWorkingDirectory(), azExecId, getId());
    info("log file path is: " + logFilePath);

    HadoopJobUtils.proxyUserKillAllSpawnedHadoopJobs(logFilePath, this.jobProps, this.tokenFile,
        getLog());
  }
}