/*
 * Copyright 2012 LinkedIn Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package azkaban.jobtype;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.security.PrivilegedExceptionAction;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.log4j.Logger;

import azkaban.security.commons.HadoopSecurityManager;
import azkaban.security.commons.HadoopSecurityManagerException;
import azkaban.utils.Props;

/**
 * <pre>
 * Common methods required by the various Hadoop*Job classes are consolidated here.
 *
 * Methods here include getting/cancelling Hadoop tokens,
 * methods for manipulating lib folder paths and jar paths passed in from the Azkaban prop file,
 * and finally methods for parsing logs for application ids
 * and killing those applications via YARN (very helpful during the cancel method).
 * </pre>
 *
 * @see azkaban.jobtype.HadoopSparkJob
 * @see azkaban.jobtype.HadoopHiveJob
 * @see azkaban.jobtype.HadoopPigJob
 * @see azkaban.jobtype.HadoopJavaJob
 */
public class HadoopJobUtils {

  // a regex that matches any command
  public static final String MATCH_ALL_REGEX = ".*";

  // a regex that can never match any input
  public static final String MATCH_NONE_REGEX = ".^";

  public static final String HADOOP_SECURITY_MANAGER_CLASS_PARAM = "hadoop.security.manager.class";

  // the regex to look for while searching for application ids in the hadoop log
  public static final Pattern APPLICATION_ID_PATTERN = Pattern
      .compile("^(application_\\d+_\\d+).*");

  // Azkaban built-in property name
  public static final String JOBTYPE_GLOBAL_JVM_ARGS = "jobtype.global.jvm.args";

  // Azkaban built-in property name
  public static final String JOBTYPE_JVM_ARGS = "jobtype.jvm.args";

  // Azkaban built-in property name
  public static final String JVM_ARGS = "jvm.args";

  /**
   * Invalidates a Hadoop authentication token file
   *
   * @param hadoopSecurityManager
   * @param userToProxy
   * @param tokenFile
   * @param log
   */
  public static void cancelHadoopTokens(HadoopSecurityManager hadoopSecurityManager,
      String userToProxy, File tokenFile, Logger log) {
    try {
      hadoopSecurityManager.cancelTokens(tokenFile, userToProxy, log);
    } catch (HadoopSecurityManagerException e) {
      log.error("Failed to cancel tokens: " + e.getMessage(), e);
    } catch (Exception e) {
      log.error("Failed to cancel tokens: " + e.getMessage(), e);
    }
  }
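
  // ---------------------------------------------------------------------------------------------
  // Illustrative sketch, not part of the original class: shows how APPLICATION_ID_PATTERN pulls a
  // YARN application id out of a whitespace-split log token. The sample token below is made up.
  private static void exampleApplicationIdMatch(Logger log) {
    String token = "application_1442260101000_0001/";
    Matcher m = APPLICATION_ID_PATTERN.matcher(token);
    if (m.find()) {
      // prints "application_1442260101000_0001"; the trailing "/" falls outside the capture group
      log.info("found applicationId: " + m.group(1));
    }
  }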
  /**
   * Based on the HADOOP_SECURITY_MANAGER_CLASS_PARAM setting in the incoming props, finds the
   * correct HadoopSecurityManager Java class
   *
   * @param props
   * @param log
   * @return a HadoopSecurityManager object. Will throw an exception if any errors occur
   *         (including not finding a class)
   * @throws RuntimeException if any errors happen along the way
   */
  public static HadoopSecurityManager loadHadoopSecurityManager(Props props, Logger log)
      throws RuntimeException {

    Class<?> hadoopSecurityManagerClass =
        props.getClass(HADOOP_SECURITY_MANAGER_CLASS_PARAM, true,
            HadoopJobUtils.class.getClassLoader());
    log.info("Loading hadoop security manager " + hadoopSecurityManagerClass.getName());
    HadoopSecurityManager hadoopSecurityManager = null;
    try {
      // getInstance is static, so the receiver argument passed to invoke is ignored
      Method getInstanceMethod = hadoopSecurityManagerClass.getMethod("getInstance", Props.class);
      hadoopSecurityManager =
          (HadoopSecurityManager) getInstanceMethod.invoke(hadoopSecurityManagerClass, props);
    } catch (InvocationTargetException e) {
      String errMsg =
          "Could not instantiate Hadoop Security Manager "
              + hadoopSecurityManagerClass.getName() + ": " + e.getCause();
      log.error(errMsg);
      throw new RuntimeException(errMsg, e);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    return hadoopSecurityManager;
  }

  /**
   * Fetches Hadoop tokens as the Azkaban user
   *
   * @param hadoopSecurityManager
   * @param props
   * @param log
   * @return the temp file into which the tokens were prefetched
   * @throws HadoopSecurityManagerException
   */
  public static File getHadoopTokens(HadoopSecurityManager hadoopSecurityManager, Props props,
      Logger log) throws HadoopSecurityManagerException {

    File tokenFile = null;
    try {
      tokenFile = File.createTempFile("mr-azkaban", ".token");
    } catch (Exception e) {
      throw new HadoopSecurityManagerException("Failed to create the token file.", e);
    }

    hadoopSecurityManager.prefetchToken(tokenFile, props, log);

    return tokenFile;
  }

  /**
   * <pre>
   * If there is a * specification in the "jar" argument (e.g. jar=./lib/*,./lib2/*),
   * this method resolves the * into the actual jar names inside the folder.
   * This is needed because Spark 1.4 does not appear to do the resolution for users.
   * </pre>
   *
   * @param unresolvedJarSpec
   * @return jar file list, comma separated, with every .../* expanded into actual jar names
   */
  public static String resolveWildCardForJarSpec(String workingDirectory,
      String unresolvedJarSpec, Logger log) {

    log.debug("resolveWildCardForJarSpec: unresolved jar specification: " + unresolvedJarSpec);
    log.debug("working directory: " + workingDirectory);

    if (unresolvedJarSpec == null || unresolvedJarSpec.isEmpty())
      return "";

    StringBuilder resolvedJarSpec = new StringBuilder();

    String[] unresolvedJarSpecList = unresolvedJarSpec.split(",");
    for (String s : unresolvedJarSpecList) {
      // if resolution is needed
      if (s.endsWith("*")) {
        // remove the last 2 characters ("/*") to get to the folder
        String dirName = String.format("%s/%s", workingDirectory, s.substring(0, s.length() - 2));

        File[] jars = null;
        try {
          jars = getFilesInFolderByRegex(new File(dirName), ".*jar");
        } catch (FileNotFoundException fnfe) {
          log.warn("folder does not exist: " + dirName);
          continue;
        }

        // if the folder is there, add the jars to the list
        for (File jar : jars) {
          resolvedJarSpec.append(jar.toString()).append(",");
        }
      } else { // no need for resolution
        resolvedJarSpec.append(s).append(",");
      }
    }

    log.debug("resolveWildCardForJarSpec: resolvedJarSpec: " + resolvedJarSpec);

    // remove the trailing comma
    int lastCharIndex = resolvedJarSpec.length() - 1;
    if (lastCharIndex >= 0 && resolvedJarSpec.charAt(lastCharIndex) == ',') {
      resolvedJarSpec.deleteCharAt(lastCharIndex);
    }

    return resolvedJarSpec.toString();
  }
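
  // ---------------------------------------------------------------------------------------------
  // Illustrative sketch, not part of the original class: expands a wildcard jar spec against a
  // hypothetical working directory. If /tmp/workdir/lib holds a.jar and b.jar, the resolved spec
  // lists the full path of each jar, followed by the untouched ./lib2/c.jar entry, e.g.
  // "/tmp/workdir/./lib/a.jar,/tmp/workdir/./lib/b.jar,./lib2/c.jar".
  private static void exampleResolveWildCard(Logger log) {
    String resolved = resolveWildCardForJarSpec("/tmp/workdir", "./lib/*,./lib2/c.jar", log);
    log.info("resolved jar spec: " + resolved);
  }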
  /**
   * <pre>
   * This method looks for the proper user execution jar.
   * The user input is expected in the following 2 formats:
   *   1. ./lib/abc
   *   2. ./lib/abc.jar
   *
   * This method uses prefix matching to find any jar of the form abc*.jar,
   * so that users can bump jar versions without modifying their Hadoop DSL.
   *
   * This method throws an exception if more than one jar matches the prefix.
   * </pre>
   *
   * @param workingDirectory
   * @param userSpecifiedJarName
   * @return the resolved actual jar file name to execute
   */
  public static String resolveExecutionJarName(String workingDirectory,
      String userSpecifiedJarName, Logger log) {

    if (log.isDebugEnabled()) {
      String debugMsg = String.format(
          "Resolving execution jar name: working directory: %s, user specified name: %s",
          workingDirectory, userSpecifiedJarName);
      log.debug(debugMsg);
    }

    // in case the user specifies abc.jar instead of only abc, strip the suffix (using substring
    // rather than replace, so that a name such as "a.jarred.jar" keeps its inner ".jar")
    if (userSpecifiedJarName.endsWith(".jar"))
      userSpecifiedJarName =
          userSpecifiedJarName.substring(0, userSpecifiedJarName.length() - ".jar".length());

    // can't use java 1.7 stuff, reverting to a slightly ugly implementation
    String userSpecifiedJarPath = String.format("%s/%s", workingDirectory, userSpecifiedJarName);
    int lastIndexOfSlash = userSpecifiedJarPath.lastIndexOf("/");
    final String jarPrefix = userSpecifiedJarPath.substring(lastIndexOfSlash + 1);
    final String dirName = userSpecifiedJarPath.substring(0, lastIndexOfSlash);

    if (log.isDebugEnabled()) {
      String debugMsg = String.format("Resolving execution jar name: dirname: %s, jar name: %s",
          dirName, jarPrefix);
      log.debug(debugMsg);
    }

    File[] potentialExecutionJarList;
    try {
      potentialExecutionJarList = getFilesInFolderByRegex(new File(dirName), jarPrefix + ".*jar");
    } catch (FileNotFoundException e) {
      throw new IllegalStateException(
          "execution jar is supposed to be in this folder, but the folder doesn't exist: "
              + dirName);
    }

    if (potentialExecutionJarList.length == 0) {
      throw new IllegalStateException("unable to find execution jar for Spark at path: "
          + userSpecifiedJarPath + "*.jar");
    } else if (potentialExecutionJarList.length > 1) {
      throw new IllegalStateException(
          "found more than one matching execution jar at the path, don't know which one to use: "
              + userSpecifiedJarPath + "*.jar");
    }

    String resolvedJarName = potentialExecutionJarList[0].toString();
    log.info("Resolving execution jar name: resolvedJarName: " + resolvedJarName);
    return resolvedJarName;
  }

  /**
   * @return a list of files in the given folder that match the regex. It may be empty, but will
   *         never be null
   * @throws FileNotFoundException
   */
  private static File[] getFilesInFolderByRegex(File folder, final String regex)
      throws FileNotFoundException {
    // sanity check
    if (!folder.exists()) {
      throw new FileNotFoundException(folder.toString());
    }
    if (!folder.isDirectory()) {
      throw new IllegalStateException(
          "execution jar is supposed to be in this folder, but the object present is not a directory: "
              + folder);
    }

    File[] matchingFiles = folder.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.matches(regex);
      }
    });

    if (matchingFiles == null) {
      throw new IllegalStateException(
          "the File[] matchingFiles variable is null. This means an IOException occurred while doing listFiles. Please check disk availability and retry");
    }

    return matchingFiles;
  }
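
  // ---------------------------------------------------------------------------------------------
  // Illustrative sketch, not part of the original class: with a single ./lib/myjob-1.2.3.jar in a
  // hypothetical working directory, both spellings below resolve to that same jar, because the
  // method strips a ".jar" suffix and then prefix-matches against myjob*.jar.
  private static void exampleResolveExecutionJar(Logger log) {
    String fromPrefix = resolveExecutionJarName("/tmp/workdir", "./lib/myjob", log);
    String fromJarName = resolveExecutionJarName("/tmp/workdir", "./lib/myjob.jar", log);
    log.info(fromPrefix.equals(fromJarName) ? "same jar: " + fromPrefix : "unexpected mismatch");
  }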
  /**
   * This method is a decorator around the killAllSpawnedHadoopJobs method. It takes additional
   * parameters to determine whether killAllSpawnedHadoopJobs needs to be executed via doAs as a
   * different user
   *
   * @param logFilePath Azkaban log file path
   * @param jobProps Azkaban job props
   * @param tokenFile pass in the tokenFile if the value is known. It is ok to skip if the token
   *          file is in the environment variable
   * @param log a usable logger
   */
  public static void proxyUserKillAllSpawnedHadoopJobs(final String logFilePath, Props jobProps,
      File tokenFile, final Logger log) {
    Properties properties = new Properties();
    properties.putAll(jobProps.getFlattened());

    try {
      if (HadoopSecureWrapperUtils.shouldProxy(properties)) {
        UserGroupInformation proxyUser =
            HadoopSecureWrapperUtils.setupProxyUser(properties, tokenFile.getAbsolutePath(), log);
        proxyUser.doAs(new PrivilegedExceptionAction<Void>() {
          @Override
          public Void run() throws Exception {
            HadoopJobUtils.killAllSpawnedHadoopJobs(logFilePath, log);
            return null;
          }
        });
      } else {
        HadoopJobUtils.killAllSpawnedHadoopJobs(logFilePath, log);
      }
    } catch (Throwable t) {
      log.warn("something happened while trying to kill all spawned jobs", t);
    }
  }

  /**
   * Pass in a log file; this method finds all the Hadoop jobs the job launched and kills them
   *
   * Only works with Hadoop2
   *
   * @param logFilePath
   * @param log
   * @return a Set&lt;String&gt; containing the applicationIds that this job tried to kill
   */
  public static Set<String> killAllSpawnedHadoopJobs(String logFilePath, Logger log) {
    Set<String> allSpawnedJobs = findApplicationIdFromLog(logFilePath, log);
    log.info("applicationIds to kill: " + allSpawnedJobs);

    for (String appId : allSpawnedJobs) {
      try {
        killJobOnCluster(appId, log);
      } catch (Throwable t) {
        log.warn("something happened while trying to kill this job: " + appId, t);
      }
    }

    return allSpawnedJobs;
  }

  /**
   * <pre>
   * Takes in a log file and greps every line for the application_id pattern.
   * If it finds multiple ids, it returns all of them, de-duped (this is possible in the case of pig jobs).
   * This can be used in conjunction with the killJobOnCluster method in this file.
   * </pre>
   *
   * @param logFilePath
   * @return a Set. May be empty, but will never be null
   */
  public static Set<String> findApplicationIdFromLog(String logFilePath, Logger log) {

    File logFile = new File(logFilePath);

    if (!logFile.exists()) {
      throw new IllegalArgumentException("the logFilePath does not exist: " + logFilePath);
    }
    if (!logFile.isFile()) {
      throw new IllegalArgumentException("the logFilePath specified is not a valid file: "
          + logFilePath);
    }
    if (!logFile.canRead()) {
      throw new IllegalArgumentException("unable to read the logFilePath specified: "
          + logFilePath);
    }

    BufferedReader br = null;
    Set<String> applicationIds = new HashSet<String>();

    try {
      br = new BufferedReader(new FileReader(logFile));
      String line;

      // finds all the application IDs
      while ((line = br.readLine()) != null) {
        // String.split never returns null, so the tokens can be scanned directly
        for (String input : line.split("\\s")) {
          Matcher m = APPLICATION_ID_PATTERN.matcher(input);
          if (m.find()) {
            applicationIds.add(m.group(1));
          }
        }
      }
    } catch (IOException e) {
      log.error("Error while trying to find applicationId for log", e);
    } finally {
      try {
        if (br != null)
          br.close();
      } catch (Exception e) {
        // do nothing
      }
    }

    return applicationIds;
  }
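
  // ---------------------------------------------------------------------------------------------
  // Illustrative sketch, not part of the original class: a dry-run variant of
  // killAllSpawnedHadoopJobs that only reports what would be killed. The log path is made up.
  private static void exampleDryRunKill(Logger log) {
    Set<String> appIds = findApplicationIdFromLog("/tmp/azkaban-job.log", log);
    for (String appId : appIds) {
      log.info("would kill application: " + appId);
    }
  }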
  /**
   * <pre>
   * Uses YarnClient to kill the job on the YARN cluster.
   * Using JobClient only works partially:
   *   if the yarn container has started but the spark job hasn't, it will kill
   *   if the spark job has started, the cancel will hang until the spark job is complete
   *   if the spark job is complete, it will return immediately, with a job not found on the job tracker
   * </pre>
   *
   * @param applicationId
   * @throws IOException
   * @throws YarnException
   */
  public static void killJobOnCluster(String applicationId, Logger log) throws YarnException,
      IOException {

    YarnConfiguration yarnConf = new YarnConfiguration();
    YarnClient yarnClient = YarnClient.createYarnClient();
    yarnClient.init(yarnConf);
    yarnClient.start();

    String[] split = applicationId.split("_");
    ApplicationId aid = ApplicationId.newInstance(Long.parseLong(split[1]),
        Integer.parseInt(split[2]));

    try {
      log.info("start killing application: " + aid);
      yarnClient.killApplication(aid);
      log.info("successfully killed application: " + aid);
    } finally {
      // release the client's RPC resources even if the kill fails
      yarnClient.stop();
    }
  }

  /**
   * <pre>
   * Constructs a javaOpts string based on the Props and the key given; returns
   * String.format("-D%s=%s", key, value)
   * </pre>
   *
   * @param props
   * @param key
   * @return String.format("-D%s=%s", key, value). Throws a RuntimeException if the property is
   *         not present
   */
  public static String javaOptStringFromAzkabanProps(Props props, String key) {
    String value = props.get(key);
    if (value == null) {
      throw new RuntimeException(String.format("Cannot find property [%s] in the azkaban props",
          key));
    }
    return String.format("-D%s=%s", key, value);
  }

  /**
   * Filter a collection of String commands to match a whitelist regex and not match a blacklist
   * regex.
   *
   * @param commands Collection of commands to be filtered
   * @param whitelistRegex whitelist regex to work as inclusion criteria
   * @param blacklistRegex blacklist regex to work as exclusion criteria
   * @param log logger to report violations
   * @return filtered list of matching commands. Empty list if no command matches all the criteria
   */
  public static List<String> filterCommands(Collection<String> commands, String whitelistRegex,
      String blacklistRegex, Logger log) {
    List<String> filteredCommands = new LinkedList<String>();
    Pattern whitelistPattern = Pattern.compile(whitelistRegex);
    Pattern blacklistPattern = Pattern.compile(blacklistRegex);
    for (String command : commands) {
      if (whitelistPattern.matcher(command).matches()
          && !blacklistPattern.matcher(command).matches()) {
        filteredCommands.add(command);
      } else {
        log.warn(String.format("Removing restricted command: %s", command));
      }
    }
    return filteredCommands;
  }

  /**
   * <pre>
   * Constructs a javaOpts string based on the Hadoop Configuration and the key given; returns
   * String.format("-D%s=%s", key, value)
   * </pre>
   *
   * @param conf
   * @param key
   * @return String.format("-D%s=%s", key, value). Throws a RuntimeException if the property is
   *         not present
   */
  public static String javaOptStringFromHadoopConfiguration(Configuration conf, String key) {
    String value = conf.get(key);
    if (value == null) {
      throw new RuntimeException(String.format(
          "Cannot find property [%s] in the Hadoop configuration", key));
    }
    return String.format("-D%s=%s", key, value);
  }
}
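
/*
 * Illustrative sketch, not part of the original file: filtering candidate commands with
 * filterCommands. With the MATCH_ALL_REGEX whitelist and a hypothetical blacklist that rejects
 * anything containing "rm -rf", only the first command survives:
 *
 *   List<String> safe = HadoopJobUtils.filterCommands(
 *       Arrays.asList("hadoop fs -ls /", "rm -rf /tmp"),
 *       HadoopJobUtils.MATCH_ALL_REGEX, // whitelist: match everything
 *       ".*rm -rf.*",                   // blacklist: exclude destructive commands
 *       logger);
 *   // safe == ["hadoop fs -ls /"]
 */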