Java tutorial
/* * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cascading.flow.hadoop.util; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.lang.reflect.Constructor; import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; import java.net.URI; import java.net.URL; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.jar.Attributes; import java.util.jar.Manifest; import cascading.CascadingException; import cascading.flow.FlowException; import cascading.flow.planner.BaseFlowStep; import cascading.flow.planner.PlatformInfo; import cascading.flow.planner.Scope; import cascading.pipe.Group; import cascading.scheme.hadoop.TextLine; import cascading.tap.hadoop.Hfs; import cascading.tuple.Fields; import cascading.util.LogUtil; import cascading.util.Util; import org.apache.commons.codec.binary.Base64; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static cascading.util.Util.invokeInstanceMethod; /** * */ public class HadoopUtil { public static final String CASCADING_FLOW_EXECUTING = "cascading.flow.executing"; private static final Logger LOG = LoggerFactory.getLogger(HadoopUtil.class); private static final String ENCODING = "US-ASCII"; private static final Class<?> DEFAULT_OBJECT_SERIALIZER = JavaObjectSerializer.class; private static PlatformInfo platformInfo; public static void setIsInflow(Configuration conf) { conf.setBoolean(CASCADING_FLOW_EXECUTING, true); } public static boolean isInflow(Configuration conf) { return conf.getBoolean(CASCADING_FLOW_EXECUTING, false); } public static void initLog4j(JobConf configuration) { initLog4j((Configuration) configuration); } public static void initLog4j(Configuration configuration) { String values = configuration.get("log4j.logger", null); if (values == null || values.length() == 0) return; if (!Util.hasClass("org.apache.log4j.Logger")) { LOG.info( "org.apache.log4j.Logger is not in the current CLASSPATH, not setting log4j.logger properties"); return; } String[] elements = values.split(","); for (String element : elements) LogUtil.setLog4jLevel(element.split("=")); } // only place JobConf should ever be returned public static JobConf asJobConfInstance(Configuration configuration) { if (configuration instanceof JobConf) return (JobConf) configuration; return new JobConf(configuration); } public static <C> C copyJobConf(C parentJobConf) { return copyConfiguration(parentJobConf); } public static JobConf copyJobConf(JobConf parentJobConf) { if (parentJobConf == null) throw new IllegalArgumentException("parent may not be null"); // see https://github.com/Cascading/cascading/pull/21 // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in // case those Credentials are mutated later on down the road (which they will be, during job submission, in // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing. final Configuration configurationCopy = new Configuration(parentJobConf); final JobConf jobConf = new JobConf(configurationCopy); jobConf.getCredentials().addAll(parentJobConf.getCredentials()); return jobConf; } public static JobConf createJobConf(Map<Object, Object> properties, JobConf defaultJobconf) { JobConf jobConf = defaultJobconf == null ? new JobConf() : copyJobConf(defaultJobconf); if (properties == null) return jobConf; return copyConfiguration(properties, jobConf); } public static <C> C copyConfiguration(C parent) { if (parent == null) throw new IllegalArgumentException("parent may not be null"); if (!(parent instanceof Configuration)) throw new IllegalArgumentException("parent must be of type Configuration"); Configuration conf = (Configuration) parent; // see https://github.com/Cascading/cascading/pull/21 // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in // case those Credentials are mutated later on down the road (which they will be, during job submission, in // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing. Configuration configurationCopy = new Configuration(conf); Configuration copiedConf = callCopyConstructor(parent.getClass(), configurationCopy); if (Util.hasInstanceMethod(parent, "getCredentials", null)) { Object result = invokeInstanceMethod(parent, "getCredentials", null, null); Object credentials = invokeInstanceMethod(copiedConf, "getCredentials", null, null); invokeInstanceMethod(credentials, "addAll", new Object[] { result }, new Class[] { credentials.getClass() }); } return (C) copiedConf; } protected static <C extends Configuration> C callCopyConstructor(Class type, Configuration parent) { try { Constructor<C> constructor = type.getConstructor(parent.getClass()); return constructor.newInstance(parent); } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException exception) { throw new CascadingException("unable to create copy of: " + type); } } public static <C extends Configuration> C copyConfiguration(Map<Object, Object> srcProperties, C dstConfiguration) { Set<Object> keys = new HashSet<Object>(srcProperties.keySet()); // keys will only be grabbed if both key/value are String, so keep orig keys if (srcProperties instanceof Properties) keys.addAll(((Properties) srcProperties).stringPropertyNames()); for (Object key : keys) { Object value = srcProperties.get(key); if (value == null && srcProperties instanceof Properties && key instanceof String) value = ((Properties) srcProperties).getProperty((String) key); if (value == null) // don't stuff null values continue; // don't let these objects pass, even though toString is called below. if (value instanceof Class || value instanceof JobConf) continue; dstConfiguration.set(key.toString(), value.toString()); } return dstConfiguration; } public static Map<Object, Object> createProperties(Configuration jobConf) { Map<Object, Object> properties = new HashMap<Object, Object>(); if (jobConf == null) return properties; for (Map.Entry<String, String> entry : jobConf) properties.put(entry.getKey(), entry.getValue()); return properties; } public static Thread getHDFSShutdownHook() { Exception caughtException; try { // we must init the FS so the finalizer is registered FileSystem.getLocal(new JobConf()); Field field = FileSystem.class.getDeclaredField("clientFinalizer"); field.setAccessible(true); Thread finalizer = (Thread) field.get(null); if (finalizer != null) Runtime.getRuntime().removeShutdownHook(finalizer); return finalizer; } catch (NoSuchFieldException exception) { caughtException = exception; } catch (IllegalAccessException exception) { caughtException = exception; } catch (IOException exception) { caughtException = exception; } LOG.debug("unable to find and remove client hdfs shutdown hook, received exception: {}", caughtException.getClass().getName()); return null; } public static String encodeBytes(byte[] bytes) { try { return new String(Base64.encodeBase64(bytes), ENCODING); } catch (UnsupportedEncodingException exception) { throw new RuntimeException(exception); } } public static byte[] decodeBytes(String string) { try { byte[] bytes = string.getBytes(ENCODING); return Base64.decodeBase64(bytes); } catch (UnsupportedEncodingException exception) { throw new RuntimeException(exception); } } public static <T> ObjectSerializer instantiateSerializer(Configuration conf, Class<T> type) throws ClassNotFoundException { Class<ObjectSerializer> flowSerializerClass; String serializerClassName = conf.get(ObjectSerializer.OBJECT_SERIALIZER_PROPERTY); if (serializerClassName == null || serializerClassName.length() == 0) flowSerializerClass = (Class<ObjectSerializer>) DEFAULT_OBJECT_SERIALIZER; else flowSerializerClass = (Class<ObjectSerializer>) Class.forName(serializerClassName); ObjectSerializer objectSerializer; try { objectSerializer = flowSerializerClass.newInstance(); if (objectSerializer instanceof Configurable) ((Configurable) objectSerializer).setConf(conf); } catch (Exception exception) { exception.printStackTrace(); throw new IllegalArgumentException("Unable to instantiate serializer \"" + flowSerializerClass.getName() + "\" for class: " + type.getName()); } if (!objectSerializer.accepts(type)) throw new IllegalArgumentException( serializerClassName + " won't accept objects of class " + type.toString()); return objectSerializer; } public static <T> String serializeBase64(T object, Configuration conf) throws IOException { return serializeBase64(object, conf, true); } public static <T> String serializeBase64(T object, Configuration conf, boolean compress) throws IOException { ObjectSerializer objectSerializer; try { objectSerializer = instantiateSerializer(conf, object.getClass()); } catch (ClassNotFoundException exception) { throw new IOException(exception); } return encodeBytes(objectSerializer.serialize(object, compress)); } /** * This method deserializes the Base64 encoded String into an Object instance. * * @param string * @return an Object */ public static <T> T deserializeBase64(String string, Configuration conf, Class<T> type) throws IOException { return deserializeBase64(string, conf, type, true); } public static <T> T deserializeBase64(String string, Configuration conf, Class<T> type, boolean decompress) throws IOException { if (string == null || string.length() == 0) return null; ObjectSerializer objectSerializer; try { objectSerializer = instantiateSerializer(conf, type); } catch (ClassNotFoundException exception) { throw new IOException(exception); } return objectSerializer.deserialize(decodeBytes(string), type, decompress); } public static Class findMainClass(Class defaultType) { return Util.findMainClass(defaultType, "org.apache.hadoop"); } public static Map<String, String> getConfig(Configuration defaultConf, Configuration updatedConf) { Map<String, String> configs = new HashMap<String, String>(); for (Map.Entry<String, String> entry : updatedConf) configs.put(entry.getKey(), entry.getValue()); for (Map.Entry<String, String> entry : defaultConf) { if (entry.getValue() == null) continue; String updatedValue = configs.get(entry.getKey()); // if both null, lets purge from map to save space if (updatedValue == null && entry.getValue() == null) configs.remove(entry.getKey()); // if the values are the same, lets also purge from map to save space if (updatedValue != null && updatedValue.equals(entry.getValue())) configs.remove(entry.getKey()); configs.remove("mapred.working.dir"); configs.remove("mapreduce.job.working.dir"); // hadoop2 } return configs; } public static JobConf[] getJobConfs(Configuration job, List<Map<String, String>> configs) { JobConf[] jobConfs = new JobConf[configs.size()]; for (int i = 0; i < jobConfs.length; i++) jobConfs[i] = (JobConf) mergeConf(job, configs.get(i), false); return jobConfs; } public static <J extends Configuration> J mergeConf(J job, Map<String, String> config, boolean directly) { Configuration currentConf = directly ? job : (job instanceof JobConf ? copyJobConf((JobConf) job) : new Configuration(job)); for (String key : config.keySet()) { LOG.debug("merging key: {} value: {}", key, config.get(key)); currentConf.set(key, config.get(key)); } return (J) currentConf; } public static Configuration removePropertiesFrom(Configuration jobConf, String... keys) { Map<Object, Object> properties = createProperties(jobConf); for (String key : keys) properties.remove(key); return copyConfiguration(properties, new JobConf()); } public static boolean removeStateFromDistCache(Configuration conf, String path) throws IOException { return new Hfs(new TextLine(), path).deleteResource(conf); } public static PlatformInfo getPlatformInfo() { if (platformInfo == null) platformInfo = getPlatformInfoInternal(); return platformInfo; } private static PlatformInfo getPlatformInfoInternal() { URL url = JobConf.class.getResource(JobConf.class.getSimpleName() + ".class"); if (url == null || !url.toString().startsWith("jar")) return new PlatformInfo("Hadoop", null, null); String path = url.toString(); String manifestPath = path.substring(0, path.lastIndexOf("!") + 1) + "/META-INF/MANIFEST.MF"; Manifest manifest; try { manifest = new Manifest(new URL(manifestPath).openStream()); } catch (IOException exception) { LOG.warn("unable to get manifest from {}", manifestPath, exception); return new PlatformInfo("Hadoop", null, null); } Attributes attributes = manifest.getAttributes("org/apache/hadoop"); if (attributes == null) { LOG.debug("unable to get Hadoop manifest attributes"); return new PlatformInfo("Hadoop", null, null); } String vendor = attributes.getValue("Implementation-Vendor"); String version = attributes.getValue("Implementation-Version"); return new PlatformInfo("Hadoop", vendor, version); } /** * Add to class path. * * @param config the config * @param classpath the classpath */ public static Map<Path, Path> addToClassPath(Configuration config, List<String> classpath) { if (classpath == null) return null; // given to fully qualified Map<String, Path> localPaths = new HashMap<String, Path>(); Map<String, Path> remotePaths = new HashMap<String, Path>(); resolvePaths(config, classpath, null, null, localPaths, remotePaths); try { LocalFileSystem localFS = getLocalFS(config); for (String path : localPaths.keySet()) { // only add local if no remote if (remotePaths.containsKey(path)) continue; Path artifact = localPaths.get(path); DistributedCache.addFileToClassPath(artifact.makeQualified(localFS), config); } FileSystem defaultFS = getDefaultFS(config); for (String path : remotePaths.keySet()) { // always add remote Path artifact = remotePaths.get(path); DistributedCache.addFileToClassPath(artifact.makeQualified(defaultFS), config); } } catch (IOException exception) { throw new FlowException("unable to set distributed cache paths", exception); } return getCommonPaths(localPaths, remotePaths); } /** * Copies paths from one local path to a remote path. If syncTimes is true, both modification and access time are * changed to match the local 'from' path. * <p/> * Returns a map of file-name to remote modification times if the remote time is different than the local time. * * @param config * @param commonPaths * @param syncTimes */ public static Map<String, Long> syncPaths(Configuration config, Map<Path, Path> commonPaths, boolean syncTimes) { if (commonPaths == null) return Collections.emptyMap(); Map<String, Long> timestampMap = new HashMap<>(); Map<Path, Path> copyPaths = getCopyPaths(config, commonPaths); // tests remote file existence or if stale LocalFileSystem localFS = getLocalFS(config); FileSystem remoteFS = getDefaultFS(config); for (Map.Entry<Path, Path> entry : copyPaths.entrySet()) { Path localPath = entry.getKey(); Path remotePath = entry.getValue(); try { LOG.info("copying from: {}, to: {}", localPath, remotePath); remoteFS.copyFromLocalFile(localPath, remotePath); if (!syncTimes) { timestampMap.put(remotePath.getName(), remoteFS.getFileStatus(remotePath).getModificationTime()); continue; } } catch (IOException exception) { throw new FlowException("unable to copy local: " + localPath + " to remote: " + remotePath, exception); } FileStatus localFileStatus = null; try { // sync the modified times so we can lazily upload jars to hdfs after job is started // otherwise modified time will be local to hdfs localFileStatus = localFS.getFileStatus(localPath); remoteFS.setTimes(remotePath, localFileStatus.getModificationTime(), -1); // don't set the access time } catch (IOException exception) { LOG.info( "unable to set local modification time on remote file: {}, 'dfs.namenode.accesstime.precision' may be set to 0 on HDFS.", remotePath); if (localFileStatus != null) timestampMap.put(remotePath.getName(), localFileStatus.getModificationTime()); } } return timestampMap; } public static Map<Path, Path> getCommonPaths(Map<String, Path> localPaths, Map<String, Path> remotePaths) { Map<Path, Path> commonPaths = new HashMap<Path, Path>(); for (Map.Entry<String, Path> entry : localPaths.entrySet()) { if (remotePaths.containsKey(entry.getKey())) commonPaths.put(entry.getValue(), remotePaths.get(entry.getKey())); } return commonPaths; } private static Map<Path, Path> getCopyPaths(Configuration config, Map<Path, Path> commonPaths) { Map<Path, Path> copyPaths = new HashMap<Path, Path>(); FileSystem remoteFS = getDefaultFS(config); FileSystem localFS = getLocalFS(config); for (Map.Entry<Path, Path> entry : commonPaths.entrySet()) { Path localPath = entry.getKey(); Path remotePath = entry.getValue(); try { boolean localExists = localFS.exists(localPath); boolean remoteExist = remoteFS.exists(remotePath); if (localExists && !remoteExist) { copyPaths.put(localPath, remotePath); } else if (localExists) { long localModTime = localFS.getFileStatus(localPath).getModificationTime(); long remoteModTime = remoteFS.getFileStatus(remotePath).getModificationTime(); if (localModTime > remoteModTime) copyPaths.put(localPath, remotePath); } } catch (IOException exception) { throw new FlowException("unable to get handle to underlying filesystem", exception); } } return copyPaths; } public static void resolvePaths(Configuration config, Collection<String> classpath, String remoteRoot, String resourceSubPath, Map<String, Path> localPaths, Map<String, Path> remotePaths) { FileSystem defaultFS = getDefaultFS(config); FileSystem localFS = getLocalFS(config); Path remoteRootPath = new Path(remoteRoot == null ? "./.staging" : remoteRoot); if (resourceSubPath != null) remoteRootPath = new Path(remoteRootPath, resourceSubPath); remoteRootPath = defaultFS.makeQualified(remoteRootPath); boolean defaultIsLocal = defaultFS.equals(localFS); for (String stringPath : classpath) { Path path = new Path(stringPath); URI uri = path.toUri(); if (uri.getScheme() == null && !defaultIsLocal) // we want to sync { Path localPath = localFS.makeQualified(path); if (!exists(localFS, localPath)) throw new FlowException("path not found: " + localPath); String name = localPath.getName(); if (resourceSubPath != null) name = resourceSubPath + "/" + name; localPaths.put(name, localPath); remotePaths.put(name, defaultFS.makeQualified(new Path(remoteRootPath, path.getName()))); } else if (localFS.equals(getFileSystem(config, path))) { if (!exists(localFS, path)) throw new FlowException("path not found: " + path); Path localPath = localFS.makeQualified(path); String name = localPath.getName(); if (resourceSubPath != null) name = resourceSubPath + "/" + name; localPaths.put(name, localPath); } else { if (!exists(defaultFS, path)) throw new FlowException("path not found: " + path); Path defaultPath = defaultFS.makeQualified(path); String name = defaultPath.getName(); if (resourceSubPath != null) name = resourceSubPath + "/" + name; remotePaths.put(name, defaultPath); } } } private static boolean exists(FileSystem fileSystem, Path path) { try { return fileSystem.exists(path); } catch (IOException exception) { throw new FlowException("could not test file exists: " + path); } } private static FileSystem getFileSystem(Configuration config, Path path) { try { return path.getFileSystem(config); } catch (IOException exception) { throw new FlowException("unable to get handle to underlying filesystem", exception); } } public static LocalFileSystem getLocalFS(Configuration config) { try { return FileSystem.getLocal(config); } catch (IOException exception) { throw new FlowException("unable to get handle to underlying filesystem", exception); } } public static FileSystem getDefaultFS(Configuration config) { try { return FileSystem.get(config); } catch (IOException exception) { throw new FlowException("unable to get handle to underlying filesystem", exception); } } public static boolean isLocal(Configuration conf) { // hadoop 1.0 and 2.0 use different properties to define local mode: we check the new YARN // property first String frameworkName = conf.get("mapreduce.framework.name"); // we are running on hadoop 2.0 (YARN) if (frameworkName != null) return frameworkName.equals("local"); // for Tez String tezLocal = conf.get("tez.local.mode"); if (tezLocal != null) return tezLocal.equals("true"); // hadoop 1.0: use the old property to determine the local mode return conf.get("mapred.job.tracker").equals("local"); } public static boolean isYARN(Configuration conf) { return conf.get("mapreduce.framework.name") != null; } public static void setLocal(Configuration conf) { // set both properties to local conf.set("mapred.job.tracker", "local"); // yarn conf.set("mapreduce.framework.name", "local"); // tez conf.set("tez.local.mode", "true"); conf.set("tez.runtime.optimize.local.fetch", "true"); } public static boolean setNewApi(Configuration conf, String className) { if (className == null) // silently return and let the error be caught downstream return false; boolean isStable = className.startsWith("org.apache.hadoop.mapred."); boolean isNew = className.startsWith("org.apache.hadoop.mapreduce."); if (isStable) conf.setBoolean("mapred.mapper.new-api", false); else if (isNew) conf.setBoolean("mapred.mapper.new-api", true); else throw new IllegalStateException( "cannot determine if class denotes stable or new api, please set 'mapred.mapper.new-api' to the appropriate value"); return true; } public static void addInputPath(Configuration conf, Path path) { Path workingDirectory = getWorkingDirectory(conf); path = new Path(workingDirectory, path); String dirStr = StringUtils.escapeString(path.toString()); String dirs = conf.get("mapred.input.dir"); conf.set("mapred.input.dir", dirs == null ? dirStr : dirs + StringUtils.COMMA_STR + dirStr); } public static void setOutputPath(Configuration conf, Path path) { Path workingDirectory = getWorkingDirectory(conf); path = new Path(workingDirectory, path); conf.set("mapred.output.dir", path.toString()); } private static Path getWorkingDirectory(Configuration conf) { String name = conf.get("mapred.working.dir"); if (name != null) { return new Path(name); } else { try { Path dir = FileSystem.get(conf).getWorkingDirectory(); conf.set("mapred.working.dir", dir.toString()); return dir; } catch (IOException e) { throw new RuntimeException(e); } } } public static Path getOutputPath(Configuration conf) { String name = conf.get("mapred.output.dir"); return name == null ? null : new Path(name); } public static String pack(Object object, Configuration conf) { if (object == null) return ""; try { return serializeBase64(object, conf, true); } catch (IOException exception) { throw new FlowException("unable to pack object: " + object.getClass().getCanonicalName(), exception); } } public static void addComparators(Configuration conf, String property, Map<String, Fields> map, BaseFlowStep flowStep, Group group) { Iterator<Fields> fieldsIterator = map.values().iterator(); if (!fieldsIterator.hasNext()) return; Fields fields = fieldsIterator.next(); if (fields.hasComparators()) { conf.set(property, pack(fields, conf)); return; } // use resolved fields if there are no comparators. Set<Scope> previousScopes = flowStep.getPreviousScopes(group); fields = previousScopes.iterator().next().getOutValuesFields(); if (fields.size() != 0) // allows fields.UNKNOWN to be used conf.setInt(property + ".size", fields.size()); } }