Java tutorial: the Cascading Hfs tap (cascading.tap.hadoop.Hfs)
/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tap.hadoop.io.CombineFileRecordReaderWrapper;
import cascading.tap.hadoop.io.HadoopTupleEntrySchemeCollector;
import cascading.tap.hadoop.io.HadoopTupleEntrySchemeIterator;
import cascading.tap.type.FileType;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
import org.apache.hadoop.mapred.lib.CombineFileRecordReader;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class Hfs is the base class for all Hadoop file system access. Hfs may only be used with the
 * Hadoop {@link cascading.flow.FlowConnector} sub-classes when creating Hadoop executable {@link cascading.flow.Flow}
 * instances.
 * <p/>
 * Paths typically should point to a directory, where in turn all the "part" files immediately in that directory will
 * be included. This is the practice Hadoop expects. Sub-directories are not included and typically result in a failure.
 * <p/>
 * To include sub-directories, Hadoop supports "globbing". Globbing is a frustrating feature and is supported more
 * robustly by {@link GlobHfs} and less so by Hfs.
 * <p/>
 * Hfs will accept {@code /*} (wildcard) paths, but not all convenience methods like
 * {@code jobConf.getSize} will behave properly or reliably. Nor can an Hfs instance
 * with a wildcard path be used as a sink to write data.
 * <p/>
 * In those cases use GlobHfs since it is a sub-class of {@link cascading.tap.MultiSourceTap}.
 * <p/>
 * Optionally use {@link Dfs} or {@link Lfs} for resources specific to the Hadoop Distributed file system or
 * the Local file system, respectively. Using Hfs is the best practice when possible; Lfs and Dfs are conveniences.
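 * <p/>
 * For example, a minimal sketch of constructing source and sink taps (the {@code TextLine} scheme and the paths
 * shown are illustrative, not prescribed by this class):
 * <pre>{@code
 * // read text lines from a directory of part files on the default (distributed) file system
 * Tap source = new Hfs(new cascading.scheme.hadoop.TextLine(), "/data/input");
 *
 * // write results, replacing the target directory if it already exists
 * Tap sink = new Hfs(new cascading.scheme.hadoop.TextLine(), "hdfs://namenode:8020/data/output", SinkMode.REPLACE);
 * }</pre>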
 * <p/>
 * Use the Hfs class if the 'kind' of resource is unknown at design time. To use, prefix a scheme to the 'stringPath',
 * where <code>hdfs://...</code> denotes Dfs and <code>file://...</code> denotes Lfs.
 * <p/>
 * Call {@link HfsProps#setTemporaryDirectory(java.util.Map, String)} to use a different temporary file directory path
 * other than the current Hadoop default path.
 * <p/>
 * By default Cascading on Hadoop will assume any source or sink Tap using the {@code file://} URI scheme
 * intends to read files from the local client filesystem (for example when using the {@code Lfs} Tap) where the Hadoop
 * job jar is started. Subsequently Cascading will force any MapReduce jobs reading or writing to {@code file://} resources
 * to run in Hadoop "standalone mode" so that the file can be read.
 * <p/>
 * To change this behavior, call {@link HfsProps#setLocalModeScheme(java.util.Map, String)} to set a different scheme value,
 * or set it to "none" to disable this entirely for the case where the file to be read is available at the exact same
 * path on every Hadoop processing node.
 * <p/>
 * When using a MapReduce planner, Hfs can optionally combine multiple small files (or a series of small "blocks") into
 * larger "splits". This reduces the number of resulting map tasks created by Hadoop and can improve application
 * performance.
 * <p/>
 * This is enabled by setting {@link HfsProps#setUseCombinedInput(boolean)} to {@code true}. By default, merging
 * or combining splits into larger ones is disabled.
 * <p/>
 * The Apache Tez planner does not require this setting; combining is supported by default and enabled by the
 * application manager.
 */
public class Hfs extends Tap<Configuration, RecordReader, OutputCollector> implements FileType<Configuration> {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger(Hfs.class);

  /** Field stringPath */
  protected String stringPath;
  /** Field uriScheme */
  transient URI uriScheme;
  /** Field path */
  transient Path path;
  /** Field statuses */
  private transient FileStatus[] statuses; // only used by getModifiedTime

  private transient String cachedPath = null;

  private static final PathFilter HIDDEN_FILES_FILTER = new PathFilter() {
    public boolean accept(Path path) {
      String name = path.getName();

      if (name.isEmpty()) // should never happen
        return true;

      char first = name.charAt(0);

      return first != '_' && first != '.';
    }
  };

  protected static String getLocalModeScheme(Configuration conf, String defaultValue) {
    return conf.get(HfsProps.LOCAL_MODE_SCHEME, defaultValue);
  }

  protected static boolean getUseCombinedInput(Configuration conf) {
    String platform = conf.get("cascading.flow.platform", "");
    boolean combineEnabled = conf.getBoolean("cascading.hadoop.hfs.combine.files", false);

    // only supported by these platforms
    if (platform.equals("hadoop") || platform.equals("hadoop2-mr1"))
      return combineEnabled;

    if (combineEnabled && !Boolean.getBoolean("cascading.hadoop.hfs.combine.files.warned")) {
      LOG.warn(
        "'cascading.hadoop.hfs.combine.files' has been set to true, but is unsupported by this platform: {}, will be ignored to prevent failures",
        platform);

      System.setProperty("cascading.hadoop.hfs.combine.files.warned", "true");
    }

    return false;
  }

  protected static boolean getCombinedInputSafeMode(Configuration conf) {
    return conf.getBoolean("cascading.hadoop.hfs.combine.safemode", true);
  }

  protected Hfs() {
  }

  @ConstructorProperties({ "scheme" })
  protected Hfs(Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme) {
    super(scheme);
  }
  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   */
  @ConstructorProperties({ "scheme", "stringPath" })
  public Hfs(Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath) {
    super(scheme);
    setStringPath(stringPath);
  }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   * @param sinkMode   of type SinkMode
   */
  @ConstructorProperties({ "scheme", "stringPath", "sinkMode" })
  public Hfs(Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath, SinkMode sinkMode) {
    super(scheme, sinkMode);
    setStringPath(stringPath);
  }

  protected void setStringPath(String stringPath) {
    this.stringPath = Util.normalizeUrl(stringPath);
  }

  protected void setUriScheme(URI uriScheme) {
    this.uriScheme = uriScheme;
  }

  public URI getURIScheme(Configuration jobConf) {
    if (uriScheme != null)
      return uriScheme;

    uriScheme = makeURIScheme(jobConf);

    return uriScheme;
  }

  protected URI makeURIScheme(Configuration configuration) {
    try {
      URI uriScheme;

      LOG.debug("handling path: {}", stringPath);

      URI uri = new Path(stringPath).toUri(); // safer URI parsing
      String schemeString = uri.getScheme();
      String authority = uri.getAuthority();

      LOG.debug("found scheme: {}, authority: {}", schemeString, authority);

      if (schemeString != null && authority != null)
        uriScheme = new URI(schemeString + "://" + uri.getAuthority());
      else if (schemeString != null)
        uriScheme = new URI(schemeString + ":///");
      else
        uriScheme = getDefaultFileSystemURIScheme(configuration);

      LOG.debug("using uri scheme: {}", uriScheme);

      return uriScheme;
    }
    catch (URISyntaxException exception) {
      throw new TapException("could not determine scheme from path: " + getPath(), exception);
    }
  }

  /**
   * Method getDefaultFileSystemURIScheme returns the URI scheme for the default Hadoop FileSystem.
   *
   * @param configuration of type Configuration
   * @return URI
   */
  public URI getDefaultFileSystemURIScheme(Configuration configuration) {
    return getDefaultFileSystem(configuration).getUri();
  }

  protected FileSystem getDefaultFileSystem(Configuration configuration) {
    try {
      return FileSystem.get(configuration);
    }
    catch (IOException exception) {
      throw new TapException("unable to get handle to underlying filesystem", exception);
    }
  }

  protected FileSystem getFileSystem(Configuration configuration) {
    URI scheme = getURIScheme(configuration);

    try {
      return FileSystem.get(scheme, configuration);
    }
    catch (IOException exception) {
      throw new TapException("unable to get handle to get filesystem for: " + scheme.getScheme(), exception);
    }
  }

  @Override
  public String getIdentifier() {
    if (cachedPath == null)
      cachedPath = getPath().toString();

    return cachedPath;
  }

  public Path getPath() {
    if (path != null)
      return path;

    if (stringPath == null)
      throw new IllegalStateException("path not initialized");

    path = new Path(stringPath);

    return path;
  }

  @Override
  public String getFullIdentifier(Configuration conf) {
    return getPath().makeQualified(getFileSystem(conf)).toString();
  }
  @Override
  public void sourceConfInit(FlowProcess<? extends Configuration> process, Configuration conf) {
    String fullIdentifier = getFullIdentifier(conf);

    applySourceConfInitIdentifiers(process, conf, fullIdentifier);

    verifyNoDuplicates(conf);
  }

  protected static void verifyNoDuplicates(Configuration conf) {
    Path[] inputPaths = FileInputFormat.getInputPaths(HadoopUtil.asJobConfInstance(conf));
    Set<Path> paths = new HashSet<Path>((int) (inputPaths.length / .75f));

    for (Path inputPath : inputPaths) {
      if (!paths.add(inputPath))
        throw new TapException("may not add duplicate paths, found: " + inputPath);
    }
  }

  protected void applySourceConfInitIdentifiers(FlowProcess<? extends Configuration> process, Configuration conf,
                                                String... fullIdentifiers) {
    for (String fullIdentifier : fullIdentifiers)
      sourceConfInitAddInputPath(conf, new Path(fullIdentifier));

    sourceConfInitComplete(process, conf);
  }

  protected void sourceConfInitAddInputPath(Configuration conf, Path qualifiedPath) {
    HadoopUtil.addInputPath(conf, qualifiedPath);

    makeLocal(conf, qualifiedPath, "forcing job to local mode, via source: ");
  }

  protected void sourceConfInitComplete(FlowProcess<? extends Configuration> process, Configuration conf) {
    super.sourceConfInit(process, conf);

    TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow

    // use CombineFileInputFormat if that is enabled
    handleCombineFileInputFormat(conf);
  }

  /**
   * Based on the configuration, handles and sets {@link CombineFileInputFormat} as the input
   * format.
   */
  private void handleCombineFileInputFormat(Configuration conf) {
    // if combining files, override the configuration to use CombineFileInputFormat
    if (!getUseCombinedInput(conf))
      return;

    // get the prescribed individual input format from the underlying scheme so it can be used by CombinedInputFormat
    String individualInputFormat = conf.get("mapred.input.format.class");

    if (individualInputFormat == null)
      throw new TapException("input format is missing from the underlying scheme");

    if (individualInputFormat.equals(CombinedInputFormat.class.getName())
      && conf.get(CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT) == null)
      throw new TapException(
        "the input format class is already the combined input format but the underlying input format is missing");

    // if safe mode is on (default) throw an exception if the InputFormat is not a FileInputFormat, otherwise log a
    // warning and don't use the CombineFileInputFormat
    boolean safeMode = getCombinedInputSafeMode(conf);

    if (!FileInputFormat.class.isAssignableFrom(conf.getClass("mapred.input.format.class", null))) {
      if (safeMode)
        throw new TapException(
          "input format must be of type org.apache.hadoop.mapred.FileInputFormat, got: " + individualInputFormat);
      else
        LOG.warn(
          "not combining input splits with CombineFileInputFormat, {} is not of type org.apache.hadoop.mapred.FileInputFormat.",
          individualInputFormat);
    }
    else {
      // set the underlying individual input format
      conf.set(CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT, individualInputFormat);

      // override the input format class
      conf.setClass("mapred.input.format.class", CombinedInputFormat.class, InputFormat.class);
    }
  }
  @Override
  public void sinkConfInit(FlowProcess<? extends Configuration> process, Configuration conf) {
    Path qualifiedPath = new Path(getFullIdentifier(conf));

    HadoopUtil.setOutputPath(conf, qualifiedPath);

    super.sinkConfInit(process, conf);

    makeLocal(conf, qualifiedPath, "forcing job to local mode, via sink: ");

    TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
  }

  private void makeLocal(Configuration conf, Path qualifiedPath, String infoMessage) {
    String scheme = getLocalModeScheme(conf, "file");

    if (!HadoopUtil.isLocal(conf) && qualifiedPath.toUri().getScheme().equalsIgnoreCase(scheme)) {
      if (LOG.isInfoEnabled())
        LOG.info(infoMessage + toString());

      HadoopUtil.setLocal(conf); // force job to run locally
    }
  }

  @Override
  public TupleEntryIterator openForRead(FlowProcess<? extends Configuration> flowProcess, RecordReader input)
    throws IOException {
    // input may be null when this method is called on the client side or cluster side when accumulating
    // for a HashJoin
    return new HadoopTupleEntrySchemeIterator(flowProcess, this, input);
  }

  @Override
  public TupleEntryCollector openForWrite(FlowProcess<? extends Configuration> flowProcess, OutputCollector output)
    throws IOException {
    // output may be null when this method is called on the client side or cluster side when creating
    // side files with the PartitionTap
    return new HadoopTupleEntrySchemeCollector(flowProcess, this, output);
  }

  @Override
  public boolean createResource(Configuration conf) throws IOException {
    if (LOG.isDebugEnabled())
      LOG.debug("making dirs: {}", getFullIdentifier(conf));

    return getFileSystem(conf).mkdirs(getPath());
  }

  @Override
  public boolean deleteResource(Configuration conf) throws IOException {
    String fullIdentifier = getFullIdentifier(conf);

    return deleteFullIdentifier(conf, fullIdentifier);
  }

  private boolean deleteFullIdentifier(Configuration conf, String fullIdentifier) throws IOException {
    if (LOG.isDebugEnabled())
      LOG.debug("deleting: {}", fullIdentifier);

    Path fullPath = new Path(fullIdentifier);

    // do not delete the root directory
    if (fullPath.depth() == 0)
      return true;

    FileSystem fileSystem = getFileSystem(conf);

    try {
      return fileSystem.delete(fullPath, true);
    }
    catch (NullPointerException exception) {
      // hack to get around npe thrown when fs reaches root directory
      // removes coupling to the new aws hadoop artifacts that may not be deployed
      if (!(fileSystem.getClass().getSimpleName().equals("NativeS3FileSystem")))
        throw exception;
    }

    return true;
  }

  public boolean deleteChildResource(FlowProcess<? extends Configuration> flowProcess, String childIdentifier)
    throws IOException {
    return deleteChildResource(flowProcess.getConfig(), childIdentifier);
  }

  public boolean deleteChildResource(Configuration conf, String childIdentifier) throws IOException {
    Path childPath = new Path(childIdentifier).makeQualified(getFileSystem(conf));

    if (!childPath.toString().startsWith(getFullIdentifier(conf)))
      return false;

    return deleteFullIdentifier(conf, childPath.toString());
  }

  @Override
  public boolean resourceExists(Configuration conf) throws IOException {
    // unfortunately getFileSystem( conf ).exists( getPath() ); does not account for "/*" etc
    // nor is there a more efficient means to test for existence
    FileStatus[] fileStatuses = getFileSystem(conf).globStatus(getPath());

    return fileStatuses != null && fileStatuses.length > 0;
  }
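  // Usage sketch: on the client side an Hfs tap can be opened and read directly, without running a Flow,
  // by passing a FlowProcess. The scheme and path below are illustrative assumptions, not values required
  // by this class:
  //
  //   FlowProcess<JobConf> flowProcess = new cascading.flow.hadoop.HadoopFlowProcess(new JobConf());
  //   Hfs tap = new Hfs(new cascading.scheme.hadoop.TextLine(), "hdfs://namenode:8020/data/input");
  //
  //   TupleEntryIterator iterator = tap.openForRead(flowProcess); // delegates to openForRead(flowProcess, null)
  //
  //   while (iterator.hasNext())
  //     System.out.println(iterator.next());
  //
  //   iterator.close();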
  @Override
  public boolean isDirectory(FlowProcess<? extends Configuration> flowProcess) throws IOException {
    return isDirectory(flowProcess.getConfig());
  }

  @Override
  public boolean isDirectory(Configuration conf) throws IOException {
    if (!resourceExists(conf))
      return false;

    return getFileSystem(conf).getFileStatus(getPath()).isDir();
  }

  @Override
  public long getSize(FlowProcess<? extends Configuration> flowProcess) throws IOException {
    return getSize(flowProcess.getConfig());
  }

  @Override
  public long getSize(Configuration conf) throws IOException {
    if (!resourceExists(conf))
      return 0;

    FileStatus fileStatus = getFileSystem(conf).getFileStatus(getPath());

    if (fileStatus.isDir())
      return 0;

    return getFileSystem(conf).getFileStatus(getPath()).getLen();
  }

  /**
   * Method getBlockSize returns the {@code blocksize} specified by the underlying file system for this resource.
   *
   * @param flowProcess of type FlowProcess
   * @return long
   * @throws IOException if the underlying file system cannot be queried
   */
  public long getBlockSize(FlowProcess<? extends Configuration> flowProcess) throws IOException {
    return getBlockSize(flowProcess.getConfig());
  }

  /**
   * Method getBlockSize returns the {@code blocksize} specified by the underlying file system for this resource.
   *
   * @param conf of type Configuration
   * @return long
   * @throws IOException if the underlying file system cannot be queried
   */
  public long getBlockSize(Configuration conf) throws IOException {
    if (!resourceExists(conf))
      return 0;

    FileStatus fileStatus = getFileSystem(conf).getFileStatus(getPath());

    if (fileStatus.isDir())
      return 0;

    return fileStatus.getBlockSize();
  }

  /**
   * Method getReplication returns the {@code replication} specified by the underlying file system for
   * this resource.
   *
   * @param flowProcess of type FlowProcess
   * @return int
   * @throws IOException if the underlying file system cannot be queried
   */
  public int getReplication(FlowProcess<? extends Configuration> flowProcess) throws IOException {
    return getReplication(flowProcess.getConfig());
  }

  /**
   * Method getReplication returns the {@code replication} specified by the underlying file system for
   * this resource.
   *
   * @param conf of type Configuration
   * @return int
   * @throws IOException if the underlying file system cannot be queried
   */
  public int getReplication(Configuration conf) throws IOException {
    if (!resourceExists(conf))
      return 0;

    FileStatus fileStatus = getFileSystem(conf).getFileStatus(getPath());

    if (fileStatus.isDir())
      return 0;

    return fileStatus.getReplication();
  }

  @Override
  public String[] getChildIdentifiers(FlowProcess<? extends Configuration> flowProcess) throws IOException {
    return getChildIdentifiers(flowProcess.getConfig(), 1, false);
  }

  @Override
  public String[] getChildIdentifiers(Configuration conf) throws IOException {
    return getChildIdentifiers(conf, 1, false);
  }

  @Override
  public String[] getChildIdentifiers(FlowProcess<? extends Configuration> flowProcess, int depth, boolean fullyQualified)
    throws IOException {
    return getChildIdentifiers(flowProcess.getConfig(), depth, fullyQualified);
  }

  @Override
  public String[] getChildIdentifiers(Configuration conf, int depth, boolean fullyQualified) throws IOException {
    if (!resourceExists(conf))
      return new String[0];

    if (depth == 0 && !fullyQualified)
      return new String[] { getIdentifier() };

    String fullIdentifier = getFullIdentifier(conf);

    int trim = fullyQualified ? 0 : fullIdentifier.length() + 1;
    Set<String> results = new LinkedHashSet<String>();

    getChildPaths(conf, results, trim, new Path(fullIdentifier), depth);

    return results.toArray(new String[results.size()]);
  }

  private void getChildPaths(Configuration conf, Set<String> results, int trim, Path path, int depth) throws IOException {
    if (depth == 0) {
      String substring = path.toString().substring(trim);
      String identifier = getIdentifier();

      if (identifier == null || identifier.isEmpty())
        results.add(new Path(substring).toString());
      else
        results.add(new Path(identifier, substring).toString());

      return;
    }

    FileStatus[] statuses = getFileSystem(conf).listStatus(path, HIDDEN_FILES_FILTER);

    if (statuses == null)
      return;

    for (FileStatus fileStatus : statuses)
      getChildPaths(conf, results, trim, fileStatus.getPath(), depth - 1);
  }

  @Override
  public long getModifiedTime(Configuration conf) throws IOException {
    if (!resourceExists(conf))
      return 0;

    FileStatus fileStatus = getFileSystem(conf).getFileStatus(getPath());

    if (!fileStatus.isDir())
      return fileStatus.getModificationTime();

    // todo: this should ignore the _temporary path, or not cache if found in the array
    makeStatuses(conf);

    // statuses is empty, return 0
    if (statuses == null || statuses.length == 0)
      return 0;

    long date = 0;

    // filter out directories as we don't recurse into sub-directories
    for (FileStatus status : statuses) {
      if (!status.isDir())
        date = Math.max(date, status.getModificationTime());
    }

    return date;
  }

  public static Path getTempPath(Configuration conf) {
    String tempDir = conf.get(HfsProps.TEMPORARY_DIRECTORY);

    if (tempDir == null)
      tempDir = conf.get("hadoop.tmp.dir");

    return new Path(tempDir);
  }

  protected String makeTemporaryPathDirString(String name) {
    // _ is treated as a hidden file, so wipe them out
    name = name.replaceAll("^[_\\W\\s]+", "");

    if (name.isEmpty())
      name = "temp-path";

    return name.replaceAll("[\\W\\s]+", "_") + Util.createUniqueID();
  }

  /**
   * Given the configured file system, populates the cached array of {@link FileStatus} instances for this tap's path.
   *
   * @param conf of type Configuration
   * @throws IOException on failure
   */
  private void makeStatuses(Configuration conf) throws IOException {
    if (statuses != null)
      return;

    statuses = getFileSystem(conf).listStatus(getPath());
  }

  /**
   * Method resetFileStatuses removes the status cache, if any.
   */
  public void resetFileStatuses() {
    statuses = null;
  }

  /**
   * Combined input format that uses the underlying individual input format to combine multiple files into a
   * single split.
   */
  static class CombinedInputFormat extends CombineFileInputFormat implements Configurable {
    private Configuration conf;

    public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
      return new CombineFileRecordReader(job, (CombineFileSplit) split, reporter, CombineFileRecordReaderWrapper.class);
    }

    @Override
    public void setConf(Configuration conf) {
      this.conf = conf;

      // set the aliased property value, if zero, the super class will look up the hadoop property
      setMaxSplitSize(conf.getLong("cascading.hadoop.hfs.combine.max.size", 0));
    }

    @Override
    public Configuration getConf() {
      return conf;
    }
  }
}
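Putting it together, the sketch below shows how Hfs taps are typically wired into a Flow. It is a minimal, illustrative example: the TextLine scheme, the hdfs:// paths, and the Hadoop2MR1FlowConnector (from the cascading-hadoop2-mr1 artifact, matching the "hadoop2-mr1" platform checked above) are assumptions; the HfsProps calls and the "cascading.hadoop.hfs.combine.files" property key are the ones referenced by the Hfs source above.

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop2.Hadoop2MR1FlowConnector; // assumed connector for the hadoop2-mr1 platform
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.HfsProps;

public class HfsExample {
  public static void main(String[] args) {
    Properties properties = new Properties();

    // optional Hfs settings described in the class javadoc; values are illustrative
    HfsProps.setTemporaryDirectory(properties, "/tmp/cascading"); // alternate temporary directory
    HfsProps.setLocalModeScheme(properties, "none");              // never force local mode for file:// paths

    // enable combining of small input files into larger splits (MapReduce planners only);
    // this is the property key read by Hfs.getUseCombinedInput above
    properties.setProperty("cascading.hadoop.hfs.combine.files", "true");

    // source and sink taps; paths are illustrative, and the sink replaces any existing output
    Tap source = new Hfs(new TextLine(), "hdfs://namenode:8020/data/input");
    Tap sink = new Hfs(new TextLine(), "hdfs://namenode:8020/data/output", SinkMode.REPLACE);

    Pipe pipe = new Pipe("copy"); // identity assembly: copies source tuples to the sink

    FlowDef flowDef = FlowDef.flowDef()
      .setName("hfs-example")
      .addSource(pipe, source)
      .addTailSink(pipe, sink);

    Flow flow = new Hadoop2MR1FlowConnector(properties).connect(flowDef);
    flow.complete();
  }
}

Because Hfs derives the file system from the path prefix, swapping "hdfs://..." for "file://..." in the example above switches the tap to the local client file system, subject to the local-mode behavior controlled by HfsProps.setLocalModeScheme.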