/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Map;

import cascading.flow.hadoop.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.scheme.SequenceFile;
import cascading.tap.hadoop.TapCollector;
import cascading.tap.hadoop.TapIterator;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

/**
 * Class Hfs is the base class for all Hadoop file system access. Use {@link Dfs}, {@link Lfs}, or {@link S3fs}
 * for resources specific to the Hadoop Distributed file system, the Local file system, or Amazon S3, respectively.
 * <p/>
 * Use the Hfs class if the 'kind' of resource is unknown at design time. To use, prefix a scheme to the 'stringPath',
 * where <code>hdfs://...</code> will denote Dfs, <code>file://...</code> will denote Lfs, and
 * <code>s3://aws_id:aws_secret@bucket/...</code> will denote S3fs.
 * <p/>
 * Call {@link #setTemporaryDirectory(java.util.Map, String)} to use a temporary file directory path
 * other than the current Hadoop default path.
 */
public class Hfs extends Tap {
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger(Hfs.class);
  /** Field serialVersionUID */
  private static final long serialVersionUID = 1L;
  /** Field TEMPORARY_DIRECTORY */
  private static final String TEMPORARY_DIRECTORY = "cascading.tmp.dir";

  /** Field stringPath */
  String stringPath;
  /** Field uriScheme */
  transient URI uriScheme;
  /** Field path */
  transient Path path;
  /** Field statuses */
  private transient FileStatus[] statuses;

  /**
   * Method setTemporaryDirectory sets the temporary directory on the given properties object.
   *
   * @param properties of type Map<Object,Object>
   * @param tempDir    of type String
   */
  public static void setTemporaryDirectory(Map<Object, Object> properties, String tempDir) {
    properties.put(TEMPORARY_DIRECTORY, tempDir);
  }

  /**
   * Method getTemporaryDirectory returns the configured temporary directory from the given properties object.
   *
   * @param properties of type Map<Object,Object>
   * @return a String or null if not set
   */
  public static String getTemporaryDirectory(Map<Object, Object> properties) {
    return (String) properties.get(TEMPORARY_DIRECTORY);
  }
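  /*
   * Illustrative sketch (not part of the original source): how the two static
   * helpers above cooperate. The "/tmp/cascading" path is a placeholder; the
   * properties map would typically be handed to a FlowConnector.
   *
   *   Map<Object, Object> properties = new java.util.HashMap<Object, Object>();
   *   Hfs.setTemporaryDirectory( properties, "/tmp/cascading" );
   *   assert "/tmp/cascading".equals( Hfs.getTemporaryDirectory( properties ) );
   */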
  protected Hfs() {
  }

  @ConstructorProperties({ "scheme" })
  protected Hfs(Scheme scheme) {
    super(scheme);
  }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param fields     of type Fields
   * @param stringPath of type String
   */
  @ConstructorProperties({ "fields", "stringPath" })
  public Hfs(Fields fields, String stringPath) {
    super(new SequenceFile(fields));
    setStringPath(stringPath);
  }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param fields     of type Fields
   * @param stringPath of type String
   * @param replace    of type boolean
   */
  @ConstructorProperties({ "fields", "stringPath", "replace" })
  public Hfs(Fields fields, String stringPath, boolean replace) {
    super(new SequenceFile(fields), replace ? SinkMode.REPLACE : SinkMode.KEEP);
    setStringPath(stringPath);
  }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param fields     of type Fields
   * @param stringPath of type String
   * @param sinkMode   of type SinkMode
   */
  @ConstructorProperties({ "fields", "stringPath", "sinkMode" })
  public Hfs(Fields fields, String stringPath, SinkMode sinkMode) {
    super(new SequenceFile(fields), sinkMode);
    setStringPath(stringPath);

    if (sinkMode == SinkMode.UPDATE)
      throw new IllegalArgumentException("updates are not supported");
  }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   */
  @ConstructorProperties({ "scheme", "stringPath" })
  public Hfs(Scheme scheme, String stringPath) {
    super(scheme);
    setStringPath(stringPath);
  }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   * @param replace    of type boolean
   */
  @ConstructorProperties({ "scheme", "stringPath", "replace" })
  public Hfs(Scheme scheme, String stringPath, boolean replace) {
    super(scheme, replace ? SinkMode.REPLACE : SinkMode.KEEP);
    setStringPath(stringPath);
  }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   * @param sinkMode   of type SinkMode
   */
  @ConstructorProperties({ "scheme", "stringPath", "sinkMode" })
  public Hfs(Scheme scheme, String stringPath, SinkMode sinkMode) {
    super(scheme, sinkMode);
    setStringPath(stringPath);
  }
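  /*
   * Illustrative sketch (not part of the original source): per the class
   * javadoc, the scheme prefix on 'stringPath' selects the concrete file
   * system. All host names, credentials, and paths below are placeholders.
   *
   *   Fields fields = new Fields( "line" );
   *   Tap dfs = new Hfs( fields, "hdfs://namenode:8020/logs/input" );          // resolves as Dfs
   *   Tap lfs = new Hfs( fields, "file:///var/log/input" );                    // resolves as Lfs
   *   Tap s3 = new Hfs( fields, "s3://aws_id:aws_secret@bucket/logs/input" );  // resolves as S3fs
   */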
  protected void setStringPath(String stringPath) {
    this.stringPath = Util.normalizeUrl(stringPath);
  }

  protected void setUriScheme(URI uriScheme) {
    this.uriScheme = uriScheme;
  }

  public URI getURIScheme(JobConf jobConf) throws IOException {
    if (uriScheme != null)
      return uriScheme;

    uriScheme = makeURIScheme(jobConf);

    return uriScheme;
  }

  protected URI makeURIScheme(JobConf jobConf) throws IOException {
    try {
      URI uriScheme = null;

      if (LOG.isDebugEnabled())
        LOG.debug("handling path: " + stringPath);

      URI uri = new URI(stringPath);
      String schemeString = uri.getScheme();
      String authority = uri.getAuthority();

      if (LOG.isDebugEnabled()) {
        LOG.debug("found scheme: " + schemeString);
        LOG.debug("found authority: " + authority);
      }

      if (schemeString != null && authority != null)
        uriScheme = new URI(schemeString + "://" + uri.getAuthority());
      else if (schemeString != null)
        uriScheme = new URI(schemeString + ":///");
      else
        uriScheme = getDefaultFileSystemURIScheme(jobConf);

      if (LOG.isDebugEnabled())
        LOG.debug("using uri scheme: " + uriScheme);

      return uriScheme;
    } catch (URISyntaxException exception) {
      throw new TapException("could not determine scheme from path: " + getPath(), exception);
    }
  }

  /**
   * Method getDefaultFileSystemURIScheme returns the URI scheme for the default Hadoop FileSystem.
   *
   * @param jobConf of type JobConf
   * @return URI
   * @throws IOException when the default FileSystem cannot be accessed
   */
  public URI getDefaultFileSystemURIScheme(JobConf jobConf) throws IOException {
    return getDefaultFileSystem(jobConf).getUri();
  }

  @Override
  public boolean isWriteDirect() {
    return super.isWriteDirect() || stringPath != null && stringPath.matches("(^https?://.*$)|(^s3tp://.*$)");
  }

  protected FileSystem getDefaultFileSystem(JobConf jobConf) throws IOException {
    return FileSystem.get(jobConf);
  }

  protected FileSystem getFileSystem(JobConf jobConf) throws IOException {
    return FileSystem.get(getURIScheme(jobConf), jobConf);
  }

  /** @see Tap#getPath() */
  @Override
  public Path getPath() {
    if (path != null)
      return path;

    if (stringPath == null)
      throw new IllegalStateException("path not initialized");

    path = new Path(stringPath);

    return path;
  }

  @Override
  public Path getQualifiedPath(JobConf conf) throws IOException {
    return getPath().makeQualified(getFileSystem(conf));
  }

  @Override
  public void sourceInit(JobConf conf) throws IOException {
    Path qualifiedPath = getQualifiedPath(conf);

    for (Path existingPath : FileInputFormat.getInputPaths(conf)) {
      if (existingPath.equals(qualifiedPath))
        throw new TapException("may not add duplicate paths, found: " + existingPath);
    }

    FileInputFormat.addInputPath(conf, qualifiedPath);

    super.sourceInit(conf);

    makeLocal(conf, qualifiedPath, "forcing job to local mode, via source: ");

    TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
  }

  @Override
  public void sinkInit(JobConf conf) throws IOException {
    // do not delete if initialized from within a task
    if (isReplace() && conf.get("mapred.task.partition") == null)
      deletePath(conf);

    Path qualifiedPath = getQualifiedPath(conf);

    FileOutputFormat.setOutputPath(conf, qualifiedPath);

    super.sinkInit(conf);

    makeLocal(conf, qualifiedPath, "forcing job to local mode, via sink: ");

    TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
  }
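  /*
   * Illustrative sketch (not part of the original source): sinkInit() honors
   * the SinkMode chosen at construction time, so a REPLACE sink deletes any
   * existing output (when not running inside a task) before registering the
   * path with FileOutputFormat. The path below is a placeholder.
   *
   *   Tap sink = new Hfs( new Fields( "line" ), "hdfs://namenode:8020/logs/output", SinkMode.REPLACE );
   */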
  private void makeLocal(JobConf conf, Path qualifiedPath, String infoMessage) {
    if (!conf.get("mapred.job.tracker", "").equalsIgnoreCase("local")
        && qualifiedPath.toUri().getScheme().equalsIgnoreCase("file")) {
      if (LOG.isInfoEnabled())
        LOG.info(infoMessage + toString());

      conf.set("mapred.job.tracker", "local"); // force job to run locally
    }
  }

  @Override
  public boolean makeDirs(JobConf conf) throws IOException {
    if (LOG.isDebugEnabled())
      LOG.debug("making dirs: " + getQualifiedPath(conf));

    return getFileSystem(conf).mkdirs(getPath());
  }

  @Override
  public boolean deletePath(JobConf conf) throws IOException {
    if (LOG.isDebugEnabled())
      LOG.debug("deleting: " + getQualifiedPath(conf));

    // do not delete the root directory
    if (getQualifiedPath(conf).depth() == 0)
      return true;

    FileSystem fileSystem = getFileSystem(conf);

    try {
      return fileSystem.delete(getPath(), true);
    } catch (NullPointerException exception) {
      // hack to get around npe thrown when fs reaches root directory
      if (!(fileSystem instanceof NativeS3FileSystem))
        throw exception;
    }

    return true;
  }

  @Override
  public boolean pathExists(JobConf conf) throws IOException {
    return getFileSystem(conf).exists(getPath());
  }
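  /*
   * Illustrative sketch (not part of the original source): the path management
   * methods above can be used for housekeeping outside a Flow. The path is a
   * placeholder.
   *
   *   JobConf conf = new JobConf();
   *   Hfs tap = new Hfs( new Fields( "line" ), "hdfs://namenode:8020/tmp/staging" );
   *
   *   if( tap.pathExists( conf ) )
   *     tap.deletePath( conf ); // recursive delete, guarded against the root directory
   *
   *   tap.makeDirs( conf );
   */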
  @Override
  public long getPathModified(JobConf conf) throws IOException {
    FileStatus fileStatus = getFileSystem(conf).getFileStatus(getPath());

    if (!fileStatus.isDir())
      return fileStatus.getModificationTime();

    makeStatuses(conf);

    // statuses is empty, return 0
    if (statuses == null || statuses.length == 0)
      return 0;

    long date = 0;

    // filter out directories as we don't recurse into sub dirs
    for (FileStatus status : statuses) {
      if (!status.isDir())
        date = Math.max(date, status.getModificationTime());
    }

    return date;
  }

  protected Path getTempPath(JobConf conf) {
    String tempDir = conf.get(TEMPORARY_DIRECTORY);

    if (tempDir == null)
      tempDir = conf.get("hadoop.tmp.dir");

    return new Path(tempDir);
  }

  protected String makeTemporaryPathDir(String name) {
    // a leading underscore marks a hidden file, so strip it along with other non-word characters
    name = name.replaceAll("^[_\\W\\s]+", "");

    if (name.isEmpty())
      name = "temp-path";

    return name.replaceAll("[\\W\\s]+", "_") + Integer.toString((int) (10000000 * Math.random()));
  }

  /**
   * Method makeStatuses populates the cached array of child FileStatus objects for this tap's path.
   *
   * @param conf of type JobConf
   * @throws IOException on failure
   */
  private void makeStatuses(JobConf conf) throws IOException {
    if (statuses != null)
      return;

    statuses = getFileSystem(conf).listStatus(getPath());
  }

  /** @see Object#toString() */
  @Override
  public String toString() {
    if (stringPath != null)
      return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[\"" + Util.sanitizeUrl(stringPath) + "\"]"; // sanitize
    else
      return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[not initialized]";
  }

  /** @see Tap#equals(Object) */
  @Override
  public boolean equals(Object object) {
    if (this == object)
      return true;
    if (object == null || getClass() != object.getClass())
      return false;
    if (!super.equals(object))
      return false;

    Hfs hfs = (Hfs) object;

    if (stringPath != null ? !stringPath.equals(hfs.stringPath) : hfs.stringPath != null)
      return false;

    return true;
  }

  /** @see Tap#hashCode() */
  @Override
  public int hashCode() {
    int result = super.hashCode();
    result = 31 * result + (stringPath != null ? stringPath.hashCode() : 0);
    return result;
  }

  public TupleEntryIterator openForRead(JobConf conf) throws IOException {
    Map<Object, Object> properties = HadoopUtil.createProperties(conf);

    properties.remove("mapred.input.dir");

    conf = HadoopUtil.createJobConf(properties, null);

    return new TupleEntryIterator(getSourceFields(), new TapIterator(this, conf));
  }

  public TupleEntryCollector openForWrite(JobConf conf) throws IOException {
    return new TapCollector(this, conf);
  }
}
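/*
 * Illustrative sketch (not part of the original source): because openForRead()
 * builds its own JobConf and the init methods register tuple serializations,
 * an Hfs tap can be read without constructing a Flow, per the "independent of
 * Flow" comments above. Host and path are placeholders.
 *
 *   JobConf conf = new JobConf();
 *   Hfs tap = new Hfs( new Fields( "line" ), "hdfs://namenode:8020/logs/input" );
 *   TupleEntryIterator iterator = tap.openForRead( conf );
 *
 *   while( iterator.hasNext() )
 *     System.out.println( iterator.next() );
 *
 *   iterator.close();
 */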