Java tutorial
/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.tap.hadoop; import java.io.IOException; import java.net.URI; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import cascading.tap.Tap; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.log4j.Logger; public class Hadoop18TapUtil { private static final Logger LOG = Logger.getLogger(Hadoop18TapUtil.class); private static final String TEMPORARY_PATH = "_temporary"; private static Map<String, AtomicInteger> pathCounts = new HashMap<String, AtomicInteger>(); /** * should only be called if not in a Flow * * @param conf * @throws IOException */ public static void setupJob(JobConf conf) throws IOException { Path outputPath = FileOutputFormat.getOutputPath(conf); if (outputPath == null) return; if (getFSSafe(conf, outputPath) == null) return; if (conf.get("mapred.task.id") == null) // need to stuff a fake id { String mapper = conf.getBoolean("mapred.task.is.map", true) ? "m" : "r"; conf.set("mapred.task.id", String.format("attempt_%012d_0000_%s_000000_0", (int) Math.rint(System.currentTimeMillis()), mapper)); } makeTempPath(conf); if (writeDirectlyToWorkingPath(conf, outputPath)) { LOG.info("writing directly to output path: " + outputPath); setWorkOutputPath(conf, outputPath); return; } // "mapred.work.output.dir" Path taskOutputPath = getTaskOutputPath(conf); setWorkOutputPath(conf, taskOutputPath); } static synchronized void setupTask(JobConf conf) throws IOException { String workpath = conf.get("mapred.work.output.dir"); if (workpath == null) return; FileSystem fs = getFSSafe(conf, new Path(workpath)); if (fs == null) return; String taskId = conf.get("mapred.task.id"); LOG.info("setting up task: '" + taskId + "' - " + workpath); AtomicInteger integer = pathCounts.get(workpath); if (integer == null) { integer = new AtomicInteger(); pathCounts.put(workpath, integer); } integer.incrementAndGet(); } public static boolean needsTaskCommit(JobConf conf) throws IOException { String workpath = conf.get("mapred.work.output.dir"); if (workpath == null) return false; Path taskOutputPath = new Path(workpath); if (taskOutputPath != null) { FileSystem fs = getFSSafe(conf, taskOutputPath); if (fs == null) return false; if (fs.exists(taskOutputPath)) return true; } return false; } /** * copies all files from the taskoutputpath to the outputpath * * @param conf */ public static void commitTask(JobConf conf) throws IOException { Path taskOutputPath = new Path(conf.get("mapred.work.output.dir")); FileSystem fs = getFSSafe(conf, taskOutputPath); if (fs == null) return; AtomicInteger integer = pathCounts.get(taskOutputPath.toString()); if (integer.decrementAndGet() != 0) return; String taskId = conf.get("mapred.task.id"); LOG.info("committing task: '" + taskId + "' - " + taskOutputPath); if (taskOutputPath != null) { if (writeDirectlyToWorkingPath(conf, taskOutputPath)) return; if (fs.exists(taskOutputPath)) { Path jobOutputPath = taskOutputPath.getParent().getParent(); // Move the task outputs to their final place moveTaskOutputs(conf, fs, jobOutputPath, taskOutputPath); // Delete the temporary task-specific output directory if (!fs.delete(taskOutputPath, true)) LOG.info("failed to delete the temporary output directory of task: '" + taskId + "' - " + taskOutputPath); LOG.info("saved output of task '" + taskId + "' to " + jobOutputPath); } } } /** * Called from flow step to remove temp dirs * * @param conf * @throws IOException */ public static void cleanupTap(JobConf conf, Tap tap) throws IOException { cleanTempPath(conf, tap.getPath()); } /** * May only be called once. should only be called if not in a flow * * @param conf */ static void cleanupJob(JobConf conf) throws IOException { if (isInflow(conf)) return; Path outputPath = FileOutputFormat.getOutputPath(conf); cleanTempPath(conf, outputPath); } private static synchronized void cleanTempPath(JobConf conf, Path outputPath) throws IOException { // do the clean up of temporary directory if (outputPath != null) { FileSystem fileSys = getFSSafe(conf, outputPath); if (fileSys == null) return; if (!fileSys.exists(outputPath)) return; Path tmpDir = new Path(outputPath, TEMPORARY_PATH); LOG.info("deleting temp path " + tmpDir); if (fileSys.exists(tmpDir)) fileSys.delete(tmpDir, true); } } private static FileSystem getFSSafe(JobConf conf, Path tmpDir) { try { return tmpDir.getFileSystem(conf); } catch (IOException e) { // ignore } return null; } static boolean isInflow(JobConf conf) { return conf.get("cascading.flow.step") != null; } private static Path getTaskOutputPath(JobConf conf) { String taskId = conf.get("mapred.task.id"); Path p = new Path(FileOutputFormat.getOutputPath(conf), TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId); try { FileSystem fs = p.getFileSystem(conf); return p.makeQualified(fs); } catch (IOException ie) { return p; } } static void setWorkOutputPath(JobConf conf, Path outputDir) { outputDir = new Path(conf.getWorkingDirectory(), outputDir); conf.set("mapred.work.output.dir", outputDir.toString()); } public static void makeTempPath(JobConf conf) throws IOException { // create job specific temporary directory in output path Path outputPath = FileOutputFormat.getOutputPath(conf); if (outputPath != null) { Path tmpDir = new Path(outputPath, TEMPORARY_PATH); FileSystem fileSys = tmpDir.getFileSystem(conf); if (!fileSys.exists(tmpDir) && !fileSys.mkdirs(tmpDir)) { LOG.error("mkdirs failed to create " + tmpDir.toString()); } } } private static void moveTaskOutputs(JobConf conf, FileSystem fs, Path jobOutputDir, Path taskOutput) throws IOException { String taskId = conf.get("mapred.task.id"); if (fs.isFile(taskOutput)) { Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf)); if (!fs.rename(taskOutput, finalOutputPath)) { if (!fs.delete(finalOutputPath, true)) { throw new IOException("Failed to delete earlier output of task: " + taskId); } if (!fs.rename(taskOutput, finalOutputPath)) { throw new IOException("Failed to save output of task: " + taskId); } } LOG.debug("Moved " + taskOutput + " to " + finalOutputPath); } else if (fs.getFileStatus(taskOutput).isDir()) { FileStatus[] paths = fs.listStatus(taskOutput); Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf)); fs.mkdirs(finalOutputPath); if (paths != null) { for (FileStatus path : paths) { moveTaskOutputs(conf, fs, jobOutputDir, path.getPath()); } } } } private static Path getFinalPath(Path jobOutputDir, Path taskOutput, Path taskOutputPath) throws IOException { URI taskOutputUri = taskOutput.toUri(); URI relativePath = taskOutputPath.toUri().relativize(taskOutputUri); if (taskOutputUri == relativePath) {//taskOutputPath is not a parent of taskOutput throw new IOException( "Can not get the relative path: base = " + taskOutputPath + " child = " + taskOutput); } if (relativePath.getPath().length() > 0) { return new Path(jobOutputDir, relativePath.getPath()); } else { return jobOutputDir; } } /** used in AWS EMR to disable temp paths on some file systems, s3. */ private static boolean writeDirectlyToWorkingPath(JobConf conf, Path path) { FileSystem fs = getFSSafe(conf, path); if (fs == null) return false; boolean result = conf.getBoolean("mapred.output.direct." + fs.getClass().getSimpleName(), false); if (result) LOG.info("output direct is enabled for this fs: " + fs.getName()); return result; } }