Java tutorial: the DistCp driver class (com.inmobi.conduit.distcp.tools.DistCp)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.inmobi.conduit.distcp.tools;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Map;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.inmobi.conduit.distcp.tools.CopyListing.DuplicateFileException;
import com.inmobi.conduit.distcp.tools.CopyListing.InvalidInputException;
import com.inmobi.conduit.distcp.tools.mapred.CopyMapper;
import com.inmobi.conduit.distcp.tools.mapred.CopyOutputFormat;
import com.inmobi.conduit.distcp.tools.util.DistCpUtils;

public class DistCp extends Configured implements Tool {
  private static final Log LOG = LogFactory.getLog(DistCp.class);

  protected DistCpOptions inputOptions;
  protected Path metaFolder;

  private static final String PREFIX = "_distcp";
  private static final String WIP_PREFIX = "._WIP_";
  private static final String DISTCP_DEFAULT_XML = "distcp-default.xml";
  public static final Random rand = new Random();

  private boolean submitted;
  private FileSystem jobFS;
  private long jobExecutionTimeInNanos;

  /**
   * Public constructor. Creates a DistCp object with the specified
   * input parameters (e.g. source-paths, target-location, etc.)
   * @param configuration The Hadoop configuration against which the Copy-mapper must run.
   * @param inputOptions Options (indicating source-paths, target-location.)
   * @throws Exception on failure.
   */
  public DistCp(Configuration configuration, DistCpOptions inputOptions) throws Exception {
    Configuration config = (configuration instanceof JobConf)
        ? new JobConf(configuration)
        : new Configuration(configuration);

    // Overlay distcp-default.xml without clobbering keys the caller already set.
    Configuration defaultConf = new Configuration(false);
    defaultConf.addResource(DISTCP_DEFAULT_XML);
    for (Map.Entry<String, String> entry : defaultConf) {
      if (config.get(entry.getKey()) == null) {
        config.set(entry.getKey(), entry.getValue());
      }
    }

    setConf(config);
    this.inputOptions = inputOptions;
    this.metaFolder = createMetaFolderPath();
  }

  /**
   * To be used with the ToolRunner. Not for public consumption.
   */
  private DistCp() {}

  /**
   * Implementation of Tool::run(). Orchestrates the copy of source file(s)
   * to target location, by:
   *   1. Creating a list of files to be copied to target.
   *   2. Launching a Map-only job to copy the files. (Delegates to execute().)
   * @param argv List of arguments passed to DistCp, from the ToolRunner.
   * @return DistCpConstants.SUCCESS on success; otherwise, the matching
   *         DistCpConstants error code.
   */
  public int run(String[] argv) {
    try {
      inputOptions = OptionsParser.parse(argv);
      LOG.info("Input Options: " + inputOptions);
    } catch (Throwable e) {
      LOG.error("Invalid arguments: ", e);
      System.err.println("Invalid arguments: " + e.getMessage());
      OptionsParser.usage();
      return DistCpConstants.INVALID_ARGUMENT;
    }

    try {
      execute();
    } catch (InvalidInputException e) {
      LOG.error("Invalid input: ", e);
      return DistCpConstants.INVALID_ARGUMENT;
    } catch (DuplicateFileException e) {
      LOG.error("Duplicate files in input path: ", e);
      return DistCpConstants.DUPLICATE_INPUT;
    } catch (Exception e) {
      LOG.error("Exception encountered ", e);
      return DistCpConstants.UNKNOWN_ERROR;
    }
    return DistCpConstants.SUCCESS;
  }
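  // Illustration (not part of the original source): run()'s return value becomes
  // the process exit code via main(), so a wrapper script can branch on the
  // DistCpConstants codes. A hypothetical invocation, with placeholder jar and
  // path arguments:
  //
  //   hadoop jar <conduit-distcp-jar> com.inmobi.conduit.distcp.tools.DistCp \
  //       <source-uri> <target-uri> || echo "DistCp failed" >&2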
  /**
   * Implements the core execution. Creates the file-list for copy,
   * and launches the Hadoop job to do the copy.
   * @return Job handle
   * @throws Exception on failure.
   */
  public Job execute() throws Exception {
    assert inputOptions != null;
    assert getConf() != null;

    Job job = null;
    try {
      metaFolder = createMetaFolderPath();
      jobFS = metaFolder.getFileSystem(getConf());

      job = createJob();
      createInputFileListing(job);
      job.submit();
      submitted = true;
    } finally {
      if (!submitted) {
        cleanup();
      }
    }

    String jobID = getJobID(job);
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);

    LOG.info("DistCp job-id: " + jobID);
    LOG.info("DistCp job may be tracked at: " + job.getTrackingURL());
    LOG.info("To cancel, run the following command:\thadoop job -kill " + jobID);

    long jobStartTime = System.nanoTime();
    if (inputOptions.shouldBlock() && !job.waitForCompletion(true)) {
      updateJobTimeInNanos(jobStartTime);
      throw new IOException("DistCp failure: Job " + jobID + " has failed. ");
    }
    updateJobTimeInNanos(jobStartTime);
    return job;
  }

  public long getJobTimeInNanos() {
    return jobExecutionTimeInNanos;
  }

  private void updateJobTimeInNanos(long jobStartTime) {
    jobExecutionTimeInNanos = System.nanoTime() - jobStartTime;
  }

  /**
   * Create the Job object for submission, with all the required configuration.
   *
   * @return Reference to the job object.
   * @throws IOException Exception if any
   */
  protected Job createJob() throws IOException {
    String jobName = "distcp";
    String userChosenName = getConf().get("mapred.job.name");
    if (userChosenName != null) {
      jobName += ": " + userChosenName;
    }

    Job job = new Job(getConf(), jobName);
    job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
    job.setJarByClass(CopyMapper.class);
    configureOutputFormat(job);

    job.setMapperClass(CopyMapper.class);
    job.setReducerClass(Reducer.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(CopyOutputFormat.class);

    // Speculative map attempts would race to write the same target files; disable them.
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_NUM_MAPS,
        String.valueOf(inputOptions.getMaxMaps()));

    if (inputOptions.getSslConfigurationFile() != null) {
      setupSSLConfig(job.getConfiguration());
    }

    inputOptions.appendToConf(job.getConfiguration());
    return job;
  }
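  // Illustration (not in the original class): execute() hands back the submitted
  // Job, so when blocking is disabled a caller could monitor completion itself:
  //
  //   Job job = distCp.execute();      // submits the map-only copy job
  //   while (!job.isComplete()) {      // poll if shouldBlock() was false
  //     Thread.sleep(5000);
  //   }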
  /**
   * A hack to get the DistCp job id, since the new Job API doesn't return it.
   *
   * @param job Handle to the job
   * @return JobID
   */
  private String getJobID(Job job) {
    return new Path(job.getConfiguration().get("mapreduce.job.dir")).getName();
  }

  /**
   * Setup ssl configuration on the job configuration, to enable hsftp access
   * from the map job. Also copies the ssl configuration file to the
   * distributed cache.
   *
   * @param configuration Reference to the job's configuration handle
   * @throws java.io.IOException Exception if unable to locate the ssl config file
   */
  private void setupSSLConfig(Configuration configuration) throws IOException {
    Path sslConfigPath = new Path(
        configuration.getResource(inputOptions.getSslConfigurationFile()).toString());

    addSSLFilesToDistCache(configuration, sslConfigPath);
    configuration.set(DistCpConstants.CONF_LABEL_SSL_CONF, sslConfigPath.getName());
    configuration.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfigPath.getName());
  }

  /**
   * Add SSL files to the distributed cache: trust store, key store and
   * the ssl config xml.
   *
   * @param configuration Job configuration
   * @param sslConfigPath ssl configuration file specified through options
   * @throws IOException If any
   */
  private void addSSLFilesToDistCache(Configuration configuration,
      Path sslConfigPath) throws IOException {
    FileSystem localFS = FileSystem.getLocal(configuration);

    Configuration sslConf = new Configuration(false);
    sslConf.addResource(sslConfigPath);

    Path localStorePath = getLocalStorePath(sslConf, "ssl.client.truststore.location");
    DistributedCache.addCacheFile(localStorePath.makeQualified(localFS).toUri(),
        configuration);
    configuration.set("ssl.client.truststore.location", localStorePath.getName());

    localStorePath = getLocalStorePath(sslConf, "ssl.client.keystore.location");
    DistributedCache.addCacheFile(localStorePath.makeQualified(localFS).toUri(),
        configuration);
    configuration.set("ssl.client.keystore.location", localStorePath.getName());

    DistributedCache.addCacheFile(sslConfigPath.makeQualified(localFS).toUri(),
        configuration);
  }

  /**
   * Get the local trust store/key store path.
   *
   * @param sslConf Config from the SSL client xml
   * @param storeKey Key for either the trust store or the key store
   * @return Path where the store is present
   * @throws IOException If any
   */
  private Path getLocalStorePath(Configuration sslConf, String storeKey)
      throws IOException {
    if (sslConf.get(storeKey) != null) {
      return new Path(sslConf.get(storeKey));
    } else {
      throw new IOException("Store for " + storeKey + " is not set in "
          + inputOptions.getSslConfigurationFile());
    }
  }
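  // Illustration (assumed content, not shipped with this code): the SSL resource
  // named by DistCpOptions.getSslConfigurationFile() is an ordinary Hadoop
  // configuration XML defining the two store locations read above, e.g.:
  //
  //   <configuration>
  //     <property>
  //       <name>ssl.client.truststore.location</name>
  //       <value>/etc/security/clientcerts/truststore.jks</value>  <!-- placeholder -->
  //     </property>
  //     <property>
  //       <name>ssl.client.keystore.location</name>
  //       <value>/etc/security/clientcerts/keystore.jks</value>    <!-- placeholder -->
  //     </property>
  //   </configuration>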
  /**
   * Setup the output format appropriately.
   *
   * @param job Job handle
   * @throws IOException Exception if any
   */
  private void configureOutputFormat(Job job) throws IOException {
    final Configuration configuration = job.getConfiguration();
    Path targetPath = inputOptions.getTargetPath();
    targetPath = targetPath.makeQualified(targetPath.getFileSystem(configuration));

    if (inputOptions.shouldAtomicCommit()) {
      Path workDir = inputOptions.getAtomicWorkPath();
      if (workDir == null) {
        workDir = targetPath.getParent();
      }
      workDir = new Path(workDir, WIP_PREFIX + targetPath.getName() + rand.nextInt());

      FileSystem workFS = workDir.getFileSystem(configuration);
      FileSystem targetFS = targetPath.getFileSystem(configuration);
      if (!DistCpUtils.compareFs(targetFS, workFS)) {
        throw new IllegalArgumentException("Work path " + workDir
            + " and target path " + targetPath + " are in different file systems");
      }
      CopyOutputFormat.setWorkingDirectory(job, workDir);
    } else {
      CopyOutputFormat.setWorkingDirectory(job, targetPath);
    }
    CopyOutputFormat.setCommitDirectory(job, targetPath);

    Path counterFilePath = inputOptions.getOutPutDirectory();
    if (counterFilePath == null) {
      LOG.error("Output directory is null for distcp");
    } else {
      LOG.info("DistCp output directory path: " + counterFilePath);
      CopyOutputFormat.setOutputPath(job, counterFilePath);
    }
  }

  /**
   * Create the input listing by invoking an appropriate copy listing
   * implementation. Also adds delegation tokens for each path to the
   * job's credential store.
   *
   * @param job Handle to the job
   * @return Returns the path where the copy listing is created
   * @throws IOException If any
   */
  protected Path createInputFileListing(Job job) throws IOException {
    Path fileListingPath = getFileListingPath();
    CopyListing copyListing = CopyListing.getCopyListing(job.getConfiguration(),
        job.getCredentials(), inputOptions);
    copyListing.buildListing(fileListingPath, inputOptions);
    LOG.info("Number of paths considered for copy: " + copyListing.getNumberOfPaths());
    LOG.info("Number of bytes considered for copy: " + copyListing.getBytesToCopy()
        + " (Actual number of bytes copied depends on whether any files are "
        + "skipped or overwritten.)");
    return fileListingPath;
  }

  /**
   * Get the default name of the copy listing file. Uses the meta folder
   * to create the copy listing file.
   *
   * @return Path where the copy listing file has to be saved
   * @throws IOException Exception if any
   */
  protected Path getFileListingPath() throws IOException {
    String fileListPathStr = metaFolder + "/fileList.seq";
    Path path = new Path(fileListPathStr);
    return new Path(path.toUri().normalize().toString());
  }

  /**
   * Create a default working folder for the job, under the
   * job staging directory.
   *
   * @return Returns the working folder information
   * @throws Exception Exception if any
   */
  private Path createMetaFolderPath() throws Exception {
    Configuration configuration = getConf();
    Path stagingDir = getStagingPath(configuration);
    Path metaFolderPath = new Path(stagingDir, PREFIX + String.valueOf(rand.nextInt()));
    if (LOG.isDebugEnabled()) {
      LOG.debug("Meta folder location: " + metaFolderPath);
    }
    configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString());
    return metaFolderPath;
  }

  private Path getStagingPath(Configuration configuration) {
    try {
      LOG.debug("Trying to get staging path using hadoop-2");
      Class<?> clusterClass = DistCp.class.getClassLoader()
          .loadClass("org.apache.hadoop.mapreduce.Cluster");
      Method method = JobSubmissionFiles.class.getMethod("getStagingDir",
          clusterClass, Configuration.class);
      Constructor<?> constructor = clusterClass.getConstructor(Configuration.class);
      return (Path) method.invoke(null,
          constructor.newInstance(configuration), configuration);
    } catch (Exception ignored) {
      // fall back to the hadoop-1 API
    }

    try {
      LOG.debug("Trying to get staging path using hadoop-1");
      Method method = JobSubmissionFiles.class.getMethod("getStagingDir",
          JobClient.class, Configuration.class);
      return (Path) method.invoke(null,
          new JobClient(new JobConf(configuration)), configuration);
    } catch (Exception ignored) {
      // do nothing
    }

    throw new RuntimeException("Either hadoop-1 or hadoop-2 must be in the classpath");
  }
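  // Illustration (not in the original file): the reflection in getStagingPath()
  // resolves to the equivalent of one of these direct calls, depending on which
  // Hadoop generation is on the classpath:
  //
  //   // hadoop-2:
  //   JobSubmissionFiles.getStagingDir(new Cluster(conf), conf);
  //   // hadoop-1:
  //   JobSubmissionFiles.getStagingDir(new JobClient(new JobConf(conf)), conf);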
  /**
   * Main function of the DistCp program. Parses the input arguments (via
   * OptionsParser), and invokes the DistCp::run() method, via the ToolRunner.
   * @param argv Command-line arguments sent to DistCp.
   */
  public static void main(String[] argv) {
    int exitCode = DistCpConstants.UNKNOWN_ERROR;
    try {
      DistCp distCp = new DistCp();
      Cleanup cleanup = new Cleanup(distCp);
      Runtime.getRuntime().addShutdownHook(cleanup);
      exitCode = ToolRunner.run(getDefaultConf(), distCp, argv);
    } catch (Exception e) {
      LOG.error("Couldn't complete DistCp operation: ", e);
    }
    if (exitCode != DistCpConstants.SUCCESS) {
      System.exit(exitCode);
    }
  }

  /**
   * Loads properties from distcp-default.xml into the configuration object.
   * @return Configuration which includes properties from distcp-default.xml
   */
  private static Configuration getDefaultConf() {
    Configuration config = new Configuration();

    // Propagate properties related to delegation tokens.
    String tokenFile = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
    if (tokenFile != null) {
      config.set("mapreduce.job.credentials.binary", tokenFile);
    }

    config.addResource(DISTCP_DEFAULT_XML);
    return config;
  }

  private synchronized void cleanup() {
    try {
      if (metaFolder == null) {
        return;
      }
      jobFS.delete(metaFolder, true);
      metaFolder = null;
    } catch (IOException e) {
      LOG.error("Unable to cleanup meta folder: " + metaFolder, e);
    }
  }

  private boolean isSubmitted() {
    return submitted;
  }

  /**
   * Shutdown hook: removes the meta folder if the job was never submitted.
   */
  private static class Cleanup extends Thread {
    private final DistCp distCp;

    public Cleanup(DistCp distCp) {
      this.distCp = distCp;
    }

    @Override
    public void run() {
      if (distCp.isSubmitted()) {
        return;
      }
      distCp.cleanup();
    }
  }
}
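To round out the tutorial, here is a minimal sketch of driving the class from another program through its public constructor rather than the command line. It assumes DistCpOptions exposes a (sourcePaths, targetPath) constructor, as upstream Hadoop DistCp does, and the cluster URIs are placeholders.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

import com.inmobi.conduit.distcp.tools.DistCp;
import com.inmobi.conduit.distcp.tools.DistCpConstants;
import com.inmobi.conduit.distcp.tools.DistCpOptions;

public class DistCpExample {
  public static void main(String[] args) throws Exception {
    // Assumed constructor shape; source and target URIs are placeholders.
    DistCpOptions options = new DistCpOptions(
        Arrays.asList(new Path("hdfs://source-nn/data")),
        new Path("hdfs://target-nn/backup"));

    DistCp distCp = new DistCp(new Configuration(), options);
    Job job = distCp.execute();  // submits; blocks when shouldBlock() is true

    // execute() stores the job id under this label before returning.
    String jobId = job.getConfiguration().get(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID);
    System.out.println("DistCp job " + jobId + " took "
        + distCp.getJobTimeInNanos() + " ns");
  }
}

Note that the no-argument constructor used by main() is private by design, so external callers must go through the public (Configuration, DistCpOptions) constructor shown here or shell out to the command-line entry point.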