Java tutorial
/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.tools;

import com.google.common.io.Closeables;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Charsets;

/**
 * <p>
 * This class is a helper that copies the jars needed by a job into the DistributedCache.
 * </p>
 *
 * <p>
 * It sets up the job classpath at runtime and allows library sharing between jobs, which results in faster job
 * setup (most of the time the libs are already uploaded to HDFS). Before submitting a job, you use this tool to
 * declare the classes that your job needs.
 * </p>
 *
 * <p>
 * The tool finds the jar(s), or creates them, uploads them to a "library" path in HDFS, and writes an md5 file
 * alongside each uploaded jar.
 * </p>
 *
 * <p>
 * To locate or create the job's jar it uses a modified version of org.apache.hadoop.util.JarFinder, taken from
 * Hadoop 0.23.
 * </p>
 *
 * <p>
 * If another job needs the same jar and provides the same "library" path, the jar is discovered and reused, so no
 * time is lost re-uploading it.
 * </p>
 *
 * <p>
 * If the jar does not exist in the "library" path, it is uploaded. If the jar is already in the "library" path, the
 * tool computes the md5 of the local jar and compares it with the one found in HDFS; if they differ, the jar is
 * uploaded again.
 * </p>
 *
 * <p>
 * If it creates a jar (from the classes of the job itself, or from the classes in your workspace for example), it
 * uploads the created jar to the "library" path and cleans it up after the JVM exits.
 * </p>
 *
 * <p>
 * Here's an example for a job class TestTool.class that requires HashFunction from Guava.
 * </p>
 *
 * <pre>
 * {@code
 * new JobClasspathHelper().prepareClasspath(getConf(), new Path("/lib/path"),
 *     new Class[] { TestTool.class, HashFunction.class });
 * }
 * </pre>
 *
 * @author tbussier (tony.bussieres@ticksmith.com)
 * @since 0.3.0
 */
public class JobClasspathHelper {

  private static final Logger logger = LoggerFactory.getLogger(JobClasspathHelper.class);

  /**
   * @param conf
   *          Configuration object for the job. Used to get the FileSystem associated with it.
   * @param libDir
   *          Destination directory in the FileSystem (usually HDFS) where the libs are uploaded and looked up.
   * @param classesToInclude
   *          Classes that are needed by the job. JarFinder will look for the jar containing these classes.
   * @throws Exception
   */
  public void prepareClasspath(final Configuration conf, final Path libDir, Class<?>... classesToInclude)
      throws Exception {
    FileSystem fs = null;
    List<Class<?>> classList = new ArrayList<Class<?>>(Arrays.asList(classesToInclude));
    fs = FileSystem.get(conf);
    Map<String, String> jarMd5Map = new TreeMap<String, String>();
    // For each class we use JarFinder to locate the jar in the local classpath.
    for (Class<?> clz : classList) {
      if (clz != null) {
        String localJarPath = JarFinder.getJar(clz);
        // we don't want to upload the same jar twice
        if (!jarMd5Map.containsKey(localJarPath)) {
          // We should not push core Hadoop classes with this tool.
          // Should it be the responsibility of the developer, or do we keep this fence here?
          if (!clz.getName().startsWith("org.apache.hadoop.")) {
            // we compute the MD5 sum of the local jar
            InputStream in = new FileInputStream(localJarPath);
            boolean threw = true;
            try {
              String md5sum = DigestUtils.md5Hex(in);
              jarMd5Map.put(localJarPath, md5sum);
              threw = false;
            } finally {
              Closeables.close(in, threw);
            }
          } else {
            logger.info("Ignoring {}, since it looks like it's from Hadoop's core libs", localJarPath);
          }
        }
      }
    }

    for (Entry<String, String> entry : jarMd5Map.entrySet()) {
      Path localJarPath = new Path(entry.getKey());
      String jarFilename = localJarPath.getName();
      String localMd5sum = entry.getValue();
      logger.info("Jar {}. MD5 : [{}]", localJarPath, localMd5sum);

      Path remoteJarPath = new Path(libDir, jarFilename);
      Path remoteMd5Path = new Path(libDir, jarFilename + ".md5");

      // If the jar file or its MD5 file does not exist in HDFS, we force the upload of the jar.
      if (!fs.exists(remoteJarPath) || !fs.exists(remoteMd5Path)) {
        copyJarToHDFS(fs, localJarPath, localMd5sum, remoteJarPath, remoteMd5Path);
      } else {
        // If the jar exists, we validate the MD5 file.
        // If the MD5 sums differ, we upload the jar again.
        FSDataInputStream md5FileStream = null;
        String remoteMd5sum = "";
        try {
          md5FileStream = fs.open(remoteMd5Path);
          byte[] md5bytes = new byte[32];
          if (32 == md5FileStream.read(md5bytes)) {
            remoteMd5sum = new String(md5bytes, Charsets.UTF_8);
          }
        } finally {
          if (md5FileStream != null) {
            md5FileStream.close();
          }
        }

        if (localMd5sum.equals(remoteMd5sum)) {
          logger.info("Jar {} already exists [{}] and md5sums are equal", jarFilename,
              remoteJarPath.toUri().toASCIIString());
        } else {
          logger.info("Jar {} already exists [{}] and md5sums are different!", jarFilename,
              remoteJarPath.toUri().toASCIIString());
          copyJarToHDFS(fs, localJarPath, localMd5sum, remoteJarPath, remoteMd5Path);
        }
      }
      // In all cases we want to add the jar to the DistributedCache's classpath.
      DistributedCache.addFileToClassPath(remoteJarPath, conf, fs);
    }
    // and we create the symlink (was necessary in earlier versions of Hadoop)
    DistributedCache.createSymlink(conf);
  }

  /**
   * @param fs
   *          File system where to upload the jar.
   * @param localJarPath
   *          The local path where we find the jar.
   * @param md5sum
   *          The MD5 sum of the local jar.
   * @param remoteJarPath
   *          The remote path where to upload the jar.
   * @param remoteMd5Path
   *          The remote path where to create the MD5 file.
   * @throws IOException
   */
  private void copyJarToHDFS(FileSystem fs, Path localJarPath, String md5sum, Path remoteJarPath, Path remoteMd5Path)
      throws IOException {
    logger.info("Copying {} to {}", localJarPath.toUri().toASCIIString(), remoteJarPath.toUri().toASCIIString());
    fs.copyFromLocalFile(localJarPath, remoteJarPath);
    // create the MD5 file for this jar.
    createMd5SumFile(fs, md5sum, remoteMd5Path);

    // We need to clean up the tmp files created by JarFinder after the JVM exits.
    if (remoteJarPath.getName().startsWith(JarFinder.TMP_HADOOP)) {
      fs.deleteOnExit(remoteJarPath);
    }
    // same for the MD5 file.
    if (remoteMd5Path.getName().startsWith(JarFinder.TMP_HADOOP)) {
      fs.deleteOnExit(remoteMd5Path);
    }
  }

  /**
   * This method creates a file that contains a single line with an MD5 sum.
   *
   * @param fs
   *          FileSystem where to create the file.
   * @param md5sum
   *          The string containing the MD5 sum.
   * @param remoteMd5Path
   *          The path where to save the file.
   * @throws IOException
   */
  private void createMd5SumFile(FileSystem fs, String md5sum, Path remoteMd5Path) throws IOException {
    FSDataOutputStream os = null;
    try {
      os = fs.create(remoteMd5Path, true);
      os.writeBytes(md5sum);
      os.flush();
    } catch (Exception e) {
      logger.error("Error while creating the md5 file", e);
    } finally {
      if (os != null) {
        os.close();
      }
    }
  }
}
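
For context, here is a minimal sketch of how prepareClasspath is typically called from a Tool driver before the job is submitted, so that the DistributedCache classpath entries end up in the job's Configuration. The driver class name MyTool, the job name, and the /app/lib library path are illustrative assumptions, not part of the original source.

package com.example;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.cloudera.cdk.tools.JobClasspathHelper;

// Hypothetical driver: uploads the jar containing MyTool (and any other needed
// classes) to the shared /app/lib path before creating the Job, so the cached
// jars are already registered in the Configuration the Job is built from.
public class MyTool extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    // Upload (or reuse) the jars this job needs and add them to the classpath.
    new JobClasspathHelper().prepareClasspath(getConf(), new Path("/app/lib"), MyTool.class);

    Job job = new Job(getConf(), "my-tool");
    job.setJarByClass(MyTool.class);
    // ... configure mapper, reducer, input/output formats and paths here ...
    return job.waitForCompletion(true) ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new MyTool(), args));
  }
}

Note that prepareClasspath must run before the Job is constructed, because it records the DistributedCache entries in the Configuration that the Job copies at creation time.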