com.moz.fiji.mapreduce.DistributedCacheJars.java Source code

Java tutorial

Introduction

Here is the source code for com.moz.fiji.mapreduce.DistributedCacheJars.java

Source

/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.moz.fiji.mapreduce;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.moz.fiji.annotations.ApiAudience;
import com.moz.fiji.annotations.ApiStability;

/**
 * Utility class for dealing with Java JAR files and the hadoop distributed cache.
 *
 * <p>All methods are static. Jars are registered by appending their fully-qualified paths to the
 * comma-separated {@code tmpjars} property of a job's {@link Configuration}.
 */
@ApiAudience.Public
@ApiStability.Evolving
public final class DistributedCacheJars {
    private static final Logger LOG = LoggerFactory.getLogger(DistributedCacheJars.class);

    /**
     * Configuration variable name to store jars that export to distributed cache.
     *
     * The value associated to this variable is a comma-separated list of jar file paths.
     *
     * @see org.apache.hadoop.mapred.JobClient
     */
    private static final String CONF_TMPJARS = "tmpjars";

    /** Separator between jar paths in the {@link #CONF_TMPJARS} value. */
    private static final String JAR_SEPARATOR = ",";

    /** No constructor for this utility class. */
    private DistributedCacheJars() {
    }

    /**
     * Fully qualifies a path using the file system the path resolves to under a configuration.
     *
     * @param path Path to qualify.
     * @param conf Configuration according to which qualification happens.
     * @return fully-qualified path.
     * @throws IOException on I/O error.
     */
    private static Path qualify(Path path, Configuration conf) throws IOException {
        return path.getFileSystem(conf).makeQualified(path);
    }

    /**
     * Fully qualifies a path if necessary.
     *
     * If path is not fully qualified, the default FS from the given configuration is used to
     * qualify the path.
     *
     * @param pathStr Path to qualify.
     * @param conf Configuration according to which qualification happens.
     * @return fully-qualified path.
     * @throws IOException on I/O error.
     */
    private static Path qualifiedPathFromString(String pathStr, Configuration conf) throws IOException {
        return qualify(new Path(pathStr), conf);
    }

    /**
     * Adds the jars from a directory into the distributed cache of a job.
     *
     * @param job The job to configure.
     * @param jarDirectory A path to a directory of jar files; qualified against the job's
     *     configuration if it is not already fully qualified.
     * @throws IOException on I/O error.
     */
    public static void addJarsToDistributedCache(Job job, String jarDirectory) throws IOException {
        addJarsToDistributedCache(job, qualifiedPathFromString(jarDirectory, job.getConfiguration()));
    }

    /**
     * Adds the jars from a local directory into the distributed cache of a job.
     *
     * @param job The job to configure.
     * @param jarDirectory A local directory of jar files; its canonical path is used with the
     *     {@code file:} scheme.
     * @throws IOException on I/O error.
     */
    public static void addJarsToDistributedCache(Job job, File jarDirectory) throws IOException {
        addJarsToDistributedCache(job, new Path("file:" + jarDirectory.getCanonicalPath()));
    }

    /**
     * Adds the jars from a directory into the distributed cache of a job.
     *
     * @param job The job to configure.
     * @param jarDirectory A path to a directory of jar files. Must not be null.
     * @throws IOException on I/O error, or if {@code jarDirectory} is not a directory.
     * @throws IllegalArgumentException if {@code jarDirectory} is null.
     */
    public static void addJarsToDistributedCache(Job job, Path jarDirectory) throws IOException {
        if (null == jarDirectory) {
            throw new IllegalArgumentException("Jar directory may not be null");
        }
        addJarsToDistributedCache(job, listJarFilesFromDirectory(job.getConfiguration(), jarDirectory));
    }

    /**
     * Adds the jar files into the distributed cache of a job.
     *
     * <p>The jars already listed in the job's configuration are kept and listed first; the given
     * jars are qualified and appended. The combined list is then de-duplicated by file name, so
     * when two jars share a file name the one listed earlier wins.
     *
     * <p>If the resulting list is empty, the configuration is left untouched.
     *
     * @param job The job to configure.
     * @param jarFiles Collection of jar files to add. Must not be null.
     * @throws IOException on I/O error.
     * @throws IllegalArgumentException if {@code jarFiles} is null.
     */
    public static void addJarsToDistributedCache(Job job, Collection<Path> jarFiles) throws IOException {
        if (null == jarFiles) {
            throw new IllegalArgumentException("Jar files may not be null");
        }
        final Configuration conf = job.getConfiguration();

        // Get existing jars named in configuration.
        final List<Path> allJars = Lists.newArrayList(getJarsFromConfiguration(conf));

        // Append the requested jars, fully qualified.
        for (Path path : jarFiles) {
            final Path qualifiedPath = qualify(path, conf);
            LOG.debug("Adding jar {}, fully qualified as {}", path, qualifiedPath);
            allJars.add(qualifiedPath);
        }

        // De-duplicate the list of jar files, based on their names:
        final Collection<Path> deDupedJars = deDuplicateFilenames(allJars);
        if (deDupedJars.isEmpty()) {
            // Do not write an empty value: a consumer that splits this property on ',' and builds
            // a Path for each entry would be handed an empty path string, which Path rejects.
            // NOTE(review): confirm against the job-submission code of the targeted Hadoop version.
            return;
        }
        conf.set(CONF_TMPJARS, StringUtils.join(deDupedJars, JAR_SEPARATOR));
    }

    /**
     * Lists all jars in the variable tmpjars of this Configuration.
     *
     * <p>Empty entries (for example those produced by a leading or doubled comma) are skipped.
     *
     * @param conf The Configuration to get jar names from
     * @return A list of jars; empty if the variable is unset or empty.
     */
    public static List<Path> getJarsFromConfiguration(Configuration conf) {
        final String existingJars = conf.get(CONF_TMPJARS);
        if ((null == existingJars) || existingJars.isEmpty()) {
            return Collections.emptyList();
        }
        final List<Path> jarFiles = Lists.newArrayList();
        for (String jarPath : existingJars.split(JAR_SEPARATOR)) {
            if (jarPath.isEmpty()) {
                // A Path cannot be built from an empty string; ignore stray separators.
                continue;
            }
            jarFiles.add(new Path(jarPath));
        }
        return jarFiles;
    }

    /**
     * Lists all jars in the specified directory.
     *
     * <p>Only direct children that are not directories and whose name ends with {@code .jar} are
     * returned; sub-directories are not traversed.
     *
     * @param conf Configuration to get FileSystem from
     * @param jarDirectory The directory of jars to get.
     * @return A list of qualified paths to the jars in jarDirectory.
     * @throws IOException if jarDirectory is not a directory, or on I/O error.
     */
    public static Collection<Path> listJarFilesFromDirectory(Configuration conf, Path jarDirectory)
            throws IOException {
        LOG.debug("Listing jar files {}/*.jar", jarDirectory);
        final FileSystem fs = jarDirectory.getFileSystem(conf);
        if (!fs.isDirectory(jarDirectory)) {
            throw new IOException("Attempted to add jars from non-directory: " + jarDirectory);
        }
        final List<Path> jarFiles = Lists.newArrayList();
        for (FileStatus status : fs.listStatus(jarDirectory)) {
            if (!status.isDir() && status.getPath().getName().endsWith(".jar")) {
                jarFiles.add(fs.makeQualified(status.getPath()));
            }
        }
        return jarFiles;
    }

    /**
     * Removes files whose names are duplicated in a given collection.
     *
     * <p>Only the final path component is compared. For each distinct file name the first path
     * encountered is kept, and the relative order of the kept paths is preserved.
     *
     * @param jarFiles Collection of .jar files to de-duplicate.
     * @return De-duplicated collection of .jar files.
     */
    public static List<Path> deDuplicateFilenames(Iterable<Path> jarFiles) {
        final Set<String> jarFileNames = Sets.newHashSet();
        final List<Path> uniqueFiles = Lists.newArrayList();
        for (Path jarFile : jarFiles) {
            // Set.add returns false when the name was already seen.
            if (jarFileNames.add(jarFile.getName())) {
                uniqueFiles.add(jarFile);
            }
        }
        return uniqueFiles;
    }
}