org.apache.blur.mapreduce.lib.BlurMapReduceUtil.java Source code

Introduction

Here is the source code for org.apache.blur.mapreduce.lib.BlurMapReduceUtil.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.blur.mapreduce.lib;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.lucene.security.DocumentVisibility;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;

/**
 * Utility methods that locate the jar files containing given classes and
 * record them in a MapReduce job configuration so they are shipped with the
 * job. This utility code was taken from HBase.
 */
public class BlurMapReduceUtil {

    private static final Log LOG = LogFactory.getLog(BlurMapReduceUtil.class);

    /** Configuration key under which the jars to ship with the job are recorded. */
    private static final String TMP_JARS = "tmpjars";

    /**
     * Add the Blur dependency jars as well as jars for any of the configured job
     * classes to the job configuration, so that JobClient will ship them to the
     * cluster and add them to the DistributedCache.
     *
     * @param job
     *          the job whose configuration is updated.
     * @throws IOException
     *           if the local file system cannot be accessed, or if one of the
     *           classes configured on the job cannot be loaded (the
     *           {@link ClassNotFoundException} is wrapped).
     */
    public static void addDependencyJars(Job job) throws IOException {
        try {
            addDependencyJars(job.getConfiguration(), org.apache.zookeeper.ZooKeeper.class,
                    job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                    job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                    job.getPartitionerClass(), job.getCombinerClass(), DocumentVisibility.class);
            addAllJarsInBlurLib(job.getConfiguration());
        } catch (ClassNotFoundException e) {
            throw new IOException(e);
        }
    }

    /**
     * Adds all the class path entries that live under the same path as the blur
     * jar files to the "tmpjars" setting of the given configuration. The blur
     * directory is taken from the first class path entry whose file name starts
     * with "blur-"; if there is no such entry the configuration is left
     * untouched.
     *
     * @param conf
     *          the configuration to update.
     * @throws IOException
     *           if the local file system cannot be accessed.
     */
    public static void addAllJarsInBlurLib(Configuration conf) throws IOException {
        FileSystem localFs = FileSystem.getLocal(conf);
        Set<String> jars = new HashSet<String>();
        // Keep the jars that are already registered.
        jars.addAll(conf.getStringCollection(TMP_JARS));

        String classPath = System.getProperty("java.class.path");
        if (classPath == null) {
            return;
        }
        // Split on the platform separator rather than a hard-coded ':' so that
        // class paths using a different separator are parsed correctly.
        String[] files = classPath.split(Pattern.quote(File.pathSeparator));

        String blurLibPath = getPath("blur-", files);
        if (blurLibPath == null) {
            return;
        }
        for (String pathStr : getPathes(blurLibPath, files)) {
            Path path = new Path(pathStr);
            if (!localFs.exists(path)) {
                LOG.warn("Could not validate jar file " + path);
                continue;
            }
            jars.add(path.makeQualified(localFs.getUri(), localFs.getWorkingDirectory()).toString());
        }
        if (jars.isEmpty()) {
            return;
        }
        conf.set(TMP_JARS, StringUtils.arrayToString(jars.toArray(new String[0])));
    }

    /**
     * Selects the class path entries that start with the given path.
     *
     * @param path
     *          the prefix every returned entry starts with.
     * @param files
     *          the class path entries to filter.
     * @return the matching entries, in their original order; never null.
     */
    private static List<String> getPathes(String path, String[] files) {
        List<String> pathes = new ArrayList<String>();
        for (String file : files) {
            if (file.startsWith(path)) {
                pathes.add(file);
            }
        }
        return pathes;
    }

    /**
     * Finds the directory of the first class path entry whose file name starts
     * with the given prefix.
     *
     * @param startsWith
     *          the file name prefix to look for.
     * @param files
     *          the class path entries to search.
     * @return the entry's path up to (not including) its last separator, or null
     *         if no entry with a directory component matches.
     */
    private static String getPath(String startsWith, String[] files) {
        for (String file : files) {
            // Accept both '/' and the platform separator as directory delimiters.
            int lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf(File.separatorChar));
            if (lastSeparator < 0) {
                // A bare file name has no directory part to report; taking
                // substring(0, -1) here would throw StringIndexOutOfBoundsException.
                continue;
            }
            String fileName = file.substring(lastSeparator + 1);
            if (fileName.startsWith(startsWith)) {
                return file.substring(0, lastSeparator);
            }
        }
        return null;
    }

    /**
     * Add the jars containing the given classes to the job's configuration such
     * that JobClient will ship them to the cluster and add them to the
     * DistributedCache.
     *
     * @param conf
     *          the configuration to update.
     * @param classes
     *          the classes whose jars should be added; null elements are
     *          skipped.
     * @throws IOException
     *           if the local file system cannot be accessed or locating a jar
     *           fails.
     */
    public static void addDependencyJars(Configuration conf, Class<?>... classes) throws IOException {
        FileSystem localFs = FileSystem.getLocal(conf);
        Set<String> jars = new HashSet<String>();
        // Add jars that are already in the tmpjars variable
        jars.addAll(conf.getStringCollection(TMP_JARS));

        // Add jars containing the specified classes
        for (Class<?> clazz : classes) {
            if (clazz == null) {
                continue;
            }

            String pathStr = findOrCreateJar(clazz);
            if (pathStr == null) {
                LOG.warn("Could not find jar for class " + clazz + " in order to ship it to the cluster.");
                continue;
            }
            Path path = new Path(pathStr);
            if (!localFs.exists(path)) {
                LOG.warn("Could not validate jar file " + path + " for class " + clazz);
                continue;
            }
            jars.add(path.makeQualified(localFs.getUri(), localFs.getWorkingDirectory()).toString());
        }
        if (jars.isEmpty()) {
            return;
        }

        conf.set(TMP_JARS, StringUtils.arrayToString(jars.toArray(new String[0])));
    }

    /**
     * If org.apache.hadoop.util.JarFinder is available (0.23+ hadoop), finds the
     * Jar for a class or creates it if it doesn't exist. If the class is in a
     * directory in the classpath, it creates a Jar on the fly with the contents
     * of the directory and returns the path to that Jar. If a Jar is created, it
     * is created in the system temporary directory.
     *
     * Otherwise, returns an existing jar that contains a class of the same name.
     *
     * @param clazz
     *          the class to find.
     * @return a jar file that contains the class, or null.
     * @throws IOException
     *           if JarFinder.getJar was invoked and threw an exception (the
     *           cause is wrapped).
     */
    private static String findOrCreateJar(Class<?> clazz) throws IOException {
        try {
            Class<?> jarFinder = Class.forName("org.apache.hadoop.util.JarFinder");
            // hadoop-0.23 has a JarFinder class that will create the jar
            // if it doesn't exist. Note that this is needed to run the mapreduce
            // unit tests post-0.23, because mapreduce v2 requires the relevant jars
            // to be in the mr cluster to do output, split, etc. At unit test time,
            // the hbase jars do not exist, so we need to create some. Note that we
            // can safely fall back to findContainingJars for pre-0.23 mapreduce.
            Method m = jarFinder.getMethod("getJar", Class.class);
            return (String) m.invoke(null, clazz);
        } catch (InvocationTargetException ite) {
            // function was properly called, but threw its own exception
            throw new IOException(ite.getCause());
        } catch (Exception ignored) {
            // Deliberately ignored: any other failure means the reflective lookup
            // of JarFinder did not work, so fall through to the legacy lookup.
        }

        LOG.debug("New JarFinder: org.apache.hadoop.util.JarFinder.getJar "
                + "not available.  Using old findContainingJar");
        return findContainingJar(clazz);
    }

    /**
     * Find a jar that contains a class of the same name, if any. It will return a
     * jar file, even if that is not the first thing on the class path that has a
     * class with the same name.
     *
     * This is shamelessly copied from JobConf
     *
     * @param clazz
     *          the class to find.
     * @return a jar file that contains the class, or null if none is found or
     *         the class has no class loader to query.
     * @throws RuntimeException
     *           wrapping the IOException raised while enumerating or decoding
     *           the class resources.
     */
    private static String findContainingJar(Class<?> clazz) {
        ClassLoader loader = clazz.getClassLoader();
        if (loader == null) {
            // getClassLoader() may return null (e.g. for classes loaded by the
            // bootstrap loader); there is no jar to ship in that case.
            return null;
        }
        String classFile = clazz.getName().replace('.', '/') + ".class";
        try {
            for (Enumeration<URL> itr = loader.getResources(classFile); itr.hasMoreElements();) {
                URL url = itr.nextElement();
                if ("jar".equals(url.getProtocol())) {
                    String toReturn = url.getPath();
                    if (toReturn.startsWith("file:")) {
                        toReturn = toReturn.substring("file:".length());
                    }
                    // URLDecoder is a misnamed class, since it actually decodes
                    // x-www-form-urlencoded MIME type rather than actual
                    // URL encoding (which the file path has). Therefore it would
                    // decode +s to ' 's which is incorrect (spaces are actually
                    // either unencoded or encoded as "%20"). Replace +s first, so
                    // that they are kept sacred during the decoding process.
                    toReturn = toReturn.replace("+", "%2B");
                    toReturn = URLDecoder.decode(toReturn, "UTF-8");
                    // Strip the "!/path/inside/jar" suffix, leaving the jar file path.
                    return toReturn.replaceAll("!.*$", "");
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return null;
    }
}