be.uantwerpen.adrem.hadoop.util.Tools.java Source code

Introduction

Here is the source code for be.uantwerpen.adrem.hadoop.util.Tools.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package be.uantwerpen.adrem.hadoop.util;

import static be.uantwerpen.adrem.util.FIMOptions.OUTPUT_DIR_KEY;
import static com.google.common.collect.Sets.newHashSet;

import java.io.IOException;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;

import be.uantwerpen.adrem.util.FIMOptions;

/**
 * Some extra utility functions for Hadoop.
 */
public class Tools {

    /**
     * Cleans the Hadoop file system by deleting the specified files if they exist.
     * 
     * @param files
     *          the files to delete
     */
    public static void cleanDirs(String... files) {
        System.out.println("[Cleaning]: Cleaning HDFS");
        Configuration conf = new Configuration();
        for (String filename : files) {
            System.out.println("[Cleaning]: Trying to delete " + filename);
            Path path = new Path(filename);
            try {
                FileSystem fs = path.getFileSystem(conf);
                if (fs.exists(path)) {
                    if (fs.delete(path, true)) {
                        System.out.println("[Cleaning]: Deleted " + filename);
                    } else {
                        System.out.println("[Cleaning]: Error while deleting " + filename);
                    }
                } else {
                    System.out.println("[Cleaning]: " + filename + " does not exist on HDFS");
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    @SuppressWarnings("rawtypes")
    public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
            Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
            Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
            Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
            Class<? extends OutputFormat> outputFormat) throws IOException {
        Job job = new Job(new Configuration());

        Configuration jobConf = job.getConfiguration();

        if (reducer.equals(Reducer.class)) {
            if (mapper.equals(Mapper.class)) {
                throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
            }
            job.setJarByClass(mapper);
        } else {
            job.setJarByClass(reducer);
        }

        job.setInputFormatClass(inputFormat);
        jobConf.set("mapred.input.dir", inputPath.toString());

        job.setMapperClass(mapper);
        if (mapperKey != null) {
            job.setMapOutputKeyClass(mapperKey);
        }
        if (mapperValue != null) {
            job.setMapOutputValueClass(mapperValue);
        }

        jobConf.setBoolean("mapred.compress.map.output", true);

        job.setReducerClass(reducer);
        job.setOutputKeyClass(reducerKey);
        job.setOutputValueClass(reducerValue);

        job.setOutputFormatClass(outputFormat);
        jobConf.set("mapred.output.dir", outputPath.toString());

        return job;
    }

    public static String getJobAbsoluteOutputDir(@SuppressWarnings("rawtypes") Context context) {
        try {
            Path path = new Path(context.getConfiguration().get(OUTPUT_DIR_KEY));
            FileSystem fs = path.getFileSystem(context.getConfiguration());
            return fs.getFileStatus(path).getPath().toString();
        } catch (IOException e) {
        }
        return "";
    }

    public static String createPath(String... parts) {
        StringBuilder path = new StringBuilder();
        for (String part : parts) {
            path.append(part);
            path.append(Path.SEPARATOR);
        }
        return path.substring(0, path.length() - 1);
    }

    public static void cleanupAfterJob(FIMOptions opt) {
        if (!opt.debug) {
            cleanupSubdirsExcept(opt.outputDir, newHashSet("fis", "shortfis"));
        }
    }

    public static void cleanupSubdirsExcept(String dir, Collection<String> toKeep) {
        Path path = new Path(dir);
        try {
            for (FileStatus fs : path.getFileSystem(new Configuration()).listStatus(path)) {
                String[] sp = fs.getPath().toString().split(Path.SEPARATOR);
                String filename = sp[sp.length - 1];
                if (toKeep.contains(filename)) {
                    cleanDirs(fs.getPath().toString() + Path.SEPARATOR + "_SUCCESS");
                    continue;
                }
                cleanDirs(fs.getPath().toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static class NameStartsWithFilter implements PathFilter {

        private final String prefix;

        public NameStartsWithFilter(String prefix) {
            this.prefix = prefix;
        }

        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(prefix);
        }
    }
}