org.mitre.ccv.mapred.InvertKmerProbabilities.java Source code

Java tutorial

Introduction

Here is the source code for org.mitre.ccv.mapred.InvertKmerProbabilities.java

Source

/**
 * Created on March 26, 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you  may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.ccv.mapred;

import java.io.IOException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.mitre.mapred.fs.FileUtils;
import org.mitre.ccv.mapred.io.KmerProbabilityMapWritable;
import org.mitre.ccv.mapred.io.KmerProbabilityWritable;

/**
 * Map-Reduce class for inverting the parents of k-mer probabilities, such as
 * those generated by {@link CalculateKmerProbabilities}, from a {@link SequenceFile}
 * with k-mer ({@link Text}) keys and {@link KmerProbabilityWritable} values to
 * a SequenceFile with the parent of the k-mer being the new key and a
 * {@link KmerProbabilityMapWritable} of k-mer values (probabilities) as the value.
 *
 * @author Marc Colosimo
 */
public class InvertKmerProbabilities extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(InvertKmerProbabilities.class);

    /**
     * Mapper that inverts k-mer records.
     * <p>
     * For each input k-mer it emits:
     * <ul>
     *   <li>one record keyed by the k-mer itself, whose map is built from the
     *       k-mer and its frequency; and</li>
     *   <li>one record per element of {@link KmerProbabilityWritable#getParents()},
     *       keyed by that parent, whose map holds the input k-mer and its
     *       frequency.</li>
     * </ul>
     * Records sharing a key are intended to be merged downstream by
     * {@link InverterReducer}.
     */
    public static class InverterMapper extends MapReduceBase
            implements Mapper<Text, KmerProbabilityWritable, Text, KmerProbabilityMapWritable> {

        /** Output key reused for every parent record emitted by {@link #map}. */
        private final Text kmerKey = new Text();

        /**
         * Emits the k-mer's own record followed by one record per parent.
         *
         * @param key       the k-mer
         * @param value     the k-mer's frequency and parents
         * @param output    collector receiving the inverted records
         * @param reporter  receives a status line describing the current k-mer
         * @throws IOException if the collector fails
         */
        @Override
        public void map(Text key, KmerProbabilityWritable value,
                OutputCollector<Text, KmerProbabilityMapWritable> output, Reporter reporter) throws IOException {

            final String kmer = key.toString();

            // Record for the k-mer itself, carrying its own frequency.
            KmerProbabilityMapWritable self = new KmerProbabilityMapWritable(kmer, value.getFrequency());
            reporter.setStatus(String.format("Inverting %s (%f) with %d parents\n", kmer,
                    value.getFrequency(), value.getParents().size()));
            output.collect(key, self);

            // One record per parent: the parent becomes the key and the current
            // k-mer (with its frequency) becomes an entry in the parent's map.
            Iterator<String> parents = value.getParents().iterator();
            while (parents.hasNext()) {
                String parent = parents.next();

                KmerProbabilityMapWritable inverted = new KmerProbabilityMapWritable();
                inverted.setKey(parent);
                kmerKey.set(parent);

                inverted.set(kmer, value.getFrequency());
                output.collect(kmerKey, inverted);
            }
        }
    }

    /**
     * Merges all maps received for a key into a single map. Written so it can
     * serve as both the combiner and the reducer: input and output types are
     * identical.
     */
    public static class InverterReducer extends MapReduceBase
            implements Reducer<Text, KmerProbabilityMapWritable, Text, KmerProbabilityMapWritable> {

        /**
         * Copies every entry of every incoming map into one new map keyed by
         * {@code key} and emits that map once.
         *
         * @param key       the k-mer shared by all {@code values}
         * @param values    the maps emitted for this k-mer
         * @param output    collector receiving the merged map
         * @param reporter  receives a status line naming the current key
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(Text key, Iterator<KmerProbabilityMapWritable> values,
                OutputCollector<Text, KmerProbabilityMapWritable> output, Reporter reporter) throws IOException {

            final String kmer = key.toString();
            reporter.setStatus("Reducing " + kmer);

            KmerProbabilityMapWritable merged = new KmerProbabilityMapWritable();
            merged.setKey(kmer);

            while (values.hasNext()) {
                Iterator<Entry<String, Double>> entries = values.next().get().entrySet().iterator();
                while (entries.hasNext()) {
                    Entry<String, Double> entry = entries.next();
                    merged.set(entry.getKey(), entry.getValue());
                }
            }

            output.collect(key, merged);
        }
    }

    /**
     * Configures the inversion job on a copy of {@code jobConf}, submits it via
     * {@link JobClient#runJob(JobConf)} and optionally deletes the
     * {@code _logs} directory under the output path afterwards.
     *
     * @param jobConf       the {@link JobConf} to base the job configuration on
     * @param input         path to the input {@code SequenceFile}s
     * @param output        path to save the output
     * @param cleanLogs     if <code>true</code>, remove {@code <output>/_logs}
     *                      once the job has run
     * @return always {@code 0}; failures are reported by exception
     * @throws java.lang.Exception if job submission or log removal fails
     */
    public int initJob(JobConf jobConf, String input, String output, boolean cleanLogs) throws Exception {
        JobConf conf = new JobConf(jobConf, InvertKmerProbabilities.class);
        conf.setJobName("InvertKmerFrequencies");

        // Input side: sequence files fed to the inverting mapper.
        Path inputPath = new Path(input);
        SequenceFileInputFormat.setInputPaths(conf, inputPath);
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapperClass(InverterMapper.class);

        // Key/value classes of the final output.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(KmerProbabilityMapWritable.class);

        // The same class merges maps both map-side (combiner) and reduce-side.
        conf.setCombinerClass(InverterReducer.class);
        conf.setReducerClass(InverterReducer.class);

        // Output side: sequence files written under the requested path.
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        Path outputPath = new Path(output);
        SequenceFileOutputFormat.setOutputPath(conf, outputPath);

        JobClient.runJob(conf);

        if (cleanLogs) {
            LOG.info("removing log directory");
            Path logDir = new Path(output, "_logs");
            logDir.getFileSystem(jobConf).delete(logDir, true);
        }
        return 0;
    }

    /**
     * Prints the command-line synopsis to standard output.
     *
     * @return always {@code -1}, so callers can write {@code return printUsage();}
     */
    static int printUsage() {
        final String usage =
                "InvertKmerFrequencies [-libjars <classpath,...>] [-m <maps>] [-r <reduces>] [-c] <input> <output>";
        System.out.println(usage);
        return -1;
    }

    /**
     * Parses the command line and launches the job through {@link #initJob}.
     * <p>
     * Recognised options:
     * <ul>
     *   <li>{@code -m <maps>} sets the number of map tasks</li>
     *   <li>{@code -r <reduces>} sets the number of reduce tasks</li>
     *   <li>{@code -c} removes the job's log directory afterwards</li>
     *   <li>{@code -libjars <classpath,...>} stores the validated jar list under
     *       {@code tmpjars} and adds the jars to both the configuration's and
     *       the current thread's class loaders</li>
     * </ul>
     * Exactly two further arguments are required: the input and output paths.
     * <p>
     * Argument errors are detected with explicit checks rather than by catching
     * {@link ArrayIndexOutOfBoundsException} / {@link NumberFormatException}
     * around the whole option chain, so an unrelated exception thrown while
     * processing {@code -libjars} propagates instead of being misreported as a
     * usage error.
     *
     * @param args command-line arguments
     * @return the result of {@link #initJob}, or {@code -1} (after printing the
     *         usage) when the arguments are invalid
     * @throws Exception if libjar validation or the job itself fails
     */
    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf());
        boolean cleanLogs = false;

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            final String arg = args[i];
            final boolean isTaskCount = "-m".equals(arg) || "-r".equals(arg);
            final boolean isLibJars = "-libjars".equals(arg);

            // Options that take a value must not be the last token.
            if ((isTaskCount || isLibJars) && i + 1 >= args.length) {
                System.out.println("ERROR: Required parameter missing from " + arg);
                return printUsage();
            }

            if (isTaskCount) {
                final String raw = args[++i];
                int count;
                try {
                    count = Integer.parseInt(raw);
                } catch (NumberFormatException except) {
                    System.out.println("ERROR: Integer expected instead of " + raw);
                    return printUsage();
                }
                if ("-m".equals(arg)) {
                    conf.setNumMapTasks(count);
                } else {
                    conf.setNumReduceTasks(count);
                }
            } else if ("-c".equals(arg)) {
                cleanLogs = true;
            } else if (isLibJars) {
                conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf));

                URL[] libjars = FileUtils.getLibJars(conf);
                if (libjars != null && libjars.length > 0) {
                    // Add libjars to client/tasks classpath
                    conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader()));
                    // Adds libjars to our classpath
                    Thread.currentThread().setContextClassLoader(
                            new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader()));
                }
            } else {
                otherArgs.add(arg);
            }
        }

        // Make sure there are exactly 2 parameters left.
        if (otherArgs.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
            return printUsage();
        }

        return initJob(conf, otherArgs.get(0), otherArgs.get(1), cleanLogs);
    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner} with a
     * fresh {@link Configuration} and exits the JVM with the tool's return code.
     *
     * @param args command-line arguments, passed to {@link ToolRunner}
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new InvertKmerProbabilities(), args));
    }
}