org.mitre.ccv.mapred.CalculateKmerCounts.java Source code

Java tutorial

Introduction

Here is the source code for org.mitre.ccv.mapred.CalculateKmerCounts.java

Source

/**
 * Created on March 25, 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you  may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions andlimitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.ccv.mapred;

import java.io.IOException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.mitre.mapred.fs.FileUtils;
import org.mitre.ccv.mapred.io.KmerCountWritable;

/**
 * Map-reduce class that calculates the counts of k-mers (n-grams) from all of the sequences.
 * <p>This generates k-mers for start - 2 to end.</p>
 *
 * By setting the system property (-D) "kmer.count.parent.fast.map" to "true" (or "T") this will use an internal {@link java.util.Map}.
 * Note that the MapTask might run out of memory.
 * 
 * @author Marc Colosimo
 */
public class CalculateKmerCounts extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(CalculateKmerCounts.class);
    public static final int DEFAULT_START = 3;
    public static final int DEFAULT_END = 9;
    public static final String START = "kmer.window.start.size";
    public static final String END = "kmer.window.end.size";
    public static final String LENGTH = "total.sequence.length";
    public static final String FAST_MAP = "kmer.count.parent.fast.map";

    /**
     * Mapper that generate the k-mer counts and emits their parents.
     *
     * <P>If <code>FAST_MAP</code> is set then this will use a HashMap to limit
     * the childern of a k-mer at the expense of local memory.</P>
     */
    public static class KmerCountMap extends MapReduceBase implements Mapper<Text, Text, Text, KmerCountWritable> {

        private int start;
        private int end;
        private Text kmerText = new Text();
        private KmerCountWritable kmerCnts = new KmerCountWritable();
        private HashMap<String, Integer> kmap = null;
        private Integer kmapValue = new Integer(1);

        @Override
        public void configure(JobConf job) {
            super.configure(job);
            start = Integer.parseInt(job.get(START));
            end = Integer.parseInt(job.get(END));

            String fast = job.get(FAST_MAP, "F");
            if (fast.equalsIgnoreCase("T") || fast.equalsIgnoreCase("true")) {
                kmap = new HashMap<String, Integer>();
                LOG.info("Using fast maps");
            } else {
                kmap = null;
            }
            LOG.debug("Configure: " + job);
        }

        /**
         * Generates kmer counts and their parents between the given start/end.
         * 
         * @param key           Sequence name
         * @param value         The sequence
         * @param output        output collector
         * @param reporter      reporter
         * @throws java.io.IOException
         */
        @Override
        public void map(Text key, Text value, OutputCollector<Text, KmerCountWritable> output, Reporter reporter)
                throws IOException {

            String seq = value.toString();

            /**
             * CalculateCompostionVectors checks the sizes (actually IndexedCompositionDistribution does)
             */
            if (this.end > seq.length()) {
                LOG.info(String.format("%s length (%d) is smaller than the end window size (%d). Skipping..",
                        key.toString(), seq.length(), this.end));
                return;
            }
            for (int windowSize = this.start - 2; windowSize <= this.end; windowSize++) {
                reporter.setStatus(String.format("%s: window size of %d\n", key.toString(), windowSize));
                //System.err.printf("%s: window size of %d\n", key.toString(), windowSize);
                for (int i = 0; i < seq.length() - windowSize + 1; i++) {
                    String kmer = seq.substring(i, i + windowSize);
                    kmerText.set(kmer);
                    kmerCnts.set(kmer, 1);
                    output.collect(kmerText, kmerCnts);

                    // now check to see if this can be a parent
                    if (kmer.length() >= this.start) {

                        if (this.kmap != null && this.kmap.containsKey(kmer)) {
                            //System.out.println("Seen: " + kmer);
                            continue;
                        }
                        if (this.kmap != null) {
                            this.kmap.put(kmer, kmapValue);
                            //System.out.println("New: " + kmer);
                        }

                        // Get a1..ak-1
                        String subst = kmer.substring(0, kmer.length() - 1);

                        kmerText.set(subst);
                        kmerCnts.set(subst); // count of zero !
                        kmerCnts.addParent(kmer);
                        output.collect(kmerText, kmerCnts);

                        // Get a2..ak
                        subst = kmer.substring(1, kmer.length());
                        kmerText.set(subst);
                        kmerCnts.set(subst); // count of zero !
                        kmerCnts.addParent(kmer);
                        output.collect(kmerText, kmerCnts);

                        // Get a2..ak-1
                        subst = kmer.substring(1, kmer.length() - 1);
                        kmerText.set(subst);
                        kmerCnts.set(subst); // count of zero !
                        kmerCnts.addParent(kmer);
                        output.collect(kmerText, kmerCnts);
                    }
                }
            }
        }
    }

    /**
     * Both a combiner and reducer for summing the counts and combining the parents.
     */
    public static class KmerCountReducer extends MapReduceBase
            implements Reducer<Text, KmerCountWritable, Text, KmerCountWritable> {

        @Override
        public void reduce(Text key, Iterator<KmerCountWritable> values,
                OutputCollector<Text, KmerCountWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            KmerCountWritable kcnt;
            HashSet<String> parents = new HashSet<String>();
            while (values.hasNext()) {
                kcnt = values.next();
                sum += kcnt.getCount();
                parents.addAll(kcnt.getParents());
            }
            kcnt = new KmerCountWritable(key.toString(), sum);
            kcnt.addParent(parents);
            //System.err.printf("kmer=%s\tcount=%d\tnumParents=%d", key.toString(), sum, parents.size());
            output.collect(key, kcnt);
        }
    }

    /**
     * Start up a map-reduce job with the given parameters.
     *
     * <P>Setting the system property "kmer.count.parent.fast.map" will result in this using a {@link java.util.Map}
     * to speed up the output of kmers at the expense of memory.
     *
     * @param jobConf
     * @param start     starting window size
     * @param end       ending window size
     * @param input
     * @param output
     * @return
     * @throws java.lang.Exception
     */
    public int initJob(JobConf jobConf, int start, int end, String input, String output) throws Exception {
        JobConf conf = new JobConf(jobConf, CalculateKmerCounts.class);
        conf.setJobName("CalculateKmerCounts");

        if (start <= 2)
            throw new IllegalArgumentException("Value of 'start' argument must be larger than 2");

        // Save our window size so that the tasks have access to them
        conf.set(START, Integer.toString(start));
        conf.set(END, Integer.toString(end));
        //conf.set(FAST_MAP, fastMap ? "Y":"N");

        // Set up mapper
        SequenceFileInputFormat.setInputPaths(conf, new Path(input));
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapperClass(KmerCountMap.class);
        conf.setOutputKeyClass(Text.class); // map output key class
        conf.setOutputValueClass(KmerCountWritable.class); // map output value class

        // Set up combiner/reducer
        conf.setCombinerClass(KmerCountReducer.class);
        conf.setReducerClass(KmerCountReducer.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);

        return 0;
    }

    static int printUsage() {
        System.out.println(
                "CalculateKmerCounts [-libjars <classpath,...>] [-m <maps>] [-r <reduces>]  [-f] [-s <start>] [-e <end>] <input> <output>");
        System.out.printf("\twhere the default start=%d and end=%d\n", DEFAULT_START, DEFAULT_END);
        System.out.println(
                "\t-f will use a HashMap on the mappers to limit the number of k-mers the reducers have to sort through");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf());
        int start = DEFAULT_START;
        int end = DEFAULT_END;

        // @TODO: use commons getopts
        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    conf.setNumMapTasks(Integer.parseInt(args[++i]));
                } else if ("-r".equals(args[i])) {
                    conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else if ("-s".equals(args[i])) {
                    start = Integer.parseInt(args[++i]);
                } else if ("-e".equals(args[i])) {
                    end = Integer.parseInt(args[++i]);
                } else if ("-f".equals(args[i])) {
                    conf.get(FAST_MAP, "true");
                } else if ("-libjars".equals(args[i])) {
                    conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf));

                    URL[] libjars = FileUtils.getLibJars(conf);
                    if (libjars != null && libjars.length > 0) {
                        // Add libjars to client/tasks classpath
                        conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader()));
                        // Adds libjars to our classpath
                        Thread.currentThread().setContextClassLoader(
                                new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader()));
                    }
                } else {
                    other_args.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }
        // Make sure there are exactly 2 parameters left.
        if (other_args.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");

            return printUsage();
        }

        return initJob(conf, start, end, other_args.get(0), other_args.get(1));

    }

    static public void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CalculateKmerCounts(), args);
        System.exit(res);
    }
}