Java tutorial
/**
 * Created on Feb 3, 2009
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.ccv.mapred;

import java.io.IOException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.mitre.mapred.fs.FileUtils;
import org.mitre.ccv.mapred.io.KmerCountWritable;
import org.mitre.ccv.mapred.io.KmerProbabilityWritable;

/**
 * Map-reduce class for calculating the frequency of k-mers (n-grams) given the counts of k-mers
 * and the total sequence (sample) length.
 *
 * <p>Command-line entry point is {@link #main(String[])}; programmatic entry point is
 * {@link #initJob(JobConf, int, int, int, String, String, boolean)}.
 *
 * @author Marc Colosimo
 */
public class CalculateKmerProbabilities extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(CalculateKmerProbabilities.class);

    /** Job configuration key under which the total sequence length is passed to the map tasks. */
    public static final String LENGTH = "total.sequence.length";

    /**
     * Mapper that turns each k-mer count into a frequency and carries the k-mer's
     * parents over to the emitted value.
     */
    public static class KmerFrequencyMapper extends MapReduceBase
            implements Mapper<Text, KmerCountWritable, Text, KmerProbabilityWritable> {

        /** Starting window size, read from {@code CalculateKmerCounts.START}. */
        private int start;

        /** Total sequence length, read from {@link CalculateKmerProbabilities#LENGTH}. */
        private int length;

        /**
         * Reads the starting window size and the total sequence length from the job
         * configuration. Both values are required; a missing or non-numeric value makes
         * {@link Integer#parseInt(String)} throw {@link NumberFormatException}.
         *
         * @param job the job configuration handed to this task
         */
        @Override
        public void configure(JobConf job) {
            super.configure(job);
            start = Integer.parseInt(job.get(CalculateKmerCounts.START));
            length = Integer.parseInt(job.get(LENGTH));
            LOG.debug("Configure: " + job);
        }

        /**
         * Emits the frequency of one k-mer.
         *
         * <p>For a single sequence: {@code p(kmer) = cnt(kmer) / (L - k + 1)}, where
         * {@code L} is the configured total length and {@code k} is the window size
         * (the length of the key).
         *
         * @param key      the k-mer
         * @param value    the count (and parents) recorded for the k-mer
         * @param output   collector receiving the k-mer and its frequency
         * @param reporter used to publish a per-record status string
         * @throws IOException if the configured total length leaves no window for this
         *                     k-mer, or if the collector fails
         */
        @Override
        public void map(Text key, KmerCountWritable value,
                OutputCollector<Text, KmerProbabilityWritable> output,
                Reporter reporter) throws IOException {
            final int k = key.getLength();

            // Skip k-mers that are more than two shorter than the starting window size
            // but still longer than two. The original author questioned whether this
            // filter is needed; it is kept unchanged.
            if (k < start - 2 && k - 2 > 0) {
                return;
            }

            final String kmer = key.toString();
            reporter.setStatus(String.format("Generating frequency for %s", kmer));

            // Number of windows of size k in a sequence of the configured length.
            // Computed in long so the subtraction cannot wrap.
            final long windows = (long) length - k + 1;
            if (windows <= 0) {
                // Dividing anyway would silently emit an infinite or negative frequency.
                throw new IOException(String.format(
                        "Total sequence length %d is too short for k-mer '%s' of length %d",
                        length, kmer, k));
            }

            final double freq = ((double) value.getCount()) / windows;
            final KmerProbabilityWritable kfreq = new KmerProbabilityWritable(kmer, freq);
            kfreq.addParent(value.getParents());
            output.collect(key, kfreq);
        }
    }

    /**
     * Configures and runs the frequency job.
     *
     * <p>This takes {@link org.apache.hadoop.io.SequenceFile}s with values of
     * {@link KmerCountWritable}s, such as those generated by {@link CalculateKmerCounts},
     * and a total sequence length, such as that generated by {@link TotalSequenceLength},
     * and generates frequencies for the k-mers.
     *
     * @param jobConf   base configuration the job configuration is derived from
     * @param start     starting window size
     * @param end       ending window size
     * @param length    total length of all of the sequences; must be greater than zero
     * @param input     path of the input sequence file(s)
     * @param output    path of the output directory
     * @param cleanLogs if {@code true}, the {@code _logs} directory under {@code output}
     *                  is deleted after the job has run
     * @return {@code 0} once the job has been run
     * @throws IllegalArgumentException if {@code length} is zero or negative
     * @throws java.lang.Exception if running the job or removing the log directory fails
     */
    public int initJob(JobConf jobConf, int start, int end, int length, String input,
            String output, boolean cleanLogs) throws Exception {
        /*
         * We cannot mix combiner and reducer outputs so we cannot just use
         * CalculateKmerCounts mapper and combiner with this reducer:
         * they have different output values!
         */
        JobConf conf = new JobConf(jobConf, CalculateKmerProbabilities.class);
        conf.setJobName("CalculateKmerFrequency");

        // Save our window sizes so that the tasks have access to them.
        conf.set(CalculateKmerCounts.START, Integer.toString(start));
        // NOTE: the mapper in this class reads only START and LENGTH; the original
        // author flagged END as a candidate for removal here.
        conf.set(CalculateKmerCounts.END, Integer.toString(end));

        Path outputPath = new Path(output);

        // The length is never computed here; a usable value must be supplied.
        if (length <= 0) {
            LOG.info("Negative or no length given.");
            throw new java.lang.IllegalArgumentException(
                    String.format("Negative or no length given: %d", length));
        }
        conf.set(LENGTH, Integer.toString(length));

        // Set up mapper
        SequenceFileInputFormat.setInputPaths(conf, new Path(input));
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapperClass(KmerFrequencyMapper.class);
        conf.setOutputKeyClass(Text.class);                      // final output key class
        conf.setOutputValueClass(KmerProbabilityWritable.class); // final output value class

        // No reducer is set: the identity reduce is fine.
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(conf, outputPath);

        JobClient.runJob(conf);

        if (cleanLogs) {
            LOG.info("removing log directory");
            Path path = new Path(output, "_logs");
            FileSystem fs = path.getFileSystem(jobConf);
            fs.delete(path, true);
        }
        return 0;
    }

    /**
     * Prints the command-line usage to standard output.
     *
     * @return {@code -1}, so callers can {@code return printUsage();} as an error exit code
     */
    static int printUsage() {
        System.out.println(
                "CalculateKmerFrequency [-libjars <classpath,...>] [-m <maps>] [-r <reduces>] [-c] -s <start> -e <end> -l <total length> <input> <output>");
        System.out.println("\t-c removes the _logs directory from <output> after the job has run");
        System.out.printf("\twhere the default start=%d and end=%d\n",
                CalculateKmerCounts.DEFAULT_START, CalculateKmerCounts.DEFAULT_END);
        return -1;
    }

    /**
     * Parses the command line and runs the job.
     *
     * <p>Recognised options: {@code -m}, {@code -r}, {@code -s}, {@code -e}, {@code -l},
     * {@code -c} and {@code -libjars}. Exactly two remaining arguments are expected:
     * the input path followed by the output path.
     *
     * @param args command-line arguments
     * @return {@code -1} after printing usage when the arguments are invalid, otherwise
     *         the result of {@link #initJob(JobConf, int, int, int, String, String, boolean)}
     * @throws Exception if option handling or the job itself fails
     */
    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf());
        boolean cleanLogs = false;
        int start = CalculateKmerCounts.DEFAULT_START;
        int end = CalculateKmerCounts.DEFAULT_END;
        int length = -1;

        // @TODO: use commons getopts
        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    conf.setNumMapTasks(Integer.parseInt(args[++i]));
                } else if ("-r".equals(args[i])) {
                    conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else if ("-s".equals(args[i])) {
                    start = Integer.parseInt(args[++i]);
                } else if ("-e".equals(args[i])) {
                    end = Integer.parseInt(args[++i]);
                } else if ("-c".equals(args[i])) {
                    cleanLogs = true;
                } else if ("-l".equals(args[i])) {
                    length = Integer.parseInt(args[++i]);
                } else if ("-libjars".equals(args[i])) {
                    conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf));

                    URL[] libjars = FileUtils.getLibJars(conf);
                    if (libjars != null && libjars.length > 0) {
                        // Add libjars to client/tasks classpath
                        conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader()));
                        // Adds libjars to our classpath
                        Thread.currentThread().setContextClassLoader(
                                new URLClassLoader(libjars,
                                        Thread.currentThread().getContextClassLoader()));
                    }
                } else {
                    otherArgs.add(args[i]);
                }
            } catch (NumberFormatException except) {
                // i already points at the offending value because of the ++i above.
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                // ++i ran past the end of args, so the option itself is at i - 1.
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }

        // Make sure there are exactly 2 parameters left.
        if (otherArgs.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size()
                    + " instead of 2.");
            return printUsage();
        }

        if (length <= 0) {
            System.out.println("ERROR: Requires total length of sequence to be > 0");
            return printUsage();
        }

        return initJob(conf, start, end, length, otherArgs.get(0), otherArgs.get(1), cleanLogs);
    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner} and exits the
     * JVM with the tool's return code.
     *
     * @param args command-line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CalculateKmerProbabilities(), args);
        System.exit(res);
    }
}