Java tutorial
/** * Created on March 26, 2009. * * Copyright 2010- The MITRE Corporation. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions andlimitations under * the License. * * $Id$ */ package org.mitre.ccv.mapred; import java.io.IOException; import java.net.URL; import java.net.URLClassLoader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.mitre.mapred.fs.FileUtils; import org.mitre.ccv.mapred.io.KmerProbabilityMapWritable; import org.mitre.ccv.mapred.io.KmerProbabilityWritable; /** * Map-Reduce class for inverting the parents of k-mer probabilities, such as * those generated by {@link CalculateKmerProbabilities}, from a {@link SequenceFile} * with k-mers (@link Text} keys and {@link KmerProbabilityWritable} values to * a SequenceFile with the parent of the k-mer being the new key and the value * (probability) of the given class set. * * @author Marc Colosimo */ public class InvertKmerProbabilities extends Configured implements Tool { private static final Log LOG = LogFactory.getLog(InvertKmerProbabilities.class); /** * Mapper for inverting the frequencies generating a k-mer with all of its * required substrings for generating its Pi-value. */ public static class InverterMapper extends MapReduceBase implements Mapper<Text, KmerProbabilityWritable, Text, KmerProbabilityMapWritable> { private Text kmerKey = new Text(); @Override public void map(Text key, KmerProbabilityWritable value, OutputCollector<Text, KmerProbabilityMapWritable> output, Reporter reporter) throws IOException { KmerProbabilityMapWritable map = new KmerProbabilityMapWritable(key.toString(), value.getFrequency()); reporter.setStatus(String.format("Inverting %s (%f) with %d parents\n", key.toString(), value.getFrequency(), value.getParents().size())); //System.out.printf("Inverting %s (%f) with %d parents\n", key.toString(), value.getFrequency(), value.getParents().size()); output.collect(key, map); // Now emit all of the parents as key-map pairs so that we can combine them for (Iterator<String> iter = value.getParents().iterator(); iter.hasNext();) { String kmer = iter.next(); map = new KmerProbabilityMapWritable(); map.setKey(kmer); kmerKey.set(kmer); // Add this to the parent map.set(key.toString(), value.getFrequency()); //System.out.printf("\tInverting Parent: %s\n", kmer); output.collect(kmerKey, map); } } } /** * Both a combiner and reducer for keys with their frequencies and now substrings. */ public static class InverterReducer extends MapReduceBase implements Reducer<Text, KmerProbabilityMapWritable, Text, KmerProbabilityMapWritable> { @Override public void reduce(Text key, Iterator<KmerProbabilityMapWritable> values, OutputCollector<Text, KmerProbabilityMapWritable> output, Reporter reporter) throws IOException { reporter.setStatus("Reducing " + key.toString()); //System.err.printf("k=%s\n", key.toString()); KmerProbabilityMapWritable map = new KmerProbabilityMapWritable(); map.setKey(key.toString()); while (values.hasNext()) { KmerProbabilityMapWritable vmap = values.next(); for (Iterator<Entry<String, Double>> iter = vmap.get().entrySet().iterator(); iter.hasNext();) { Entry<String, Double> entry = iter.next(); map.set(entry.getKey(), entry.getValue()); //System.err.printf("\tk=%s\tv=%f\n", entry.getKey(), entry.getValue()); } } //System.err.println(); output.collect(key, map); } } /** * Start up the job with the given parameters. * * @param jobConf The {@link JobConf} to use * @param input path to the {@link SequenceFile}s * @param output path to save the output * @param cleanLogs if <code>true</code> remove the logs * @return * @throws java.lang.Exception */ public int initJob(JobConf jobConf, String input, String output, boolean cleanLogs) throws Exception { JobConf conf = new JobConf(jobConf, InvertKmerProbabilities.class); conf.setJobName("InvertKmerFrequencies"); // Set up mapper SequenceFileInputFormat.setInputPaths(conf, new Path(input)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapperClass(InverterMapper.class); conf.setOutputKeyClass(Text.class); // final output key class conf.setOutputValueClass(KmerProbabilityMapWritable.class); // final output value class // Set up combiner/reducer conf.setCombinerClass(InverterReducer.class); conf.setReducerClass(InverterReducer.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); if (cleanLogs) { LOG.info("removing log directory"); Path path = new Path(output, "_logs"); FileSystem fs = path.getFileSystem(jobConf); fs.delete(path, true); } return 0; } static int printUsage() { System.out.println( "InvertKmerFrequencies [-libjars <classpath,...>] [-m <maps>] [-r <reduces>] [-c] <input> <output>"); return -1; } @Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf()); boolean cleanLogs = false; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); } else if ("-c".equals(args[i])) { cleanLogs = true; } else if ("-libjars".equals(args[i])) { conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf)); URL[] libjars = FileUtils.getLibJars(conf); if (libjars != null && libjars.length > 0) { // Add libjars to client/tasks classpath conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader())); // Adds libjars to our classpath Thread.currentThread().setContextClassLoader( new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader())); } } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2."); return printUsage(); } return initJob(conf, other_args.get(0), other_args.get(1), cleanLogs); } static public void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new InvertKmerProbabilities(), args); System.exit(res); } }