Java tutorial
// Copyright (c) 2010 Aalto University // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. package fi.tkk.ics.hadoop.bam.cli.plugins.chipster; import hbparquet.hadoop.util.ContextUtil; import java.io.DataInput; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.FilterOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.BlockCompressedOutputStream; import net.sf.samtools.util.BlockCompressedStreamConstants; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.mapreduce.lib.partition.InputSampler; import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner; import fi.tkk.ics.hadoop.bam.AnySAMInputFormat; import fi.tkk.ics.hadoop.bam.BAMRecordReader; import fi.tkk.ics.hadoop.bam.SAMRecordWritable; import fi.tkk.ics.hadoop.bam.cli.CLIMRPlugin; import fi.tkk.ics.hadoop.bam.cli.Utils; import fi.tkk.ics.hadoop.bam.custom.jargs.gnu.CmdLineParser; import fi.tkk.ics.hadoop.bam.custom.jargs.gnu.CmdLineParser.Option.BooleanOption; import fi.tkk.ics.hadoop.bam.util.Pair; import fi.tkk.ics.hadoop.bam.util.Timer; public final class Summarize extends CLIMRPlugin { private static final List<Pair<CmdLineParser.Option, String>> optionDescs = new ArrayList<Pair<CmdLineParser.Option, String>>(); private static final CmdLineParser.Option sortOpt = new BooleanOption('s', "sort"), noTrustExtsOpt = new BooleanOption("no-trust-exts"); public Summarize() { super("summarize", "summarize SAM or BAM for zooming", "3.1", "WORKDIR LEVELS INPATH", optionDescs, "Outputs, for each level in LEVELS, a summary file describing the " + "average number of alignments at various positions in the SAM or " + "BAM file in INPATH. The summary files are placed in parts in " + "WORKDIR." + "\n\n" + "LEVELS should be a comma-separated list of positive integers. " + "Each level is the number of alignments that are summarized into " + "one group."); } static { optionDescs.add(new Pair<CmdLineParser.Option, String>(outputPathOpt, "output complete summary files to the directory " + "PATH, removing the parts from WORKDIR")); optionDescs.add(new Pair<CmdLineParser.Option, String>(sortOpt, "sort created summaries by position")); optionDescs.add(new Pair<CmdLineParser.Option, String>(noTrustExtsOpt, "detect SAM/BAM files only by contents, never by " + "file extension")); } private final Timer t = new Timer(); private String[] levels; private Path wrkDir, mainSortOutputDir; private String wrkFile; private boolean sorted = false; private int missingArg(String s) { System.err.printf("summarize :: %s not given.\n", s); return 3; } @Override protected int run(CmdLineParser parser) { final List<String> args = parser.getRemainingArgs(); switch (args.size()) { case 0: return missingArg("WORKDIR"); case 1: return missingArg("LEVELS"); case 2: return missingArg("INPATH"); default: break; } if (!cacheAndSetProperties(parser)) return 3; levels = args.get(1).split(","); for (String l : levels) { try { int lvl = Integer.parseInt(l); if (lvl > 0) continue; System.err.printf("summarize :: summary level '%d' is not positive!\n", lvl); } catch (NumberFormatException e) { System.err.printf("summarize :: summary level '%s' is not an integer!\n", l); } return 3; } wrkDir = new Path(args.get(0)); final Path bam = new Path(args.get(2)); final boolean sort = parser.getBoolean(sortOpt); final Configuration conf = getConf(); conf.setBoolean(AnySAMInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt)); // Used by Utils.getMergeableWorkFile() to name the output files. wrkFile = bam.getName(); conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile); conf.setStrings(SummarizeReducer.SUMMARY_LEVELS_PROP, levels); try { try { // There's a lot of different Paths here, and it can get a bit // confusing. Here's how it works: // // - outPath is the output dir for the final merged output, given // with the -o parameter. // // - wrkDir is the user-given path where the outputs of the // reducers go. // // - mergedTmpDir (defined further below) is $wrkDir/sort.tmp: if // we are sorting, the summaries output in the first Hadoop job // are merged in there. // // - mainSortOutputDir is $wrkDir/sorted.tmp: getSortOutputDir() // gives a per-level/strand directory under it, which is used by // doSorting() and mergeOne(). This is necessary because we // cannot have multiple Hadoop jobs outputting into the same // directory at the same time, as explained in the comment in // sortMerged(). // Required for path ".", for example. wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir); mainSortOutputDir = sort ? new Path(wrkDir, "sorted.tmp") : null; if (!runSummary(bam)) return 4; } catch (IOException e) { System.err.printf("summarize :: Summarizing failed: %s\n", e); return 4; } Path mergedTmpDir = null; try { if (sort) { mergedTmpDir = new Path(wrkDir, "sort.tmp"); mergeOutputs(mergedTmpDir); } else if (outPath != null) mergeOutputs(outPath); } catch (IOException e) { System.err.printf("summarize :: Merging failed: %s\n", e); return 5; } if (sort) { if (!doSorting(mergedTmpDir)) return 6; // Reset this since SummarySort uses it. conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile); tryDelete(mergedTmpDir); if (outPath != null) try { sorted = true; mergeOutputs(outPath); } catch (IOException e) { System.err.printf("summarize :: Merging sorted output failed: %s\n", e); return 7; } else { // Move the unmerged results out of the mainSortOutputDir // subdirectories to wrkDir. System.out.println("summarize :: Moving outputs from temporary directories..."); t.start(); try { final FileSystem fs = wrkDir.getFileSystem(conf); for (String lvl : levels) { final FileStatus[] parts; try { parts = fs.globStatus(new Path(new Path(mainSortOutputDir, lvl + "[fr]"), "*-[0-9][0-9][0-9][0-9][0-9][0-9]")); } catch (IOException e) { System.err.printf("summarize :: Couldn't move level %s results: %s", lvl, e); continue; } for (FileStatus part : parts) { final Path path = part.getPath(); try { fs.rename(path, new Path(wrkDir, path.getName())); } catch (IOException e) { System.err.printf("summarize :: Couldn't move '%s': %s", path, e); } } } } catch (IOException e) { System.err.printf("summarize :: Moving results failed: %s", e); } System.out.printf("summarize :: Moved in %d.%03d s.\n", t.stopS(), t.fms()); } tryDelete(mainSortOutputDir); } } catch (ClassNotFoundException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } return 0; } private boolean runSummary(Path bamPath) throws IOException, ClassNotFoundException, InterruptedException { final Configuration conf = getConf(); Utils.configureSampling(wrkDir, bamPath.getName(), conf); final Job job = new Job(conf); job.setJarByClass(Summarize.class); job.setMapperClass(Mapper.class); job.setReducerClass(SummarizeReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Range.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(RangeCount.class); job.setInputFormatClass(SummarizeInputFormat.class); job.setOutputFormatClass(SummarizeOutputFormat.class); FileInputFormat.setInputPaths(job, bamPath); FileOutputFormat.setOutputPath(job, wrkDir); job.setPartitionerClass(TotalOrderPartitioner.class); System.out.println("summarize :: Sampling..."); t.start(); InputSampler.<LongWritable, Range>writePartitionFile(job, new InputSampler.RandomSampler<LongWritable, Range>(0.01, 10000, Math.max(100, reduceTasks))); System.out.printf("summarize :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms()); for (String lvl : levels) { MultipleOutputs.addNamedOutput(job, getSummaryName(lvl, false), SummarizeOutputFormat.class, NullWritable.class, Range.class); MultipleOutputs.addNamedOutput(job, getSummaryName(lvl, true), SummarizeOutputFormat.class, NullWritable.class, Range.class); } job.submit(); System.out.println("summarize :: Waiting for job completion..."); t.start(); if (!job.waitForCompletion(verbose)) { System.err.println("summarize :: Job failed."); return false; } System.out.printf("summarize :: Job complete in %d.%03d s.\n", t.stopS(), t.fms()); return true; } private void mergeOutputs(Path out) throws IOException { System.out.println("summarize :: Merging output..."); t.start(); final Configuration conf = getConf(); final FileSystem srcFS = wrkDir.getFileSystem(conf); final FileSystem dstFS = out.getFileSystem(conf); final Timer tl = new Timer(); for (String l : levels) { mergeOne(l, false, out, srcFS, dstFS, tl); mergeOne(l, true, out, srcFS, dstFS, tl); } System.out.printf("summarize :: Merging complete in %d.%03d s.\n", t.stopS(), t.fms()); } private void mergeOne(String lvl, boolean reverseStrand, Path out, FileSystem srcFS, FileSystem dstFS, Timer t) throws IOException { t.start(); final char strand = reverseStrand ? 'r' : 'f'; final OutputStream outs = dstFS.create(new Path(out, getFinalSummaryName(lvl, reverseStrand))); Utils.mergeInto(outs, sorted ? getSortOutputDir(lvl, strand) : wrkDir, "", "-" + getSummaryName(lvl, reverseStrand), getConf(), null); // Don't forget the BGZF terminator. outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); outs.close(); System.out.printf("summarize :: Merged %s%c in %d.%03d s.\n", lvl, strand, t.stopS(), t.fms()); } private boolean doSorting(Path inputDir) throws ClassNotFoundException, InterruptedException { final Configuration conf = getConf(); final Job[] jobs = new Job[2 * levels.length]; boolean errors = false; for (int i = 0; i < levels.length; ++i) { final String lvl = levels[i]; try { // Each job has to run in a separate directory because // FileOutputCommitter deletes the _temporary within whenever a job // completes - that is, not only the subdirectories in _temporary // that are specific to that job, but all of _temporary. // // It's easier to just give different temporary output directories // here than to override that behaviour. jobs[2 * i] = SummarySort.sortOne(conf, new Path(inputDir, getFinalSummaryName(lvl, false)), getSortOutputDir(lvl, 'f'), "summarize", " for sorting " + lvl + 'f'); jobs[2 * i + 1] = SummarySort.sortOne(conf, new Path(inputDir, getFinalSummaryName(lvl, true)), getSortOutputDir(lvl, 'r'), "summarize", " for sorting " + lvl + 'r'); } catch (IOException e) { System.err.printf("summarize :: Submitting sorting job %s failed: %s\n", lvl, e); if (i == 0) return false; else errors = true; } } System.out.println("summarize :: Waiting for sorting jobs' completion..."); t.start(); for (int i = 0; i < jobs.length; ++i) { boolean success; try { success = jobs[i].waitForCompletion(verbose); } catch (IOException e) { success = false; } final String l = levels[i / 2]; final char s = i % 2 == 0 ? 'f' : 'r'; if (!success) { System.err.printf("summarize :: Sorting job for %s%c failed.\n", l, s); errors = true; continue; } System.out.printf("summarize :: Sorting job for %s%c complete.\n", l, s); } if (errors) return false; System.out.printf("summarize :: Jobs complete in %d.%03d s.\n", t.stopS(), t.fms()); return true; } private String getFinalSummaryName(String lvl, boolean reverseStrand) { return wrkFile + "-" + getSummaryName(lvl, reverseStrand); } /*package*/ static String getSummaryName(String lvl, boolean reverseStrand) { return "summary" + lvl + (reverseStrand ? 'r' : 'f'); } private Path getSortOutputDir(String level, char strand) { return new Path(mainSortOutputDir, level + strand); } private void tryDelete(Path path) { try { path.getFileSystem(getConf()).delete(path, true); } catch (IOException e) { System.err.printf("summarize :: Warning: couldn't delete '%s': %s\n", path, e); } } } final class SummarizeReducer extends Reducer<LongWritable, Range, NullWritable, RangeCount> { public static final String SUMMARY_LEVELS_PROP = "summarize.summary.levels"; private MultipleOutputs<NullWritable, RangeCount> mos; // For the reverse and forward strands, respectively. private final List<SummaryGroup> summaryGroupsR = new ArrayList<SummaryGroup>(), summaryGroupsF = new ArrayList<SummaryGroup>(); private final RangeCount summary = new RangeCount(); // This is a safe initial choice: it doesn't matter whether the first actual // reference ID we get matches this or not, since all summaryLists are empty // anyway. private int currentReferenceID = 0; @Override public void setup(Reducer<LongWritable, Range, NullWritable, RangeCount>.Context ctx) { mos = new MultipleOutputs<NullWritable, RangeCount>(ctx); for (String s : ContextUtil.getConfiguration(ctx).getStrings(SUMMARY_LEVELS_PROP)) { int lvl = Integer.parseInt(s); summaryGroupsR.add(new SummaryGroup(lvl, Summarize.getSummaryName(s, true))); summaryGroupsF.add(new SummaryGroup(lvl, Summarize.getSummaryName(s, false))); } } @Override protected void reduce(LongWritable key, Iterable<Range> ranges, Reducer<LongWritable, Range, NullWritable, RangeCount>.Context context) throws IOException, InterruptedException { final int referenceID = (int) (key.get() >>> 32); // When the reference sequence changes we have to flush out everything // we've got and start from scratch again. if (referenceID != currentReferenceID) { currentReferenceID = referenceID; doAllSummaries(); } for (final Range range : ranges) { final int beg = range.beg.get(), end = range.end.get(); final List<SummaryGroup> summaryGroups = range.reverseStrand.get() ? summaryGroupsR : summaryGroupsF; for (SummaryGroup group : summaryGroups) { group.sumBeg += beg; group.sumEnd += end; if (++group.count == group.level) doSummary(group); } } } @Override protected void cleanup(Reducer<LongWritable, Range, NullWritable, RangeCount>.Context context) throws IOException, InterruptedException { // Don't lose any remaining ones at the end. doAllSummaries(); mos.close(); } private void doAllSummaries() throws IOException, InterruptedException { for (SummaryGroup group : summaryGroupsR) if (group.count > 0) doSummary(group); for (SummaryGroup group : summaryGroupsF) if (group.count > 0) doSummary(group); } private void doSummary(SummaryGroup group) throws IOException, InterruptedException { // The reverseStrand flag is already represented in which group is passed // to this method, so there's no need to set it in summary.range. summary.rid.set(currentReferenceID); summary.range.beg.set((int) (group.sumBeg / group.count)); summary.range.end.set((int) (group.sumEnd / group.count)); summary.count.set(group.count); mos.write(NullWritable.get(), summary, group.outName); group.reset(); } } final class Range implements Writable { public final IntWritable beg = new IntWritable(); public final IntWritable end = new IntWritable(); public final BooleanWritable reverseStrand = new BooleanWritable(); public Range() { } public Range(int b, int e, boolean rev) { beg.set(b); end.set(e); reverseStrand.set(rev); } public int getCentreOfMass() { return (int) (((long) beg.get() + end.get()) / 2); } @Override public void write(DataOutput out) throws IOException { beg.write(out); end.write(out); reverseStrand.write(out); } @Override public void readFields(DataInput in) throws IOException { beg.readFields(in); end.readFields(in); reverseStrand.readFields(in); } } final class RangeCount implements Comparable<RangeCount>, Writable { public final Range range = new Range(); public final IntWritable count = new IntWritable(); public final IntWritable rid = new IntWritable(); // This is what the TextOutputFormat will write. The format is // tabix-compatible; see http://samtools.sourceforge.net/tabix.shtml. // // It might not be sorted by range.beg though! With the centre of mass // approach, it most likely won't be. @Override public String toString() { return rid + "\t" + range.beg + "\t" + range.end + "\t" + count; } // Comparisons only take into account the leftmost position. @Override public int compareTo(RangeCount o) { return Integer.valueOf(range.beg.get()).compareTo(o.range.beg.get()); } @Override public void write(DataOutput out) throws IOException { range.write(out); count.write(out); rid.write(out); } @Override public void readFields(DataInput in) throws IOException { range.readFields(in); count.readFields(in); rid.readFields(in); } } // We want the centre of mass to be used as (the low order bits of) the key // already at this point, because we want a total order so that we can // meaningfully look at consecutive ranges in the reducers. If we were to set // the final key in the mapper, the partitioner wouldn't use it. // // And since getting the centre of mass requires calculating the Range as well, // we might as well get that here as well. final class SummarizeInputFormat extends FileInputFormat<LongWritable, Range> { private AnySAMInputFormat baseIF = null; private void initBaseIF(final Configuration conf) { if (baseIF == null) baseIF = new AnySAMInputFormat(conf); } @Override protected boolean isSplitable(JobContext job, Path path) { initBaseIF(ContextUtil.getConfiguration(job)); return baseIF.isSplitable(job, path); } @Override public List<InputSplit> getSplits(JobContext job) throws IOException { initBaseIF(ContextUtil.getConfiguration(job)); return baseIF.getSplits(job); } @Override public RecordReader<LongWritable, Range> createRecordReader(InputSplit split, TaskAttemptContext ctx) throws InterruptedException, IOException { initBaseIF(ContextUtil.getConfiguration(ctx)); final RecordReader<LongWritable, Range> rr = new SummarizeRecordReader( baseIF.createRecordReader(split, ctx)); rr.initialize(split, ctx); return rr; } } final class SummarizeRecordReader extends RecordReader<LongWritable, Range> { private final RecordReader<LongWritable, SAMRecordWritable> baseRR; private final LongWritable key = new LongWritable(); private final List<Range> ranges = new ArrayList<Range>(); private int rangeIdx = 0; public SummarizeRecordReader(RecordReader<LongWritable, SAMRecordWritable> rr) { baseRR = rr; } @Override public void initialize(InputSplit spl, TaskAttemptContext ctx) { } @Override public void close() throws IOException { baseRR.close(); } @Override public float getProgress() throws InterruptedException, IOException { return baseRR.getProgress(); } @Override public LongWritable getCurrentKey() { return key; } @Override public Range getCurrentValue() { return ranges.get(rangeIdx); } @Override public boolean nextKeyValue() throws InterruptedException, IOException { if (rangeIdx + 1 < ranges.size()) { ++rangeIdx; key.set(key.get() >>> 32 << 32 | getCurrentValue().getCentreOfMass()); return true; } SAMRecord rec; do { if (!baseRR.nextKeyValue()) return false; rec = baseRR.getCurrentValue().get(); } while (rec.getReadUnmappedFlag() || rec.getReferenceIndex() < 0 || rec.getAlignmentStart() < 0); parseCIGAR(rec, rec.getReadNegativeStrandFlag()); rangeIdx = 0; key.set(BAMRecordReader.getKey0(rec.getReferenceIndex(), getCurrentValue().getCentreOfMass())); return true; } void parseCIGAR(SAMRecord rec, boolean reverseStrand) { ranges.clear(); final Cigar cigar = rec.getCigar(); int begPos = rec.getAlignmentStart(); int endPos = begPos; for (int i = 0; i < rec.getCigarLength(); ++i) { final CigarElement element = cigar.getCigarElement(i); final CigarOperator op = element.getOperator(); switch (op) { case M: case EQ: case X: // Accumulate this part into the current range. endPos += element.getLength(); continue; default: break; } if (begPos != endPos) { // No more consecutive fully contained parts: save the range and // move along. ranges.add(new Range(begPos, endPos - 1, reverseStrand)); begPos = endPos; } if (op.consumesReferenceBases()) { begPos += element.getLength(); endPos = begPos; } } if (begPos != endPos) ranges.add(new Range(begPos, endPos - 1, reverseStrand)); } } final class SummarizeOutputFormat extends TextOutputFormat<NullWritable, RangeCount> { @Override public RecordWriter<NullWritable, RangeCount> getRecordWriter(TaskAttemptContext ctx) throws IOException { Path path = getDefaultWorkFile(ctx, ""); FileSystem fs = path.getFileSystem(ContextUtil.getConfiguration(ctx)); final OutputStream file = fs.create(path); return new TextOutputFormat.LineRecordWriter<NullWritable, RangeCount>( new DataOutputStream(new FilterOutputStream(new BlockCompressedOutputStream(file, null)) { @Override public void close() throws IOException { // Don't close the BlockCompressedOutputStream, so we don't // get an end-of-file sentinel. this.out.flush(); // Instead, close the file stream directly. file.close(); } })); } @Override public Path getDefaultWorkFile(TaskAttemptContext ctx, String ext) throws IOException { // From MultipleOutputs. If we had a later version of FileOutputFormat as // well, we'd use super.getOutputName(). String summaryName = ContextUtil.getConfiguration(ctx).get("mapreduce.output.basename"); // A RecordWriter is created as soon as a reduce task is started, even // though MultipleOutputs eventually overrides it with its own. // // To avoid creating a file called "inputfilename-null" when that // RecordWriter is initialized, make it a hidden file instead, like this. // // We can't use a filename we'd use later, because TextOutputFormat would // throw later on, as the file would already exist. String prefix = summaryName == null ? ".unused_" : ""; return Utils.getMergeableWorkFile(super.getDefaultWorkFile(ctx, ext).getParent(), prefix, "-" + summaryName, ctx, ext); } // Allow the output directory to exist. @Override public void checkOutputSpecs(JobContext job) { } } final class SummaryGroup { public int count; public final int level; public long sumBeg, sumEnd; public final String outName; public SummaryGroup(int lvl, String name) { level = lvl; outName = name; reset(); } public void reset() { sumBeg = sumEnd = 0; count = 0; } }