Java tutorial
/* * Copyright (C) 2009-2010 Institute for Computational Biomedicine, * Weill Medical College of Cornell University * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package edu.cornell.med.icb.goby.modes; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import edu.cornell.med.icb.goby.algorithmic.algorithm.AnnotationCount; import edu.cornell.med.icb.goby.algorithmic.data.Annotation; import edu.cornell.med.icb.goby.algorithmic.data.AnnotationRPKM; import edu.cornell.med.icb.goby.algorithmic.data.Segment; import edu.cornell.med.icb.goby.alignments.AlignmentReader; import edu.cornell.med.icb.goby.alignments.Alignments; import edu.cornell.med.icb.goby.alignments.AlignmentReaderImpl; import edu.cornell.med.icb.goby.util.Timer; import edu.cornell.med.icb.identifier.DoubleIndexedIdentifier; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntSet; import it.unimi.dsi.fastutil.objects.Object2ObjectMap; import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.fastutil.objects.ObjectList; import it.unimi.dsi.fastutil.objects.ObjectListIterator; import org.apache.commons.io.IOUtils; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.Set; /** * Aggregates peaks according to their interpeak RPKM differences given a threshold. * * @author Jaaved Mohammed */ public class AggregatePeaksByRPKMDifferenceMode extends AbstractGobyMode { /** * The mode name. */ private static final String MODE_NAME = "aggregate-by-peak-rpkm-difference"; /** * The mode description help text. */ private static final String MODE_DESCRIPTION = "Aggregates peaks according to their interpeak RPKM differences given a threshold."; /** * The input files of compact reads. */ private String[] inputFilenames; /** * The output file. */ private String outputFile; /** * The annotation-like file containing the union of all peaks accross all samples. */ private String unionAnnotationFile; private int rpkmDiffThreshold; private String rpkmFileName; @Override public String getModeName() { return MODE_NAME; } @Override public String getModeDescription() { return MODE_DESCRIPTION; } /** * Configure. * * @param args command line arguments * @return this object for chaining * @throws java.io.IOException error parsing * @throws com.martiansoftware.jsap.JSAPException * error parsing */ @Override public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException { final JSAPResult jsapResult = parseJsapArguments(args); inputFilenames = jsapResult.getStringArray("input"); outputFile = jsapResult.getString("output"); unionAnnotationFile = jsapResult.getString("peak-union"); rpkmDiffThreshold = jsapResult.getInt("threshold"); rpkmFileName = jsapResult.getString("rpkm-file"); return this; } /** * Run the mode. * * @throws java.io.IOException error reading / writing */ @Override public void execute() throws IOException { System.out.println("Reading annotations from " + unionAnnotationFile); final Object2ObjectMap<String, ObjectList<AnnotationRPKM>> allAnnots = readAnnotations(unionAnnotationFile); final Timer timer = new Timer(); timer.start(); //For all input files, and given the union peaks, compute the average RPKM for each peak across all input samples for (final String inputFile : inputFilenames) { System.out.println("Reading alignment file: " + inputFile); final String inputBasename = AlignmentReaderImpl.getBasename(inputFile); final AlignmentReaderImpl reader = new AlignmentReaderImpl(inputBasename); reader.readHeader(); final int numberOfReferences = reader.getNumberOfTargets(); final DoubleIndexedIdentifier referenceIds = new DoubleIndexedIdentifier(reader.getTargetIdentifiers()); reader.close(); final AnnotationCount[] algs = new AnnotationCount[numberOfReferences]; final IntSet referencesToProcess = new IntOpenHashSet(); final Set<String> chromosomeNames = allAnnots.keySet(); // create count writers, one for each reference sequence in the alignment: for (int referenceIndex = 0; referenceIndex < numberOfReferences; referenceIndex++) { final String referenceName = referenceIds.getId(referenceIndex).toString(); if (chromosomeNames.contains(referenceName)) { // subset of reference names selected by the command line: referencesToProcess.add(referenceIndex); algs[referenceIndex] = new AnnotationCount(); algs[referenceIndex].getBaseCounter().startPopulating(); } } final AlignmentReader referenceReader = new AlignmentReaderImpl(inputBasename); referenceReader.readHeader(); // read the alignment: int numAlignedReadsInSample = 0; for (final Alignments.AlignmentEntry alignmentEntry : referenceReader) { final int referenceIndex = alignmentEntry.getTargetIndex(); if (referencesToProcess.contains(referenceIndex)) { final int startPosition = alignmentEntry.getPosition(); final int alignmentLength = alignmentEntry.getQueryAlignedLength(); //shifted the ends populating by 1 for (int i = 0; i < alignmentEntry.getMultiplicity(); ++i) { algs[referenceIndex].populate(startPosition, startPosition + alignmentLength); ++numAlignedReadsInSample; } } } reader.close(); computeAnnotationRPKM(allAnnots, referenceIds, algs, referencesToProcess, numAlignedReadsInSample); } //Now that we've finished processing all samples, compute the average of the RPKMs across all samples //Also if a rpkm file is specified, write the averaged RPKM values to this file. final int numSamples = inputFilenames.length; final Set<String> references = allAnnots.keySet(); if (rpkmFileName != null) { System.out.println("Writing the RPKM values to file: " + rpkmFileName); } for (final String reference : references) { System.out.println("Averaging RPKM for reference " + reference); final ObjectList<AnnotationRPKM> annots = allAnnots.get(reference); for (final AnnotationRPKM annot : annots) { annot.rpkm /= numSamples; } if (rpkmFileName != null) { writeRPKM(rpkmFileName, annots, true /*append*/); } } //FINALLY, we can perform the aggregation for (final String reference : references) { System.out.println("Aggregating annotations by RPKM for reference " + reference); final ObjectList<AnnotationRPKM> annots = allAnnots.get(reference); final ObjectList<Annotation> mergedAnnots = new ObjectArrayList<Annotation>(); final ObjectListIterator<AnnotationRPKM> annotsIterator = annots.listIterator(); AnnotationRPKM annot = null; AnnotationRPKM nextAnnot = null; if (annotsIterator.hasNext()) { annot = annotsIterator.next(); } while (annot != null) { final Annotation newAnnot = new Annotation(annot.getChromosome() + "." + annot.getStart(), annot.getChromosome()); newAnnot.getSegments().addAll(annot.getSegments()); if (annotsIterator.hasNext()) { nextAnnot = annotsIterator.next(); while ((nextAnnot != null) && Math.abs(annot.rpkm - nextAnnot.rpkm) <= rpkmDiffThreshold) { //Merge the two annotations newAnnot.getSegments().addAll(nextAnnot.getSegments()); if (annotsIterator.hasNext()) { nextAnnot = annotsIterator.next(); } else { nextAnnot = null; } } } newAnnot.sortSegments(); mergedAnnots.add(newAnnot); annot = nextAnnot; nextAnnot = null; } //clean out the old list to preserve memory explosion annots.clear(); //Finally write the new annotations to the output file writeAnnotations(outputFile, mergedAnnots, true /*append*/); } //DONE System.out.println("time spent " + timer.toString()); } private void computeAnnotationRPKM(final Object2ObjectMap<String, ObjectList<AnnotationRPKM>> allAnnots, final DoubleIndexedIdentifier referenceIds, final AnnotationCount[] algs, final IntSet referencesToProcess, final int numAlignedReadsInSample) { for (final int referenceIndex : referencesToProcess) { final String chromosomeName = referenceIds.getId(referenceIndex).toString(); if (allAnnots.containsKey(chromosomeName)) { final ObjectList<AnnotationRPKM> annots = allAnnots.get(chromosomeName); algs[referenceIndex].sortReads(); algs[referenceIndex].getBaseCounter().accumulate(); algs[referenceIndex].getBaseCounter().baseCount(); for (final AnnotationRPKM annot : annots) { final int start = annot.getStart(); final int end = annot.getEnd(); final double overlapReads = algs[referenceIndex] .countReadsPartiallyOverlappingWithInterval(start, end); final double rpkm = calculateRPKM(overlapReads, annot.getLength(), numAlignedReadsInSample); annot.rpkm += rpkm; } } } } private double calculateRPKM(final double readCountInt, final int annotLength, final double geneOverlapReads) { final double readCount = readCountInt; final double length = annotLength; // in bases final double sampleReadCount = geneOverlapReads; // in read return readCount / (length / 1000.0f) / (sampleReadCount / 1E6f); } /** * Read tab delimited annotation file including 6 columns : chromosome, strand, transcriptID, * segmentID, start, end each row is a segment, read each chromosome at a time. * * @param annotFile * @return * @throws IOException */ public Object2ObjectMap<String, ObjectList<AnnotationRPKM>> readAnnotations(final String annotFile) throws IOException { BufferedReader reader = null; final Object2ObjectMap<String, AnnotationRPKM> annots = new Object2ObjectOpenHashMap<String, AnnotationRPKM>(); try { reader = new BufferedReader(new FileReader(annotFile)); String line; final String header = reader.readLine(); System.out.println(header); while ((line = reader.readLine()) != null) { if (!line.startsWith("#")) { final String[] linearray = line.trim().split("\t"); final String chromosome = linearray[0]; // if(!chromosome.equalsIgnoreCase(chroName)) continue; final String strand = linearray[1]; final String transcriptID = linearray[2]; final String exonID = linearray[3]; final int segmentStart = Integer.parseInt(linearray[4]); final int segmentEnd = Integer.parseInt(linearray[5]); final Segment segment = new Segment(segmentStart, segmentEnd, exonID, strand); if (annots.containsKey(transcriptID)) { annots.get(transcriptID).addSegment(segment); } else { final AnnotationRPKM annot = new AnnotationRPKM(transcriptID, chromosome, strand, 0d); annot.addSegment(segment); annots.put(transcriptID, annot); } } } } finally { IOUtils.closeQuietly(reader); } //organize the Annotations to chromosome final Object2ObjectMap<String, ObjectList<AnnotationRPKM>> allAnnots = new Object2ObjectOpenHashMap<String, ObjectList<AnnotationRPKM>>(); for (final Object2ObjectMap.Entry<String, AnnotationRPKM> entry : annots.object2ObjectEntrySet()) { final AnnotationRPKM annotationRPKM = entry.getValue(); annotationRPKM.sortSegments(); final String chromosome = annotationRPKM.getChromosome(); if (allAnnots.containsKey(chromosome)) { allAnnots.get(chromosome).add(annotationRPKM); } else { final ObjectList<AnnotationRPKM> annotations = new ObjectArrayList<AnnotationRPKM>(); annotations.add(annotationRPKM); allAnnots.put(chromosome, annotations); } } return allAnnots; } public static void writeRPKM(final String outputFileName, final ObjectList<AnnotationRPKM> annotationList, final boolean append) { PrintWriter writer = null; final File outputFile = new File(outputFileName); try { if (!outputFile.exists()) { writer = new PrintWriter(outputFile); // Write the file header writer.write( "Chromosome_Name\tStrand\tPrimary_ID\tSecondary_ID\tTranscript_Start\tTranscript_End\tAverage_RPKM\n"); } else { writer = new PrintWriter(new FileOutputStream(outputFile, append)); } final ObjectListIterator<AnnotationRPKM> annotIterator = annotationList.listIterator(); while (annotIterator.hasNext()) { final AnnotationRPKM annotation = annotIterator.next(); annotation.write(writer); } } catch (FileNotFoundException fnfe) { System.err.println("Caught exception in writeAnnotations: " + fnfe.getMessage()); System.exit(1); } finally { IOUtils.closeQuietly(writer); } } public static void writeAnnotations(final String outputFileName, final ObjectList<Annotation> annotationList, final boolean append) { PrintWriter writer = null; final File outputFile = new File(outputFileName); try { if (!outputFile.exists()) { writer = new PrintWriter(outputFile); // Write the file header writer.write( "Chromosome_Name\tStrand\tPrimary_ID\tSecondary_ID\tTranscript_Start\tTranscript_End\n"); } else { writer = new PrintWriter(new FileOutputStream(outputFile, append)); } final ObjectListIterator<Annotation> annotIterator = annotationList.listIterator(); while (annotIterator.hasNext()) { final Annotation annotation = annotIterator.next(); annotation.write(writer); } } catch (FileNotFoundException fnfe) { System.err.println("Caught exception in writeAnnotations: " + fnfe.getMessage()); System.exit(1); } finally { IOUtils.closeQuietly(writer); } } /** * Main method. * * @param args command line args. * @throws com.martiansoftware.jsap.JSAPException * error parsing * @throws java.io.IOException error parsing or executing. */ public static void main(final String[] args) throws JSAPException, IOException { new AggregatePeaksByRPKMDifferenceMode().configure(args).execute(); } }