Java tutorial
/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                         Weill Medical College of Cornell University
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.algorithmic.data.DistinctIntValueCounterBitSet;
import edu.cornell.med.icb.goby.alignments.AlignmentReaderImpl;
import edu.cornell.med.icb.goby.alignments.AlignmentTooManyHitsReader;
import edu.cornell.med.icb.goby.alignments.Alignments;
import edu.cornell.med.icb.goby.alignments.EntryFlagHelper;
import edu.cornell.med.icb.goby.reads.Reads;
import edu.cornell.med.icb.goby.reads.ReadsReader;
import edu.cornell.med.icb.goby.util.FileExtensionHelper;
import edu.cornell.med.icb.identifier.IndexedIdentifier;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.math.stat.descriptive.SummaryStatistics;
import org.apache.commons.math.stat.descriptive.rank.Percentile;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;

/**
 * Display some basic statistics on a file in compact reads format.
 *
 * @author Kevin Dorff
 */
public class CompactFileStatsMode extends AbstractGobyMode {
    /**
     * The mode name.
     */
    private static final String MODE_NAME = "compact-file-stats";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION =
            "Display some basic statistics on compact-reads and compact-alignment files.";

    /**
     * The input files.
     */
    private final List<File> inputFiles = new LinkedList<File>();

    /**
     * The minimum read length across all files.
     */
    private int minReadLength = Integer.MAX_VALUE;

    /**
     * The maximum read length across all files.
     */
    private int maxReadLength = Integer.MIN_VALUE;

    /**
     * The minimum quality length across all files.
     */
    private int minQualityLength = Integer.MAX_VALUE;

    /**
     * The maximum quality length across all files.
     */
    private int maxQualityLength = Integer.MIN_VALUE;

    /**
     * The cumulative read length across all files.
     */
    private long cumulativeReadLength;

    /**
     * The number of reads across all files.
     */
    private long numberOfReads;

    /**
     * Whether or not to compute quantile information.
     */
    private boolean computeQuantiles;

    /**
     * Number of quantiles used to characterize the read length distribution.
     */
    private int numberOfQuantiles = 1;

    /**
     * Display verbose output.
     */
    private boolean verbose;

    /**
     * The output filename, or null if we should write to stdout.
     */
    private String outputFilename;

    List<Boolean> pairedSamples = null;

    /**
     * The actual writer to use to write the output.
     */
    private PrintStream stream;

    private String type;
    private boolean headerOnly;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws IOException   error parsing
     * @throws JSAPException error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);
        reset();

        final File[] inputFilesArray = jsapResult.getFileArray("input");
        outputFilename = jsapResult.getString("output");
        inputFiles.addAll(Arrays.asList(inputFilesArray));
        computeQuantiles = jsapResult.userSpecified("number-of-quantiles");
        numberOfQuantiles = jsapResult.getInt("number-of-quantiles", 1);
        verbose = jsapResult.getBoolean("verbose");
        type = jsapResult.getString("type");
        headerOnly = jsapResult.getBoolean("header-only");
        return this;
    }

    /**
     * Reset the input file lists and cumulative statistics to default values.
     */
    public void reset() {
        inputFiles.clear();
        minReadLength = Integer.MAX_VALUE;
        maxReadLength = Integer.MIN_VALUE;
        numberOfReads = 0;
        cumulativeReadLength = 0;
    }

    /**
     * Run the FileStats mode.
     *
     * @throws IOException error reading / writing
     */
    @Override
    public void execute() throws IOException {
        stream = null;
        try {
            stream = outputFilename == null ? System.out
                    : new PrintStream(new FileOutputStream(outputFilename));
            int numberOfFilesProcessed = 0;
            pairedSamples = new ArrayList<Boolean>(inputFiles.size());
            FileExtensionHelper.CompactFileType fileType;
            if (type == null) {
                for (File file : inputFiles) {
                    fileType = FileExtensionHelper.determineCompactFileType(file);
                    switch (fileType) {
                        case alignment:
                            describeCompactAlignment(file);
                            numberOfFilesProcessed++;
                            break;
                        case reads:
                            describeCompactReads(file);
                            numberOfFilesProcessed++;
                            break;
                        case unknown:
                        default:
                            System.err.println("Unknown file type: " + file);
                            break;
                    }
                }
            } else {
                fileType = FileExtensionHelper.CompactFileType.valueOf(type);
                for (final File file : inputFiles) {
                    switch (fileType) {
                        case alignment:
                            describeCompactAlignment(file);
                            numberOfFilesProcessed++;
                            break;
                        case reads:
                            describeCompactReads(file);
                            numberOfFilesProcessed++;
                            break;
                    }
                }
            }
            stream.println();
            stream.printf("Total number of files processed = %,d\n", numberOfFilesProcessed);
            stream.printf("All Compact-Reads files were paired-end = %s\n", isAllPairedSamples());
            stream.println();
        } finally {
            if (stream != System.out) {
                IOUtils.closeQuietly(stream);
            }
            stream = null;
        }
    }

    /**
     * Print statistics about an alignment file in the Goby compact form.
     *
     * @param file The file to display statistics about
     * @throws IOException if the file cannot be read
     */
    private void describeCompactAlignment(final File file) throws IOException {
        final String basename = AlignmentReaderImpl.getBasename(file.toString());
        stream.printf("Compact Alignment basename = %s%n", basename);

        final AlignmentReaderImpl reader = new AlignmentReaderImpl(basename);
        reader.readHeader();
        stream.println("Info from header:");
        stream.printf("Alignment written with Goby version=%s %n", reader.getGobyVersion());
        stream.printf("Alignment produced by aligner=%s version=%s %n",
                reader.getAlignerName(), reader.getAlignerVersion());
        stream.printf("Sorted: %b%n", reader.isSorted());
        stream.printf("Indexed: %b%n", reader.isIndexed());
        stream.printf("Number of target sequences = %,d%n", reader.getNumberOfTargets());
        final int[] targetLengthsFromHeader = reader.getTargetLength();
        stream.printf("Number of target length entries = %,d%n",
                ArrayUtils.getLength(reader.getTargetLength()));
        stream.printf("smallestSplitQueryIndex = %d%n", reader.getSmallestSplitQueryIndex());
        stream.printf("largestSplitQueryIndex = %d%n", reader.getLargestSplitQueryIndex());

        // simple statistics for target lengths
        final SummaryStatistics targetLengthStats = new SummaryStatistics();
        if (targetLengthsFromHeader != null) {
            for (final double d : targetLengthsFromHeader) {
                targetLengthStats.addValue(d);
            }
        }
        stream.printf("Min target length = %,d%n", (int) targetLengthStats.getMin());
        stream.printf("Max target length = %,d%n", (int) targetLengthStats.getMax());
        stream.printf("Mean target length = %,.2f%n", targetLengthStats.getMean());
        stream.println();

        stream.printf("Number of query sequences = %,d%n", reader.getNumberOfQueries());
        final SummaryStatistics queryLengthStats = new SummaryStatistics();
        stream.println("Query lengths stored in entries = " + reader.isQueryLengthStoredInEntries());
        stream.println("Constant query lengths = " + reader.isConstantQueryLengths());
        stream.printf("Has query identifiers = %s%n",
                reader.getQueryIdentifiers() != null && !reader.getQueryIdentifiers().isEmpty());
        final IndexedIdentifier targetIdentifiers = reader.getTargetIdentifiers();
        final boolean hasTargetIdentifiers = targetIdentifiers != null && !targetIdentifiers.isEmpty();
        stream.printf("Has target identifiers = %s%n", hasTargetIdentifiers);
        stream.printf("Has query index permutation = %s%n", reader.getQueryIndicesWerePermuted());
        stream.printf("Has query index occurrences = %s%n", reader.hasQueryIndexOccurrences());
        stream.printf("Has all read quality scores = %s%n", reader.getHasAllReadQualityScores());
        stream.printf("Has ambiguity = %s%n", reader.hasAmbiguity());

        if (verbose) {
            if (hasTargetIdentifiers) {
                for (Map.Entry<MutableString, Integer> entry : targetIdentifiers.entrySet()) {
                    stream.printf(" Target %s=%d with a length of %d%n",
                            entry.getKey(), entry.getValue(), targetLengthsFromHeader[entry.getValue()]);
                }
            } else {
                // No target identifiers are available, so report targets by index only.
                for (int targetIndex = 0; targetIndex < ArrayUtils.getLength(targetLengthsFromHeader); targetIndex++) {
                    stream.printf(" Target %d with a length of %d%n",
                            targetIndex, targetLengthsFromHeader[targetIndex]);
                }
            }
        }
        stream.println();

        if (reader.getReadOriginInfo().size() > 0) {
            stream.println("---- Read Origin Info ------");
            for (final Alignments.ReadOriginInfo info : reader.getReadOriginInfo().getPbList()) {
                stream.println("[");
                stream.print(info.toString());
                stream.println("]");
            }
        } else {
            stream.println("Alignment has no Read Origin Info/Read Groups");
        }
        if (headerOnly) {
            return;
        }

        // the query indices that aligned. Includes those observed in the too-many-hits (TMH) data.
        final DistinctIntValueCounterBitSet alignedQueryIndices = new DistinctIntValueCounterBitSet();
        describeAmbigousReads(basename, reader.getNumberOfQueries(), alignedQueryIndices);

        int maxQueryIndex = -1;
        int maxTargetIndex = -1;
        int numEntries = 0;
        long numLogicalAlignmentEntries = 0;
        long total = 0;
        double avgScore = 0;
        int sumNumVariations = 0;
        int numPaired = 0;
        int numProperlyPaired = 0;
        int numFirstInPair = 0;
        int numSecondInPair = 0;
        boolean hasSoftClips = false;

        for (final Alignments.AlignmentEntry entry : reader) {
            numberOfReads++;   // Across all files
            numEntries++;      // Across this file
            numLogicalAlignmentEntries += Math.max(entry.getMultiplicity(), 1);
            total += entry.getQueryAlignedLength();
            avgScore += entry.getScore();
            maxQueryIndex = Math.max(maxQueryIndex, entry.getQueryIndex());
            maxTargetIndex = Math.max(maxTargetIndex, entry.getTargetIndex());
            cumulativeReadLength += entry.getQueryAlignedLength();
            minReadLength = Math.min(minReadLength, entry.getQueryAlignedLength());
            maxReadLength = Math.max(maxReadLength, entry.getQueryAlignedLength());
            sumNumVariations += entry.getSequenceVariationsCount();
            alignedQueryIndices.observe(entry.getQueryIndex());
            hasSoftClips |= entry.hasSoftClippedBasesLeft();
            hasSoftClips |= entry.hasSoftClippedBasesRight();
            // check entry then header for the query length
            final double queryLength = entry.getQueryLength();
            queryLengthStats.addValue(queryLength);
            numPaired += EntryFlagHelper.isPaired(entry) ? 1 : 0;
            numProperlyPaired += EntryFlagHelper.isProperlyPaired(entry) ? 1 : 0;
            numFirstInPair += EntryFlagHelper.isFirstInPair(entry) ? 1 : 0;
            numSecondInPair += EntryFlagHelper.isSecondInPair(entry) ? 1 : 0;
        }
        avgScore /= (double) numLogicalAlignmentEntries;

        final int numQuerySequences = reader.getNumberOfQueries();
        stream.printf("num query indices = %,d%n", numQuerySequences);
        final int numTargetSequences = maxTargetIndex + 1;
        final double avgNumVariationsPerQuery = ((double) sumNumVariations) / (double) numQuerySequences;
        stream.printf("num target indices = %,d%n", numTargetSequences);
        stream.printf("Number of alignment entries = %,d%n", numLogicalAlignmentEntries);
        stream.printf("Number of query indices that matched = %,d%n", alignedQueryIndices.count());
        stream.printf("Percent matched = %4.1f %% %n",
                (double) alignedQueryIndices.count() / (double) ((long) numQuerySequences) * 100.0d);
        stream.printf("Avg query alignment length = %,f%n", numEntries > 0 ? divide(total, numEntries) : -1);
        stream.printf("Avg score alignment = %f%n", avgScore);
        stream.printf("Avg number of variations per query sequence = %3.2f %n", avgNumVariationsPerQuery);

        // size, the number of bytes in the entries file.
        final long size = new File(basename + ".entries").length();
        stream.printf("Average bytes per entry = %f%n", divide(size, numLogicalAlignmentEntries));
        stream.printf("Min query length = %,d%n", (int) queryLengthStats.getMin());
        stream.printf("Max query length = %,d%n", (int) queryLengthStats.getMax());
        final double meanQueryLength = queryLengthStats.getMean();
        stream.printf("Mean query length = %,.2f%n", meanQueryLength);
        final int averageReadLength = (int) (Math.round(meanQueryLength));
        stream.printf("Average bits per read base, assuming average read length %d = %f%n",
                averageReadLength, divide(size, numLogicalAlignmentEntries * averageReadLength));
        stream.printf("Percent paired reads = %,.2f %% %n", divide(numPaired, numQuerySequences * 2) * 100d);
        stream.printf("Percent properly paired reads = %,.2f %% %n", divide(numProperlyPaired, numQuerySequences * 2) * 100d);
        stream.printf("Percent first in pair = %,.2f %% %n", divide(numFirstInPair, numEntries) * 100d);
        stream.printf("Percent second in pair = %,.2f %% %n", divide(numSecondInPair, numEntries) * 100d);
        stream.printf("Alignment entries have some softClips: %b %n", hasSoftClips);
    }

    private double divide(final long a, final long b) {
        return ((double) a) / (double) b;
    }

    private void describeAmbigousReads(final String basename, final double numReads,
                                       final DistinctIntValueCounterBitSet queryIndices) {
        try {
            final AlignmentTooManyHitsReader tmhReader = new AlignmentTooManyHitsReader(basename);
            queryIndices.observe(tmhReader.getQueryIndices());
            stream.printf("TMH: aligner threshold = %,d%n", tmhReader.getAlignerThreshold());
            stream.printf("TMH: number of ambiguous matches = %,d%n", tmhReader.getQueryIndices().size());
            stream.printf("TMH: %%ambiguous matches = %f %%%n",
                    (tmhReader.getQueryIndices().size() * 100f) / numReads);
        } catch (IOException e) {
            stream.println("Cannot read TMH file for basename " + basename);
        }
    }

    /**
     * Print statistics about a reads file in the Goby compact form.
     *
     * @param file The file to display statistics about
     * @throws IOException if the file cannot be read
     */
    private void describeCompactReads(final File file) throws IOException {
        stream.printf("Compact reads filename = %s%n", file);

        // keep the read lengths for computing quantiles
        final DoubleArrayList readLengths = new DoubleArrayList();

        int minLength = Integer.MAX_VALUE;
        int maxLength = Integer.MIN_VALUE;
        int numberOfIdentifiers = 0;
        int numberOfDescriptions = 0;
        int numberOfSequences = 0;
        int numberOfSequencePairs = 0;
        int numberOfQualityScores = 0;
        int numberOfQualityScorePairs = 0;
        long totalReadLength = 0;
        long totalReadLengthPair = 0;
        final DistinctIntValueCounterBitSet allQueryIndices = new DistinctIntValueCounterBitSet();
        ReadsReader reader = null;
        boolean checkedForPaired = false;
        try {
            final long size = file.length();
            reader = new ReadsReader(FileUtils.openInputStream(file));
            for (final Reads.ReadEntry entry : reader) {
                final int readLength = entry.getReadLength();
                for (int i = 0; i < entry.getMetaDataCount(); i++) {
                    Reads.MetaData metaData = entry.getMetaData(i);
                    stream.printf("meta-data key=%s value=%s%n", metaData.getKey(), metaData.getValue());
                }

                // across this file
                allQueryIndices.observe(entry.getReadIndex());
                totalReadLength += readLength;
                totalReadLengthPair += entry.getReadLengthPair();

                // across all files
                numberOfReads++;
                numberOfDescriptions += entry.hasDescription() ? 1 : 0;
                cumulativeReadLength += readLength;
                if (verbose && entry.hasDescription()) {
                    stream.println("Description found: " + entry.getDescription());
                }
                numberOfIdentifiers += entry.hasReadIdentifier() ? 1 : 0;
                if (verbose && entry.hasReadIdentifier()) {
                    stream.printf("Identifier found: %s / size=%,d%n", entry.getReadIdentifier(), readLength);
                }
                numberOfSequences += entry.hasSequence() && !entry.getSequence().isEmpty() ? 1 : 0;
                final boolean samplePaired = entry.hasSequencePair() && !entry.getSequencePair().isEmpty();
                if (samplePaired) {
                    numberOfSequencePairs += 1;
                }
                if (!checkedForPaired) {
                    // Check only the very first entry.
                    checkedForPaired = true;
                    pairedSamples.add(samplePaired);
                }
                if (entry.hasQualityScores() && !entry.getQualityScores().isEmpty()) {
                    numberOfQualityScores += 1;
                    final int qualityLength = entry.getQualityScores().size();
                    minQualityLength = Math.min(minQualityLength, qualityLength);
                    maxQualityLength = Math.max(maxQualityLength, qualityLength);
                }
                numberOfQualityScorePairs += entry.hasQualityScoresPair() && !entry.getQualityScoresPair().isEmpty() ? 1 : 0;

                // we only need to keep all the read lengths if quantiles are being computed
                if (computeQuantiles) {
                    readLengths.add(readLength);
                }
                minLength = Math.min(minLength, readLength);
                maxLength = Math.max(maxLength, readLength);

                // adjust the min/max length across all files
                minReadLength = Math.min(minReadLength, readLength);
                maxReadLength = Math.max(maxReadLength, readLength);
            }
            stream.printf("Average bytes per entry: %f%n", divide(size, allQueryIndices.count()));
            stream.printf("Average bytes per base: %f%n", divide(size, cumulativeReadLength));
        } finally {
            if (reader != null) {
                reader.close();
            }
        }

        final int numReadEntries = allQueryIndices.count();
        stream.printf("Has identifiers = %s (%,d) %n", numberOfIdentifiers > 0, numberOfIdentifiers);
        stream.printf("Has descriptions = %s (%,d) %n", numberOfDescriptions > 0, numberOfDescriptions);
        stream.printf("Has sequences = %s (%,d) %n", numberOfSequences > 0, numberOfSequences);
        stream.printf("Has sequencePairs = %s (%,d) %n", numberOfSequencePairs > 0, numberOfSequencePairs);
        stream.printf("Has quality scores = %s (%,d) %n", numberOfQualityScores > 0, numberOfQualityScores);
        stream.printf("Has quality score Pairs = %s (%,d) %n", numberOfQualityScorePairs > 0, numberOfQualityScorePairs);
        stream.printf("Number of entries = %,d%n", numReadEntries);
        stream.printf("Min read length = %,d%n", numReadEntries > 0 ? minLength : 0);
        stream.printf("Max read length = %,d%n", numReadEntries > 0 ? maxLength : 0);
        stream.printf("Min quality length = %,d%n", numberOfQualityScores > 0 ? minQualityLength : 0);
        stream.printf("Max quality length = %,d%n", numberOfQualityScores > 0 ? maxQualityLength : 0);
        stream.printf("Avg read length = %,d%n", numReadEntries > 0 ? totalReadLength / numReadEntries : 0);
        stream.printf("Avg read pair length = %,d%n", numReadEntries > 0 ? totalReadLengthPair / numReadEntries : 0);

        // compute quantiles
        if (computeQuantiles) {
            final Percentile percentile = new Percentile();
            final double[] increasingReadLengths = readLengths.toDoubleArray();
            Arrays.sort(increasingReadLengths);
            stream.printf("Read length quantiles = [ ");
            for (int quantile = 1; quantile < numberOfQuantiles + 1; quantile++) {
                stream.printf("%,f ", percentile.evaluate(increasingReadLengths, quantile));
            }
            stream.printf("]%n");
        }
    }

    /**
     * Get the maximum read length across all files processed so far.
     *
     * @return The maximum read length
     */
    public int getMaxReadLength() {
        return maxReadLength;
    }

    /**
     * Get the minimum read length across all files processed so far.
     *
     * @return The minimum read length
     */
    public int getMinReadLength() {
        return minReadLength;
    }

    /**
     * Get whether all the scanned files contain paired samples (only checks the FIRST sample in each file).
     *
     * @return whether all the files contained paired samples
     */
    public boolean isAllPairedSamples() {
        boolean allPairedSamples = true;
        if (pairedSamples == null || pairedSamples.size() == 0) {
            allPairedSamples = false;
        } else {
            for (boolean pairedSample : pairedSamples) {
                if (!pairedSample) {
                    allPairedSamples = false;
                    break;
                }
            }
        }
        return allPairedSamples;
    }

    /**
     * A list of which of the scanned samples were paired and which weren't. Note that
     * the pairing scan only checks the FIRST sample in the reads file.
     *
     * @return the list of which scanned samples were paired (one per file scanned).
     */
    public List<Boolean> getPairedSamples() {
        return pairedSamples;
    }

    /**
     * Get the cumulative length of the reads processed so far.
     *
     * @return The cumulative read length.
     */
    public long getCumulativeReadLength() {
        return cumulativeReadLength;
    }

    /**
     * Get the number of reads processed so far.
     *
     * @return The total number of reads.
     */
    public long getNumberOfReads() {
        return numberOfReads;
    }

    /**
     * Get the list of files (reads/alignments) to process.
     *
     * @return The list of files.
     */
    public List<File> getInputFiles() {
        return inputFiles;
    }

    /**
     * Add the specified file to the list of files to process.
     *
     * @param inputFile The file to process
     */
    public void addInputFile(final File inputFile) {
        inputFiles.add(inputFile);
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws JSAPException       error parsing
     * @throws java.io.IOException error parsing or executing.
     */
    public static void main(final String[] args) throws JSAPException, IOException {
        new CompactFileStatsMode().configure(args).execute();
    }
}
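The class above is normally driven through its own main method, where configure(args) parses the JSAP options seen in the listing (input, output, number-of-quantiles, verbose, type, header-only) before execute() runs. It can also be used directly from Java. The sketch below is a minimal, hypothetical example of such programmatic use: it relies only on the public methods defined in the listing (addInputFile, execute, and the cumulative-statistics getters), and the input path example.compact-reads is a placeholder, not a file that ships with Goby.

import edu.cornell.med.icb.goby.modes.CompactFileStatsMode;

import java.io.File;
import java.io.IOException;

// Minimal usage sketch, assuming the Goby jars are on the classpath and that
// "example.compact-reads" is replaced with a real compact-reads file.
public class CompactFileStatsExample {
    public static void main(final String[] args) throws IOException {
        final CompactFileStatsMode statsMode = new CompactFileStatsMode();
        statsMode.addInputFile(new File("example.compact-reads")); // hypothetical input file
        statsMode.execute(); // writes the per-file statistics to stdout

        // Cumulative statistics accumulated across all files processed so far.
        System.out.printf("reads=%d, minLength=%d, maxLength=%d, cumulativeLength=%d%n",
                statsMode.getNumberOfReads(),
                statsMode.getMinReadLength(),
                statsMode.getMaxReadLength(),
                statsMode.getCumulativeReadLength());
    }
}

Because execute() falls back to sensible defaults (auto-detection of the file type, output to stdout, no quantiles), calling addInputFile followed by execute is enough for a quick inspection; use configure(args) when you need the full set of command line options.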