Java tutorial
/* * Copyright (C) 2009-2011 Institute for Computational Biomedicine, * Weill Medical College of Cornell University * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package edu.cornell.med.icb.goby.modes; import edu.cornell.med.icb.goby.R.FisherExact; import edu.cornell.med.icb.goby.R.GobyRengine; import edu.cornell.med.icb.goby.algorithmic.data.GroupComparison; import edu.cornell.med.icb.goby.alignments.*; import edu.cornell.med.icb.goby.readers.vcf.ColumnType; import edu.cornell.med.icb.goby.reads.RandomAccessSequenceInterface; import edu.cornell.med.icb.goby.stats.AbstractOutputFormat; import edu.cornell.med.icb.goby.stats.DifferentialExpressionAnalysis; import edu.cornell.med.icb.goby.stats.DifferentialExpressionCalculator; import edu.cornell.med.icb.goby.stats.VCFWriter; import edu.cornell.med.icb.goby.util.OutputInfo; import it.unimi.dsi.fastutil.ints.IntArraySet; import it.unimi.dsi.fastutil.ints.IntSet; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.lang.MutableString; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.rosuda.JRI.Rengine; import java.util.ArrayList; import java.util.Arrays; /** * A Variant Call Format output to compare genomic variations across groups. This format implements a fisher exact * test that compares the number of samples in each group that have each possible allele. This is the usual allelic * test of population genetics. Formally, we consider the matrix: * group-1 group-2 * allele=A C1A C2A * allele=C * allele=G * allele=T * allele=N C1N C2N * <p/> * and estimate the chance that the number of samples in this N x 2 matrix are non-randomly distributed. The Cij in * the matrix are the number of samples in each group i that have at least one count for the j allele. We use N to * indicate any non base allele (such as deletion in the reference N='-'). * * @author Fabien Campagne * Date: April 4 2011 * Time: 2:38:13 AM */ public class CompareGroupsVCFOutputFormat extends AbstractOutputFormat implements SequenceVariationOutputFormat { /** * Used to log debug and informational messages. */ private static final Log LOG = LogFactory.getLog(CompareGroupsVCFOutputFormat.class); private VCFWriter statWriter; private int refIdColumnIndex; private int positionColumnIndex; private boolean fisherRInstalled; private String[] groups; private String[] samples; private ObjectArrayList<ReadIndexStats> readIndexStats; private int log2OddsRatioColumnIndex[]; private int fisherExactPValueColumnIndex[]; private int numberOfGroups; private DifferentialExpressionAnalysis deAnalyzer; private DifferentialExpressionCalculator deCalculator; private int log2OddsRatioStandardErrorColumnIndex[]; private int log2OddsRatioZColumnIndex[]; private int[] readerIndexToGroupIndex; private int[] refCountsPerGroup; private int[] variantsCountPerGroup; private int[] distinctReadIndexCountPerGroup; private IntSet[] distinctReadIndicesCountPerGroup; private float[] averageVariantQualityScorePerGroup; private int[] variantsCountPerSample; private int[] refCountsPerSample; private int numberOfSamples; private int biomartFieldIndex; private GenotypesOutputFormat genotypeFormatter; private int depthFieldIndex; private int varCountsIndex[]; private int refCountsIndex[]; /** * The array used for testing, pointing to pre-allocated or dynamically allocated version. Allele count is defined * as the number of times a specific allele is observed in a group of samples (each sample can contribute at most * one to the count). */ private int[][] alleleCountsPerGroup; // pre-allocated array, when the site has no indels: private int[][] alleleCountsPerGroupBaseAllelesOnly; // pre-allocated vector, when the site has no indels: private int[] vectorBaseAllelesOnly; /** * The vector used for testing, pointing to pre-allocated or dynamically allocated version: */ private int[] fisherVector; /** * The number of alleles, for sites that have no indels: */ private final int numberOfAlleles = SampleCountInfo.BASE_MAX_INDEX; /** * The number of allele observed in group for the allele with most counts in the group. For instance if counts are * group0 group1 * allele=A 0 0 * allele=C 10 0 * allele=T 2 0 * allele=G 4 15 * allele=N 3 2 * <p/> * alleleCountGroupMax will contain {10,15} */ private int[] alleleCountGroupMax; /** * The vector that will be used for testing. */ private long[] fisherVectorUnderTest; /** * Index of the VCF fields that hold allele counts. */ private int[] alleleCountsGroupIndex; private MutableString[] alleleCountsMutableStrings; protected void setStatWriter(VCFWriter statWriter) { this.statWriter = statWriter; } ArrayList<GroupComparison> groupComparisons; public void defineColumns(final OutputInfo outputInfo, final DiscoverSequenceVariantsMode mode) { deAnalyzer = mode.getDiffExpAnalyzer(); deCalculator = mode.getDiffExpCalculator(); groups = mode.getGroups(); samples = mode.getSamples(); readerIndexToGroupIndex = mode.getReaderIndexToGroupIndex(); final ObjectArrayList<ReadIndexStats> readIndexStats = mode.getReadIndexStats(); this.statWriter = new VCFWriter(outputInfo.getPrintWriter()); //activate R only if we need it: final Rengine rEngine = GobyRengine.getInstance().getRengine(); fisherRInstalled = rEngine != null && rEngine.isAlive(); if (groups.length < 1) { System.err.println("CompareGroupsVCFOutputFormat requires at least one group."); System.exit(1); } groupComparisons = mode.getGroupComparisons(); this.readIndexStats = readIndexStats; log2OddsRatioColumnIndex = new int[groupComparisons.size()]; log2OddsRatioStandardErrorColumnIndex = new int[groupComparisons.size()]; log2OddsRatioZColumnIndex = new int[groupComparisons.size()]; fisherExactPValueColumnIndex = new int[groupComparisons.size()]; numberOfGroups = groups.length; biomartFieldIndex = statWriter.defineField("INFO", "BIOMART_COORDS", 1, ColumnType.String, "Coordinates for use with Biomart.", "biomart"); genotypeFormatter.defineInfoFields(statWriter); for (GroupComparison comparison : groupComparisons) { log2OddsRatioColumnIndex[comparison.index] = statWriter.defineField("INFO", String.format("LOD[%s/%s]", comparison.nameGroup1, comparison.nameGroup2), 1, ColumnType.Float, String.format("Log2 of the odds-ratio of observing a variant in group %s versus group %s", comparison.nameGroup1, comparison.nameGroup2)); log2OddsRatioStandardErrorColumnIndex[comparison.index] = statWriter.defineField("INFO", String.format("LOD_SE[%s/%s]", comparison.nameGroup1, comparison.nameGroup2), 1, ColumnType.Float, String.format("Standard Error of the log2 of the odds-ratio between group %s and group %s", comparison.nameGroup1, comparison.nameGroup2)); log2OddsRatioZColumnIndex[comparison.index] = statWriter.defineField("INFO", String.format("LOD_Z[%s/%s]", comparison.nameGroup1, comparison.nameGroup2), 1, ColumnType.Float, String.format("Z value of the odds-ratio of observing a variant in group %s versus group %s", comparison.nameGroup1, comparison.nameGroup2)); fisherExactPValueColumnIndex[comparison.index] = statWriter.defineField("INFO", String.format("FisherP[%s/%s]", comparison.nameGroup1, comparison.nameGroup2), 1, ColumnType.Float, String.format( "Fisher exact P-value that the allelic frequencies (each sample contributes at least one toward the group count of each allele) differ between group %s and group %s.", comparison.nameGroup1, comparison.nameGroup2)); } for (int i = 0; i < numberOfGroups; i++) { alleleCountsGroupIndex[i] = statWriter.defineField("INFO", String.format("AlleleCountsInGroup[%s]", groups[i]), 1, ColumnType.String, String.format( "Count of specific alleles in group %s, in the format (allele=count[,])+. Allele count is defined " + " as the number of times a specific allele is observed in a group of samples (each sample can contribute at most" + " one to the count)", groups[i])); } varCountsIndex = new int[groups.length]; refCountsIndex = new int[groups.length]; for (int groupIndex = 0; groupIndex < groups.length; groupIndex++) { refCountsIndex[groupIndex] = statWriter.defineField("INFO", String.format("refCountGroup[%s]", groups[groupIndex]), 1, ColumnType.Integer, String.format("Number of reference allele called in group %s.", groups[groupIndex])); varCountsIndex[groupIndex] = statWriter.defineField("INFO", String.format("varCountGroup[%s]", groups[groupIndex]), 1, ColumnType.Integer, String.format("Number of variant allele(s) called in group %s.", groups[groupIndex])); } depthFieldIndex = statWriter.defineField("INFO", "DP", 1, ColumnType.Integer, "Total depth of sequencing across groups at this site"); statWriter.defineSamples(samples); genotypeFormatter.defineGenotypeField(statWriter); statWriter.writeHeader(); } @Override public void allocateStorage(final int numberOfSamples, final int numberOfGroups) { this.numberOfGroups = numberOfGroups; this.numberOfSamples = numberOfSamples; refCountsPerGroup = new int[numberOfGroups]; variantsCountPerGroup = new int[numberOfGroups]; distinctReadIndexCountPerGroup = new int[numberOfGroups]; averageVariantQualityScorePerGroup = new float[numberOfGroups]; alleleCountsGroupIndex = new int[numberOfGroups]; alleleCountsMutableStrings = new MutableString[numberOfGroups]; for (int i = 0; i < numberOfGroups; i++) { alleleCountsMutableStrings[i] = new MutableString(); } // used for allelic fisher exact test: alleleCountsPerGroupBaseAllelesOnly = new int[numberOfAlleles][numberOfGroups]; vectorBaseAllelesOnly = new int[numberOfAlleles * numberOfGroups]; refCountsPerSample = new int[numberOfSamples]; variantsCountPerSample = new int[numberOfSamples]; distinctReadIndicesCountPerGroup = new IntSet[numberOfGroups]; for (int i = 0; i < numberOfGroups; i++) { distinctReadIndicesCountPerGroup[i] = new IntArraySet(); } genotypeFormatter = new GenotypesOutputFormat(); genotypeFormatter.allocateStorage(numberOfSamples, numberOfGroups); } public void writeRecord(final DiscoverVariantIterateSortedAlignments iterator, final SampleCountInfo[] sampleCounts, final int referenceIndex, int position, final DiscoverVariantPositionData list, final int groupIndexA, final int groupIndexB) { // report 1-based positions position = position + 1; int totalCount = 0; int maxGenotypeIndexAcrossSamples = 0; for (int i = 0; i < numberOfGroups; i++) { alleleCountsMutableStrings[i].setLength(0); } for (int sampleIndex = 0; sampleIndex < numberOfSamples; sampleIndex++) { final SampleCountInfo sci = sampleCounts[sampleIndex]; int sumInSample = 0; //for (final int baseCount : sci.counts) { final int maxGenotypeIndex = sci.getGenotypeMaxIndex(); maxGenotypeIndexAcrossSamples = Math.max(maxGenotypeIndex, maxGenotypeIndexAcrossSamples); for (int genotypeIndex = 0; genotypeIndex < maxGenotypeIndex; genotypeIndex++) { final int genotypeCount = sci.getGenotypeCount(genotypeIndex); totalCount += genotypeCount; sumInSample += genotypeCount; assert genotypeCount >= 0 : "counts must not be negative."; } // must observe at least one base in each sample to write output for this position. if (sumInSample == 0) { return; } } if (totalCount == 0) { return; } statWriter.setInfo(depthFieldIndex, totalCount); fillVariantCountArrays(sampleCounts, maxGenotypeIndexAcrossSamples); // since genotypes have been aligned across all samples, we use the first one to find the genotype string // corresponding to each index: final int sampleIndex = 0; final SampleCountInfo sci = sampleCounts[sampleIndex]; for (int groupIndex = 0; groupIndex < numberOfGroups; groupIndex++) { for (int genotypeIndex = 0; genotypeIndex < maxGenotypeIndexAcrossSamples; genotypeIndex++) { // build the alleleCount string for each group: alleleCountsMutableStrings[groupIndex].append(sci.getGenotypeString(genotypeIndex)); alleleCountsMutableStrings[groupIndex].append('='); alleleCountsMutableStrings[groupIndex].append(alleleCountsPerGroup[genotypeIndex][groupIndex]); alleleCountsMutableStrings[groupIndex].append(','); } } final CharSequence currentReferenceId = iterator.getReferenceId(referenceIndex); statWriter.setChromosome(currentReferenceId); statWriter.setPosition(position); statWriter.setReferenceAllele(Character.toString(sampleCounts[0].referenceBase)); // construct a biomart region span in the format chr:pos1:chr:pos final String biomartRegionSpan = String.format("%s:%s:%s", currentReferenceId, position, position); statWriter.setInfo(biomartFieldIndex, biomartRegionSpan); for (int groupIndex = 0; groupIndex < numberOfGroups; groupIndex++) { // clip the trailing ',' alleleCountsMutableStrings[groupIndex].setLength(alleleCountsMutableStrings[groupIndex].length() - 1); // set the value to the VCFWriter: statWriter.setInfo(alleleCountsGroupIndex[groupIndex], alleleCountsMutableStrings[groupIndex]); statWriter.setInfo(refCountsIndex[groupIndex], refCountsPerGroup[groupIndex]); statWriter.setInfo(varCountsIndex[groupIndex], variantsCountPerGroup[groupIndex]); } for (final GroupComparison comparison : groupComparisons) { final int indexGroup1 = comparison.indexGroup1; final int indexGroup2 = comparison.indexGroup1; final double denominator = (double) (refCountsPerGroup[indexGroup1] + 1) * (double) (variantsCountPerGroup[indexGroup2] + 1); double oddsRatio = denominator == 0 ? Double.NaN : ((double) (refCountsPerGroup[indexGroup2] + 1) * (double) (variantsCountPerGroup[indexGroup1] + 1)) / denominator; final double logOddsRatioSE; if (variantsCountPerGroup[indexGroup1] < 10 || variantsCountPerGroup[indexGroup2] < 10 || refCountsPerGroup[indexGroup1] < 10 || refCountsPerGroup[indexGroup2] < 10) { // standard error estimation is unreliable when any of the counts are less than 10. logOddsRatioSE = Double.NaN; } else { logOddsRatioSE = Math .sqrt(1d / refCountsPerGroup[indexGroup2] + 1d / variantsCountPerGroup[indexGroup1] + 1d / variantsCountPerGroup[indexGroup2] + 1d / refCountsPerGroup[indexGroup1]); } double log2OddsRatio = Math.log(oddsRatio) / Math.log(2); final double log2OddsRatioZValue = log2OddsRatio / logOddsRatioSE; double fisherP = Double.NaN; if (fisherRInstalled) { updateFisherVector(maxGenotypeIndexAcrossSamples, comparison); if (checkCounts()) { final FisherExact.Result result = FisherExact.fexact(fisherVector, maxGenotypeIndexAcrossSamples, 2, FisherExact.AlternativeHypothesis.twosided, true); fisherP = result.getPValue(); /* // print the counts and p-value: final IntArrayList wrap = IntArrayList.wrap(fisherVector); System.out.printf("fisherVector %s/%s%n" + "[0-%d] %s%n" + "[%d-%d] %s%n" + "p-value= %g%n%n", comparison.nameGroup1, comparison.nameGroup2, maxGenotypeIndexAcrossSamples, wrap.subList(0, maxGenotypeIndexAcrossSamples), maxGenotypeIndexAcrossSamples + 1, fisherVector.length, wrap.subList(maxGenotypeIndexAcrossSamples, wrap.size()), fisherP);*/ } } statWriter.setInfo(log2OddsRatioColumnIndex[comparison.index], log2OddsRatio); statWriter.setInfo(log2OddsRatioStandardErrorColumnIndex[comparison.index], logOddsRatioSE); statWriter.setInfo(log2OddsRatioZColumnIndex[comparison.index], log2OddsRatioZValue); statWriter.setInfo(fisherExactPValueColumnIndex[comparison.index], fisherP); } genotypeFormatter.writeGenotypes(statWriter, sampleCounts, position); statWriter.writeRecord(); } int[] cacheFisherVector; private void updateFisherVector(int maxGenotypeIndexAcrossSamples, GroupComparison comparison) { if (maxGenotypeIndexAcrossSamples == SampleCountInfo.BASE_MAX_INDEX) { // reuse the vector if we have only SNPs at that site: if (cacheFisherVector == null) { cacheFisherVector = new int[SampleCountInfo.BASE_MAX_INDEX * 2]; } fisherVector = cacheFisherVector; Arrays.fill(fisherVector, 0); } else { // allocate a new custom-size array if the site has indels: fisherVector = new int[maxGenotypeIndexAcrossSamples * 2]; } // reorder allelic counts for fisher exact test for this specific pair: int j = 0; for (int genotypeIndex = 0; genotypeIndex < maxGenotypeIndexAcrossSamples; genotypeIndex++) { fisherVector[j] = alleleCountsPerGroup[genotypeIndex][comparison.indexGroup1]; ++j; } for (int genotypeIndex = 0; genotypeIndex < maxGenotypeIndexAcrossSamples; genotypeIndex++) { fisherVector[j] = alleleCountsPerGroup[genotypeIndex][comparison.indexGroup2]; ++j; } } public void close() { statWriter.close(); } @Override public void setGenome(RandomAccessSequenceInterface genome) { } @Override public void setGenomeReferenceIndex(int index) { } private void fillVariantCountArrays(final SampleCountInfo[] sampleCounts, int maxGenotypeIndexAcrossSamples) { Arrays.fill(variantsCountPerGroup, 0); Arrays.fill(refCountsPerGroup, 0); Arrays.fill(distinctReadIndexCountPerGroup, 0); if (maxGenotypeIndexAcrossSamples == SampleCountInfo.BASE_MAX_INDEX) { // reuse the vector if we have only SNPs at that site: alleleCountsPerGroup = alleleCountsPerGroupBaseAllelesOnly; for (int alleleIndex = 0; alleleIndex < numberOfAlleles; alleleIndex++) { Arrays.fill(alleleCountsPerGroup[alleleIndex], 0); } } else { // allocate a new custom-size array if the site has indels: alleleCountsPerGroup = new int[maxGenotypeIndexAcrossSamples][numberOfGroups]; for (int genotypeIndex = 0; genotypeIndex < maxGenotypeIndexAcrossSamples; genotypeIndex++) { alleleCountsPerGroup[genotypeIndex] = new int[numberOfGroups]; } } for (int groupIndex = 0; groupIndex < numberOfGroups; groupIndex++) { distinctReadIndicesCountPerGroup[groupIndex].clear(); } for (final SampleCountInfo csi : sampleCounts) { final int sampleIndex = csi.sampleIndex; variantsCountPerSample[sampleIndex] = csi.varCount; refCountsPerSample[sampleIndex] = csi.refCount; final int groupIndex = readerIndexToGroupIndex[sampleIndex]; variantsCountPerGroup[groupIndex] += csi.varCount; refCountsPerGroup[groupIndex] += csi.refCount; distinctReadIndicesCountPerGroup[groupIndex].addAll(csi.distinctReadIndices); for (int genotypeIndex = 0; genotypeIndex < maxGenotypeIndexAcrossSamples; genotypeIndex++) { final int alleleCount = csi.getGenotypeCount(genotypeIndex) > 0 ? 1 : 0; alleleCountsPerGroup[genotypeIndex][groupIndex] += alleleCount; } } //fisher.test(matrix(c(10,10,10,0,0,10,10,10,0,0),5,2), hybrid="TRUE") for (int groupIndex = 0; groupIndex < numberOfGroups; groupIndex++) { distinctReadIndexCountPerGroup[groupIndex] = distinctReadIndicesCountPerGroup[groupIndex].size(); } } private boolean checkCounts() { boolean ok = true; int totalCount = 0; // detect if any count is negative for (final int count : fisherVector) { if (count < 0) ok = false; totalCount += count; } // skip fisher if all counts are zero: if (totalCount == 0) return false; return ok; } }