Java tutorial
/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils.variant; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broad.tribble.TribbleException; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFConstants; import java.io.Serializable; import java.util.*; public class GATKVariantContextUtils { private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); public static final int DEFAULT_PLOIDY = 2; public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); public final static String MERGE_FILTER_PREFIX = "filterIn"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_INTERSECTION = "Intersection"; public enum GenotypeMergeType { /** * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. */ UNIQUIFY, /** * Take genotypes in priority order (see the priority argument). */ PRIORITIZE, /** * Take the genotypes in any order. */ UNSORTED, /** * Require that all samples/genotypes be unique between all inputs. */ REQUIRE_UNIQUE } public enum FilteredRecordMergeType { /** * Union - leaves the record if any record is unfiltered. */ KEEP_IF_ANY_UNFILTERED, /** * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. */ KEEP_IF_ALL_UNFILTERED, /** * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. */ KEEP_UNCONDITIONAL } public enum MultipleAllelesMergeType { /** * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. */ BY_TYPE, /** * Merge all allele types at the same start position into the same VCF record. */ MIX_TYPES } /** * Refactored out of the AverageAltAlleleLength annotation class * @param vc the variant context * @return the average length of the alt allele (a double) */ public static double getMeanAltAlleleLength(VariantContext vc) { double averageLength = 1.0; if (!vc.isSNP() && !vc.isSymbolic()) { // adjust for the event length int averageLengthNum = 0; int averageLengthDenom = 0; int refLength = vc.getReference().length(); for (Allele a : vc.getAlternateAlleles()) { int numAllele = vc.getCalledChrCount(a); int alleleSize; if (a.length() == refLength) { // SNP or MNP byte[] a_bases = a.getBases(); byte[] ref_bases = vc.getReference().getBases(); int n_mismatch = 0; for (int idx = 0; idx < a_bases.length; idx++) { if (a_bases[idx] != ref_bases[idx]) n_mismatch++; } alleleSize = n_mismatch; } else if (a.isSymbolic()) { alleleSize = 1; } else { alleleSize = Math.abs(refLength - a.length()); } averageLengthNum += alleleSize * numAllele; averageLengthDenom += numAllele; } averageLength = ((double) averageLengthNum) / averageLengthDenom; } return averageLength; } /** * create a genome location, given a variant context * @param genomeLocParser parser * @param vc the variant context * @return the genomeLoc */ public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser, VariantContext vc) { return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); } public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { if (!context.isSNP() || !context.isBiallelic()) throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); } /** * If this is a BiAllelic SNP, is it a transition? */ public static boolean isTransition(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; } /** * If this is a BiAllelic SNP, is it a transversion? */ public static boolean isTransversion(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; } public static boolean isTransition(Allele ref, Allele alt) { return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; } public static boolean isTransversion(Allele ref, Allele alt) { return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; } /** * Returns a context identical to this with the REF and ALT alleles reverse complemented. * * @param vc variant context * @return new vc */ public static VariantContext reverseComplement(VariantContext vc) { // create a mapping from original allele to reverse complemented allele HashMap<Allele, Allele> alleleMap = new HashMap<Allele, Allele>(vc.getAlleles().size()); for (Allele originalAllele : vc.getAlleles()) { Allele newAllele; if (originalAllele.isNoCall()) newAllele = originalAllele; else newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); alleleMap.put(originalAllele, newAllele); } // create new Genotype objects GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for (final Genotype genotype : vc.getGenotypes()) { List<Allele> newAlleles = new ArrayList<Allele>(); for (Allele allele : genotype.getAlleles()) { Allele newAllele = alleleMap.get(allele); if (newAllele == null) newAllele = Allele.NO_CALL; newAlleles.add(newAllele); } newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); } return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); } /** * Returns true iff VC is an non-complex indel where every allele represents an expansion or * contraction of a series of identical bases in the reference. * * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT * * If VC = -/CT, then this function returns true because the CT insertion matches exactly the * upcoming reference. * If VC = -/CTA then this function returns false because the CTA isn't a perfect match * * Now consider deletions: * * If VC = CT/- then again the same logic applies and this returns true * The case of CTA/- makes no sense because it doesn't actually match the reference bases. * * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For * each insertion allele of n bases, check if that allele matches the next n reference bases. * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, * as it must necessarily match the first n bases. If this test returns true for all * alleles you are a tandem repeat, otherwise you are not. * * @param vc * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference * @return */ @Requires({ "vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0" }) public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); if (!vc.isIndel()) // only indels are tandem repeats return false; final Allele ref = vc.getReference(); for (final Allele allele : vc.getAlternateAlleles()) { if (!isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad)) return false; } // we've passed all of the tests, so we are a repeat return true; } /** * * @param vc * @param refBasesStartingAtVCWithPad * @return */ @Requires({ "vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0" }) public static Pair<List<Integer>, byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { final boolean VERBOSE = false; final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); if (!vc.isIndel()) // only indels are tandem repeats return null; final Allele refAllele = vc.getReference(); final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); byte[] repeatUnit = null; final ArrayList<Integer> lengths = new ArrayList<Integer>(); for (final Allele allele : vc.getAlternateAlleles()) { Pair<int[], byte[]> result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); final int[] repetitionCount = result.first; // repetition count = 0 means allele is not a tandem expansion of context if (repetitionCount[0] == 0 || repetitionCount[1] == 0) return null; if (lengths.size() == 0) { lengths.add(repetitionCount[0]); // add ref allele length only once } lengths.add(repetitionCount[1]); // add this alt allele's length repeatUnit = result.second; if (VERBOSE) { System.out.println("RefContext:" + refBasesStartingAtVCWithoutPad); System.out.println("Ref:" + refAllele.toString() + " Count:" + String.valueOf(repetitionCount[0])); System.out.println("Allele:" + allele.toString() + " Count:" + String.valueOf(repetitionCount[1])); System.out.println("RU:" + new String(repeatUnit)); } } return new Pair<List<Integer>, byte[]>(lengths, repeatUnit); } public static Pair<int[], byte[]> getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext) { /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. */ byte[] longB; // find first repeat unit based on either ref or alt, whichever is longer if (altBases.length > refBases.length) longB = altBases; else longB = refBases; // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units // for example, -*,CACA needs to first be decomposed into (CA)2 final int repeatUnitLength = findRepeatedSubstring(longB); final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); final int[] repetitionCount = new int[2]; // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) int repetitionsInRef = findNumberofRepetitions(repeatUnit, refBases, true); repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true) - repetitionsInRef; repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true) - repetitionsInRef; return new Pair<int[], byte[]>(repetitionCount, repeatUnit); } /** * Find out if a string can be represented as a tandem number of substrings. * For example ACTACT is a 2-tandem of ACT, * but ACTACA is not. * * @param bases String to be tested * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't * be represented as one, it will be just the length of the input string) */ public static int findRepeatedSubstring(byte[] bases) { int repLength; for (repLength = 1; repLength <= bases.length; repLength++) { final byte[] candidateRepeatUnit = Arrays.copyOf(bases, repLength); boolean allBasesMatch = true; for (int start = repLength; start < bases.length; start += repLength) { // check that remaining of string is exactly equal to repeat unit final byte[] basePiece = Arrays.copyOfRange(bases, start, start + candidateRepeatUnit.length); if (!Arrays.equals(candidateRepeatUnit, basePiece)) { allBasesMatch = false; break; } } if (allBasesMatch) return repLength; } return repLength; } /** * Helper routine that finds number of repetitions a string consists of. * For example, for string ATAT and repeat unit AT, number of repetitions = 2 * @param repeatUnit Substring * @param testString String to test * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's */ public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { int numRepeats = 0; if (lookForward) { // look forward on the test string for (int start = 0; start < testString.length; start += repeatUnit.length) { int end = start + repeatUnit.length; byte[] unit = Arrays.copyOfRange(testString, start, end); if (Arrays.equals(unit, repeatUnit)) numRepeats++; else break; } return numRepeats; } // look backward. For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 // look forward on the test string for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { int end = start + repeatUnit.length; byte[] unit = Arrays.copyOfRange(testString, start, end); if (Arrays.equals(unit, repeatUnit)) numRepeats++; else break; } return numRepeats; } /** * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference * @param ref * @param alt * @param refBasesStartingAtVCWithoutPad * @return */ protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { if (!Allele.oneIsPrefixOfOther(ref, alt)) return false; // we require one allele be a prefix of another if (ref.length() > alt.length()) { // we are a deletion return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); } else { // we are an insertion return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); } } protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { final String potentialRepeat = l.substring(s.length()); // skip s bases for (int i = 0; i < minNumberOfMatches; i++) { final int start = i * potentialRepeat.length(); final int end = (i + 1) * potentialRepeat.length(); if (ref.length() < end) return false; // we ran out of bases to test final String refSub = ref.substring(start, end); if (!refSub.equals(potentialRepeat)) return false; // repeat didn't match, fail } return true; // we passed all tests, we matched } /** * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * * @param vc variant context with genotype likelihoods * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** * @param assignGenotypes true if we should update the genotypes based on the (subsetted) PLs * @return genotypes */ public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, final List<Allele> allelesToUse, final boolean assignGenotypes) { // the genotypes with PLs final GenotypesContext oldGTs = vc.getGenotypes(); // the new genotypes to create final GenotypesContext newGTs = GenotypesContext.create(); // optimization: if no input genotypes, just exit if (oldGTs.isEmpty()) return newGTs; // samples final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName(); // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); final int numNewAltAlleles = allelesToUse.size() - 1; // which PLs should be carried forward? ArrayList<Integer> likelihoodIndexesToUse = null; // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, // then we can keep the PLs as is; otherwise, we determine which ones to keep if (numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0) { likelihoodIndexesToUse = new ArrayList<Integer>(30); final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; for (int i = 0; i < numOriginalAltAlleles; i++) { if (allelesToUse.contains(vc.getAlternateAllele(i))) altAlleleIndexToUse[i] = true; } // numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY); for (int PLindex = 0; PLindex < numLikelihoods; PLindex++) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods .getAllelePair(PLindex); // consider this entry only if both of the alleles are good if ((alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1])) likelihoodIndexesToUse.add(PLindex); } } // create the new genotypes for (int k = 0; k < oldGTs.size(); k++) { final Genotype g = oldGTs.get(sampleIndices.get(k)); if (!g.hasLikelihoods()) { newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); continue; } // create the new likelihoods array from the alleles we are allowed to use final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); double[] newLikelihoods; if (likelihoodIndexesToUse == null) { newLikelihoods = originalLikelihoods; } else if (originalLikelihoods.length != expectedNumLikelihoods) { logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); newLikelihoods = null; } else { newLikelihoods = new double[likelihoodIndexesToUse.size()]; int newIndex = 0; for (int oldIndex : likelihoodIndexesToUse) newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; // might need to re-normalize newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); } // if there is no mass on the (new) likelihoods, then just no-call the sample if (newLikelihoods != null && MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL) { newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); } else { final GenotypeBuilder gb = new GenotypeBuilder(g); if (newLikelihoods == null || numNewAltAlleles == 0) gb.noPL(); else gb.PL(newLikelihoods); // if we weren't asked to assign a genotype, then just no-call the sample if (!assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL) { gb.alleles(NO_CALL_ALLELES); } else { // find the genotype with maximum likelihoods int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods .getAllelePair(PLindex); gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); if (numNewAltAlleles != 0) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); } newGTs.add(gb.make()); } } return newGTs; } /** * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs * * @param vc variant context with genotype likelihoods * @return genotypes context */ public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { return subsetDiploidAlleles(vc, vc.getAlleles(), true); } /** * Split variant context into its biallelic components if there are more than 2 alleles * * For VC has A/B/C alleles, returns A/B and A/C contexts. * Genotypes are all no-calls now (it's not possible to fix them easily) * Alleles are right trimmed to satisfy VCF conventions * * If vc is biallelic or non-variant it is just returned * * Chromosome counts are updated (but they are by definition 0) * * @param vc a potentially multi-allelic variant context * @return a list of bi-allelic (or monomorphic) variant context */ public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc) { return splitVariantContextToBiallelics(vc, false); } /** * Split variant context into its biallelic components if there are more than 2 alleles * * For VC has A/B/C alleles, returns A/B and A/C contexts. * Genotypes are all no-calls now (it's not possible to fix them easily) * Alleles are right trimmed to satisfy VCF conventions * * If vc is biallelic or non-variant it is just returned * * Chromosome counts are updated (but they are by definition 0) * * @param vc a potentially multi-allelic variant context * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome * @return a list of bi-allelic (or monomorphic) variant context */ public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft) { if (!vc.isVariant() || vc.isBiallelic()) // non variant or biallelics already satisfy the contract return Collections.singletonList(vc); else { final List<VariantContext> biallelics = new LinkedList<VariantContext>(); for (final Allele alt : vc.getAlternateAlleles()) { VariantContextBuilder builder = new VariantContextBuilder(vc); final List<Allele> alleles = Arrays.asList(vc.getReference(), alt); builder.alleles(alleles); builder.genotypes(subsetDiploidAlleles(vc, alleles, false)); VariantContextUtils.calculateChromosomeCounts(builder, true); final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); biallelics.add(trimmed); } return biallelics; } } public static Genotype removePLsAndAD(final Genotype g) { return (g.hasLikelihoods() || g.hasAD()) ? new GenotypeBuilder(g).noPL().noAD().make() : g; } /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name * * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean annotateOrigin, final boolean printMessages, final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC) { int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); } /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name. * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. * * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ * * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean annotateOrigin, final boolean printMessages, final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC) { if (unsortedVCs == null || unsortedVCs.size() == 0) return null; if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) throw new IllegalArgumentException( "the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); if (annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) throw new IllegalArgumentException( "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); final List<VariantContext> preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary final List<VariantContext> VCs = new ArrayList<VariantContext>(); for (final VariantContext vc : preFilteredVCs) { if (!filteredAreUncalled || vc.isNotFiltered()) VCs.add(vc); } if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC final VariantContext first = VCs.get(0); final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); final Set<Allele> alleles = new LinkedHashSet<Allele>(); final Set<String> filters = new HashSet<String>(); final Map<String, Object> attributes = new LinkedHashMap<String, Object>(); final Set<String> inconsistentAttributes = new HashSet<String>(); final Set<String> variantSources = new HashSet<String>(); // contains the set of sources we found in our set of VCs that are variant final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id VariantContext longestVC = first; int depth = 0; int maxAC = -1; final Map<String, Object> attributesWithMaxAC = new LinkedHashMap<String, Object>(); double log10PError = CommonInfo.NO_LOG10_PERROR; VariantContext vcWithMaxAC = null; GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs int nFiltered = 0; boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches for (final VariantContext vc : VCs) { if (longestVC.getStart() != vc.getStart()) throw new IllegalStateException( "BUG: attempting to merge VariantContexts with different start sites: first=" + first.toString() + " second=" + vc.toString()); if (VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC)) longestVC = vc; // get the longest location nFiltered += vc.isFiltered() ? 1 : 0; if (vc.isVariant()) variantSources.add(vc.getSource()); AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); remapped = remapped || alleleMapping.needsRemapping(); alleles.addAll(alleleMapping.values()); mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); // We always take the QUAL of the first VC with a non-MISSING qual for the combined value if (log10PError == CommonInfo.NO_LOG10_PERROR) log10PError = vc.getLog10PError(); filters.addAll(vc.getFilters()); // // add attributes // // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if (vc.hasID()) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List<String> alleleCountArray = Arrays .asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1) .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); for (String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } else { final int ac = Integer.valueOf(rawAlleleCounts); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if (!inconsistentAttributes.contains(key)) { final boolean alreadyFound = attributes.containsKey(key); final Object boundValue = attributes.get(key); final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) { // we found the value but we're inconsistent, put it in the exclude list //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); inconsistentAttributes.add(key); attributes.remove(key); } else if (!alreadyFound || boundIsMissingValue) { // no value //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); attributes.put(key, p.getValue()); } } } } // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD for (final VariantContext vc : VCs) { if (vc.getAlleles().size() == 1) continue; if (hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { if (!genotypes.isEmpty()) { logger.debug(String.format( "Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. single=%s", vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); } genotypes = stripPLsAndAD(genotypes); // this will remove stale AC,AF attributed from vc VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); break; } } // take the VC with the maxAC and pull the attributes into a modifiable map if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); } // if at least one record was unfiltered and we want a union, clear all of the filters if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear(); if (annotateOrigin) { // we care about where the call came from String setValue; if (nFiltered == 0 && variantSources.size() == originalNumOfVCs) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if (nFiltered == VCs.size()) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; else if (variantSources.isEmpty()) // everyone was reference setValue = MERGE_REF_IN_ALL; else { final LinkedHashSet<String> s = new LinkedHashSet<String>(); for (final VariantContext vc : VCs) if (vc.isVariant()) s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource()); setValue = Utils.join("-", s); } if (setKey != null) { attributes.put(setKey, setValue); if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.put(setKey, setValue); } } } if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); builder.alleles(alleles); builder.genotypes(genotypes); builder.log10PError(log10PError); builder.filters(filters.isEmpty() ? filters : new TreeSet<String>(filters)); builder.attributes(new TreeMap<String, Object>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged); return merged; } private static final boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) { final Iterator<Allele> it1 = alleleSet1.iterator(); final Iterator<Allele> it2 = alleleSet2.iterator(); while (it1.hasNext() && it2.hasNext()) { final Allele a1 = it1.next(); final Allele a2 = it2.next(); if (!a1.equals(a2)) return true; } // by this point, at least one of the iterators is empty. All of the elements // we've compared are equal up until this point. But it's possible that the // sets aren't the same size, which is indicated by the test below. If they // are of the same size, though, the sets are compatible return it1.hasNext() || it2.hasNext(); } public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); for (final Genotype g : genotypes) { newGs.add(removePLsAndAD(g)); } return newGs; } static private Allele determineReferenceAllele(List<VariantContext> VCs) { Allele ref = null; for (VariantContext vc : VCs) { Allele myRef = vc.getReference(); if (ref == null || ref.length() < myRef.length()) ref = myRef; else if (ref.length() == myRef.length() && !ref.equals(myRef)) throw new TribbleException(String.format( "The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); } return ref; } static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) { if (refAllele.equals(vc.getReference())) return new AlleleMapper(vc); else { // we really need to do some work. The refAllele is the longest reference allele seen at this // start site. So imagine it is: // // refAllele: ACGTGA // myRef: ACGT // myAlt: A // // We need to remap all of the alleles in vc to include the extra GA so that // myRef => refAllele and myAlt => AGA // Allele myRef = vc.getReference(); if (refAllele.length() <= myRef.length()) throw new IllegalStateException("BUG: myRef=" + myRef + " is longer than refAllele=" + refAllele); byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); // System.out.printf("Remapping allele at %s%n", vc); // System.out.printf("ref %s%n", refAllele); // System.out.printf("myref %s%n", myRef ); // System.out.printf("extrabases %s%n", new String(extraBases)); Map<Allele, Allele> map = new HashMap<Allele, Allele>(); for (Allele a : vc.getAlleles()) { if (a.isReference()) map.put(a, refAllele); else { Allele extended = Allele.extend(a, extraBases); for (Allele b : allAlleles) if (extended.equals(b)) extended = b; // System.out.printf(" Extending %s => %s%n", a, extended); map.put(a, extended); } } // debugging // System.out.printf("mapping %s%n", map); return new AlleleMapper(map); } } public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption) { if (mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); if (priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED) return new ArrayList<VariantContext>(unsortedVCs); else { ArrayList<VariantContext> sorted = new ArrayList<VariantContext>(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); return sorted; } } private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE for (Genotype g : oneVC.getGenotypes()) { String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); if (!mergedGenotypes.containsSample(name)) { // only add if the name is new Genotype newG = g; if (uniqifySamples || alleleMapping.needsRemapping()) { final List<Allele> alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); } mergedGenotypes.add(newG); } } } public static String mergedSampleName(String trackName, String sampleName, boolean uniqify) { return uniqify ? sampleName + "." + trackName : sampleName; } /** * Trim the alleles in inputVC from the reverse direction * * @param inputVC a non-null input VC whose alleles might need a haircut * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up */ public static VariantContext reverseTrimAlleles(final VariantContext inputVC) { return trimAlleles(inputVC, false, true); } /** * Trim the alleles in inputVC from the forward direction * * @param inputVC a non-null input VC whose alleles might need a haircut * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up */ public static VariantContext forwardTrimAlleles(final VariantContext inputVC) { return trimAlleles(inputVC, true, false); } /** * Trim the alleles in inputVC forward and reverse, as requested * * @param inputVC a non-null input VC whose alleles might need a haircut * @param trimForward should we trim up the alleles from the foward direction? * @param trimReverse shold we trim up the alleles from the reverse direction? * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles */ @Ensures("result != null") public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { if (inputVC == null) throw new IllegalArgumentException("inputVC cannot be null"); if (inputVC.getNAlleles() <= 1 || inputVC.isSNP()) return inputVC; // see whether we need to trim common reference base from all alleles final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); final int fwdTrim = trimForward ? computeForwardClipping(revTrimVC.getAlleles()) : -1; final VariantContext vc = trimAlleles(revTrimVC, fwdTrim, 0); return vc; } /** * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and * the last revTrim bases from the end * * @param inputVC a non-null input VC * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles * @param revTrim the last revTrim bases of each allele will be clipped off as well * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles */ @Requires({ "inputVC != null" }) @Ensures("result != null") protected static VariantContext trimAlleles(final VariantContext inputVC, final int fwdTrimEnd, final int revTrim) { if (fwdTrimEnd == -1 && revTrim == 0) // nothing to do, so just return inputVC unmodified return inputVC; final List<Allele> alleles = new LinkedList<Allele>(); final Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>(); for (final Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { alleles.add(a); originalToTrimmedAlleleMap.put(a, a); } else { // get bases for current allele and create a new one with trimmed bases final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd + 1, a.length() - revTrim); final Allele trimmedAllele = Allele.create(newBases, a.isReference()); alleles.add(trimmedAllele); originalToTrimmedAlleleMap.put(a, trimmedAllele); } } // now we can recreate new genotypes with trimmed alleles final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); final int start = inputVC.getStart() + (fwdTrimEnd + 1); final VariantContextBuilder builder = new VariantContextBuilder(inputVC); builder.start(start); builder.stop(start + alleles.get(0).length() - 1); builder.alleles(alleles); builder.genotypes(genotypes); return builder.make(); } @Requires("originalGenotypes != null && alleleMapper != null") protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { final GenotypesContext updatedGenotypes = GenotypesContext.create(); for (final Genotype genotype : originalGenotypes) { final List<Allele> updatedAlleles = alleleMapper.remap(genotype.getAlleles()); updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); } return updatedGenotypes; } public static int computeReverseClipping(final List<Allele> unclippedAlleles, final byte[] ref) { int clipping = 0; boolean stillClipping = true; while (stillClipping) { for (final Allele a : unclippedAlleles) { if (a.isSymbolic()) continue; // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). if (a.length() - clipping == 0) return clipping - 1; if (a.length() - clipping <= 0 || a.length() == 0) { stillClipping = false; } else if (ref.length == clipping) { return -1; } else if (a.getBases()[a.length() - clipping - 1] != ref[ref.length - clipping - 1]) { stillClipping = false; } } if (stillClipping) clipping++; } return clipping; } /** * Clip out any unnecessary bases off the front of the alleles * * The VCF spec represents alleles as block substitutions, replacing AC with A for a * 1 bp deletion of the C. However, it's possible that we'd end up with alleles that * contain extra bases on the left, such as GAC/GA to represent the same 1 bp deletion. * This routine finds an offset among all alleles that can be safely trimmed * off the left of each allele and still represent the same block substitution. * * A/C => A/C * AC/A => AC/A * ACC/AC => CC/C * AGT/CAT => AGT/CAT * <DEL>/C => <DEL>/C * * @param unclippedAlleles a non-null list of alleles that we want to clip * @return the offset into the alleles where we can safely clip, inclusive, or * -1 if no clipping is tolerated. So, if the result is 0, then we can remove * the first base of every allele. If the result is 1, we can remove the * second base. */ public static int computeForwardClipping(final List<Allele> unclippedAlleles) { // cannot clip unless there's at least 1 alt allele if (unclippedAlleles.size() <= 1) return -1; // we cannot forward clip any set of alleles containing a symbolic allele int minAlleleLength = Integer.MAX_VALUE; for (final Allele a : unclippedAlleles) { if (a.isSymbolic()) return -1; minAlleleLength = Math.min(minAlleleLength, a.length()); } final byte[] firstAlleleBases = unclippedAlleles.get(0).getBases(); int indexOflastSharedBase = -1; // the -1 to the stop is that we can never clip off the right most base for (int i = 0; i < minAlleleLength - 1; i++) { final byte base = firstAlleleBases[i]; for (final Allele allele : unclippedAlleles) { if (allele.getBases()[i] != base) return indexOflastSharedBase; } indexOflastSharedBase = i; } return indexOflastSharedBase; } public static double computeHardyWeinbergPvalue(VariantContext vc) { if (vc.getCalledChrCount() == 0) return 0.0; return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); } public static boolean requiresPaddingBase(final List<String> alleles) { // see whether one of the alleles would be null if trimmed through for (final String allele : alleles) { if (allele.isEmpty()) return true; } int clipping = 0; Character currentBase = null; while (true) { for (final String allele : alleles) { if (allele.length() - clipping == 0) return true; char myBase = allele.charAt(clipping); if (currentBase == null) currentBase = myBase; else if (currentBase != myBase) return false; } clipping++; currentBase = null; } } private final static Map<String, Object> subsetAttributes(final CommonInfo igc, final Collection<String> keysToPreserve) { Map<String, Object> attributes = new HashMap<String, Object>(keysToPreserve.size()); for (final String key : keysToPreserve) { if (igc.hasAttribute(key)) attributes.put(key, igc.getAttribute(key)); } return attributes; } /** * @deprecated use variant context builder version instead * @param vc the variant context * @param keysToPreserve the keys to preserve * @return a pruned version of the original variant context */ @Deprecated public static VariantContext pruneVariantContext(final VariantContext vc, Collection<String> keysToPreserve) { return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); } public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection<String> keysToPreserve) { final VariantContext vc = builder.make(); if (keysToPreserve == null) keysToPreserve = Collections.emptyList(); // VC info final Map<String, Object> attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); // Genotypes final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); for (final Genotype g : vc.getGenotypes()) { final GenotypeBuilder gb = new GenotypeBuilder(g); // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ gb.noAD().noDP().noPL().noAttributes(); genotypes.add(gb.make()); } return builder.genotypes(genotypes).attributes(attributes); } public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { // if all alleles of vc1 are a contained in alleles of vc2, return true if (!vc1.getReference().equals(vc2.getReference())) return false; for (Allele a : vc1.getAlternateAlleles()) { if (!vc2.getAlternateAlleles().contains(a)) return false; } return true; } public static Map<VariantContext.Type, List<VariantContext>> separateVariantContextsByType( Collection<VariantContext> VCs) { HashMap<VariantContext.Type, List<VariantContext>> mappedVCs = new HashMap<VariantContext.Type, List<VariantContext>>(); for (VariantContext vc : VCs) { // look at previous variant contexts of different type. If: // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) // c) neither: do nothing, just add vc to its own list boolean addtoOwnList = true; for (VariantContext.Type type : VariantContext.Type.values()) { if (type.equals(vc.getType())) continue; if (!mappedVCs.containsKey(type)) continue; List<VariantContext> vcList = mappedVCs.get(type); for (int k = 0; k < vcList.size(); k++) { VariantContext otherVC = vcList.get(k); if (allelesAreSubset(otherVC, vc)) { // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list vcList.remove(k); // avoid having empty lists if (vcList.size() == 0) mappedVCs.remove(type); if (!mappedVCs.containsKey(vc.getType())) mappedVCs.put(vc.getType(), new ArrayList<VariantContext>()); mappedVCs.get(vc.getType()).add(otherVC); break; } else if (allelesAreSubset(vc, otherVC)) { // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own mappedVCs.get(type).add(vc); addtoOwnList = false; break; } } } if (addtoOwnList) { if (!mappedVCs.containsKey(vc.getType())) mappedVCs.put(vc.getType(), new ArrayList<VariantContext>()); mappedVCs.get(vc.getType()).add(vc); } } return mappedVCs; } public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set<String> allowedAttributes) { if (allowedAttributes == null) return vc; GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for (final Genotype genotype : vc.getGenotypes()) { Map<String, Object> attrs = new HashMap<String, Object>(); for (Map.Entry<String, Object> attr : genotype.getExtendedAttributes().entrySet()) { if (allowedAttributes.contains(attr.getKey())) attrs.put(attr.getKey(), attr.getValue()); } newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); } return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); } protected static class AlleleMapper { private VariantContext vc = null; private Map<Allele, Allele> map = null; public AlleleMapper(VariantContext vc) { this.vc = vc; } public AlleleMapper(Map<Allele, Allele> map) { this.map = map; } public boolean needsRemapping() { return this.map != null; } public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); } public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } public List<Allele> remap(List<Allele> as) { List<Allele> newAs = new ArrayList<Allele>(); for (Allele a : as) { //System.out.printf(" Remapping %s => %s%n", a, remap(a)); newAs.add(remap(a)); } return newAs; } } private static class CompareByPriority implements Comparator<VariantContext>, Serializable { List<String> priorityListOfVCs; public CompareByPriority(List<String> priorityListOfVCs) { this.priorityListOfVCs = priorityListOfVCs; } private int getIndex(VariantContext vc) { int i = priorityListOfVCs.indexOf(vc.getSource()); if (i == -1) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); return i; } public int compare(VariantContext vc1, VariantContext vc2) { return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); } } /** * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles * * @param name the name of the VC * @param contig the contig for the VC * @param start the start of the VC * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the * alt. Will compute the stop of the VC from the length of the reference allele * @return a non-null VariantContext */ public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List<String> alleleStrings) { if (alleleStrings == null || alleleStrings.isEmpty()) throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); final List<Allele> alleles = new LinkedList<Allele>(); final int length = alleleStrings.get(0).length(); boolean first = true; for (final String alleleString : alleleStrings) { alleles.add(Allele.create(alleleString, first)); first = false; } return new VariantContextBuilder(name, contig, start, start + length - 1, alleles).make(); } /** * Splits the alleles for the provided variant context into its primitive parts. * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. * Currently works only for MNPs. * * @param vc the non-null VC to split * @return a non-empty list of VCs split into primitive parts or the original VC otherwise */ public static List<VariantContext> splitIntoPrimitiveAlleles(final VariantContext vc) { if (vc == null) throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); if (!vc.isBiallelic()) throw new IllegalArgumentException( "Trying to break a multi-allelic Variant Context into primitive parts"); // currently only works for MNPs if (!vc.isMNP()) return Arrays.asList(vc); final byte[] ref = vc.getReference().getBases(); final byte[] alt = vc.getAlternateAllele(0).getBases(); if (ref.length != alt.length) throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); final List<VariantContext> result = new ArrayList<VariantContext>(ref.length); for (int i = 0; i < ref.length; i++) { // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) if (ref[i] != alt[i]) { // create the ref and alt SNP alleles final Allele newRefAllele = Allele.create(ref[i], true); final Allele newAltAllele = Allele.create(alt[i], false); // create a new VariantContext with the new SNP alleles final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i) .stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); // create new genotypes with updated alleles final Map<Allele, Allele> alleleMap = new HashMap<Allele, Allele>(); alleleMap.put(vc.getReference(), newRefAllele); alleleMap.put(vc.getAlternateAllele(0), newAltAllele); final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); result.add(newVC.genotypes(newGenotypes).make()); } } if (result.isEmpty()) result.add(vc); return result; } /** * Are vc1 and 2 equal including their position and alleles? * @param vc1 non-null VariantContext * @param vc2 non-null VariantContext * @return true if vc1 and vc2 are equal, false otherwise */ public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { if (vc1 == null) throw new IllegalArgumentException("vc1 cannot be null"); if (vc2 == null) throw new IllegalArgumentException("vc2 cannot be null"); if (vc1.getStart() != vc2.getStart()) return false; if (vc1.getEnd() != vc2.getEnd()) return false; if (!vc1.getChr().equals(vc2.getChr())) return false; if (!vc1.getAlleles().equals(vc2.getAlleles())) return false; return true; } }