Java tutorial
/* * Copyright (c) 2010. The Broad Institute * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.util.StringUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.Serializable; import java.util.*; public class VariantContextUtils { final public static JexlEngine engine = new JexlEngine(); static { engine.setSilent(false); // will throw errors now for selects that don't evaluate properly engine.setLenient(false); } /** * Create a new VariantContext * * @param name name * @param loc location * @param alleles alleles * @param genotypes genotypes set * @param negLog10PError qual * @param filters filters: use null for unfiltered and empty set for passes filters * @param attributes attributes * @return VariantContext object */ public static VariantContext toVC(String name, GenomeLoc loc, Collection<Allele> alleles, Collection<Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) { return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes != null ? VariantContext.genotypeCollectionToMap(new TreeMap<String, Genotype>(), genotypes) : null, negLog10PError, filters, attributes); } /** * Create a new variant context without genotypes and no Perror, no filters, and no attributes * @param name name * @param loc location * @param alleles alleles * @return VariantContext object */ public static VariantContext toVC(String name, GenomeLoc loc, Collection<Allele> alleles) { return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, VariantContext.NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } /** * Create a new variant context without genotypes and no Perror, no filters, and no attributes * @param name name * @param loc location * @param alleles alleles * @param genotypes genotypes * @return VariantContext object */ public static VariantContext toVC(String name, GenomeLoc loc, Collection<Allele> alleles, Collection<Genotype> genotypes) { return new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } /** * Copy constructor * * @param other the VariantContext to copy * @return VariantContext object */ public static VariantContext toVC(VariantContext other) { return new VariantContext(other.getSource(), other.getChr(), other.getStart(), other.getEnd(), other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.getFilters(), other.getAttributes()); } /** * Update the attributes of the attributes map given the VariantContext to reflect the proper chromosome-based VCF tags * * @param vc the VariantContext * @param attributes the attributes map to populate; must not be null; may contain old values * @param removeStaleValues should we remove stale values from the mapping? */ public static void calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes, boolean removeStaleValues) { // if everyone is a no-call, remove the old attributes if requested if (vc.getChromosomeCount() == 0 && removeStaleValues) { if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY)) attributes.remove(VCFConstants.ALLELE_COUNT_KEY); if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY)) attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY)) attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); return; } if (vc.hasGenotypes()) { attributes.put(VCFConstants.ALLELE_NUMBER_KEY, vc.getChromosomeCount()); // if there are alternate alleles, record the relevant tags if (vc.getAlternateAlleles().size() > 0) { ArrayList<String> alleleFreqs = new ArrayList<String>(); ArrayList<Integer> alleleCounts = new ArrayList<Integer>(); double totalChromosomes = (double) vc.getChromosomeCount(); for (Allele allele : vc.getAlternateAlleles()) { int altChromosomes = vc.getChromosomeCount(allele); alleleCounts.add(altChromosomes); String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalChromosomes), ((double) altChromosomes / totalChromosomes)); alleleFreqs.add(freq); } attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); } else { attributes.put(VCFConstants.ALLELE_COUNT_KEY, 0); attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0); } } } private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { int precision = 1; while (maxValue > 1) { precision++; maxValue /= 10.0; } return "%." + precision + "f"; } /** * A simple but common wrapper for matching VariantContext objects using JEXL expressions */ public static class JexlVCMatchExp { public String name; public Expression exp; /** * Create a new matcher expression with name and JEXL expression exp * @param name name * @param exp expression */ public JexlVCMatchExp(String name, Expression exp) { this.name = name; this.exp = exp; } } /** * Method for creating JexlVCMatchExp from input walker arguments names and exps. These two arrays contain * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to * match() below. * * @param names names * @param exps expressions * @return list of matches */ public static List<JexlVCMatchExp> initializeMatchExps(String[] names, String[] exps) { if (names == null || exps == null) throw new ReviewedStingException("BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); if (names.length != exps.length) throw new UserException("Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); Map<String, String> map = new HashMap<String, String>(); for (int i = 0; i < names.length; i++) { map.put(names[i], exps[i]); } return VariantContextUtils.initializeMatchExps(map); } public static List<JexlVCMatchExp> initializeMatchExps(ArrayList<String> names, ArrayList<String> exps) { String[] nameArray = new String[names.size()]; String[] expArray = new String[exps.size()]; return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray)); } /** * Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to * match() below. * * @param names_and_exps mapping of names to expressions * @return list of matches */ public static List<JexlVCMatchExp> initializeMatchExps(Map<String, String> names_and_exps) { List<JexlVCMatchExp> exps = new ArrayList<JexlVCMatchExp>(); for (Map.Entry<String, String> elt : names_and_exps.entrySet()) { String name = elt.getKey(); String expStr = elt.getValue(); if (name == null || expStr == null) throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr); try { Expression exp = engine.createExpression(expStr); exps.add(new JexlVCMatchExp(name, exp)); } catch (Exception e) { throw new UserException.BadArgumentValue(name, "Invalid expression used (" + expStr + "). Please see the JEXL docs for correct syntax."); } } return exps; } /** * Returns true if exp match VC. See collection<> version for full docs. * @param vc variant context * @param exp expression * @return true if there is a match */ public static boolean match(VariantContext vc, JexlVCMatchExp exp) { return match(vc, Arrays.asList(exp)).get(exp); } /** * Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL * expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp * expressions. * * @param vc variant context * @param exps expressions * @return true if there is a match */ public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Collection<JexlVCMatchExp> exps) { return new JEXLMap(exps, vc); } /** * Returns true if exp match VC/g. See collection<> version for full docs. * @param vc variant context * @param g genotype * @param exp expression * @return true if there is a match */ public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) { return match(vc, g, Arrays.asList(exp)).get(exp); } /** * Matches each JexlVCMatchExp exp against the data contained in vc/g, and returns a map from these * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL * expressions to VariantContext records/genotypes. Use initializeMatchExps() to create the list of JexlVCMatchExp * expressions. * * @param vc variant context * @param g genotype * @param exps expressions * @return true if there is a match */ public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Genotype g, Collection<JexlVCMatchExp> exps) { return new JEXLMap(exps, vc, g); } public static double computeHardyWeinbergPvalue(VariantContext vc) { if (vc.getChromosomeCount() == 0) return 0.0; return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); } /** * Returns a newly allocated VC that is the same as VC, but without genotypes * @param vc variant context * @return new VC without genotypes */ @Requires("vc != null") @Ensures("result != null") public static VariantContext sitesOnlyVariantContext(VariantContext vc) { return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); } /** * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes * @param vcs collection of VCs * @return new VCs without genotypes */ @Requires("vcs != null") @Ensures("result != null") public static Collection<VariantContext> sitesOnlyVariantContexts(Collection<VariantContext> vcs) { List<VariantContext> r = new ArrayList<VariantContext>(); for (VariantContext vc : vcs) r.add(sitesOnlyVariantContext(vc)); return r; } public static VariantContext pruneVariantContext(VariantContext vc) { return pruneVariantContext(vc, null); } public static VariantContext pruneVariantContext(VariantContext vc, Collection<String> keysToPreserve) { MutableVariantContext mvc = new MutableVariantContext(vc); if (keysToPreserve == null || keysToPreserve.size() == 0) mvc.clearAttributes(); else { Map<String, Object> d = mvc.getAttributes(); mvc.clearAttributes(); for (String key : keysToPreserve) if (d.containsKey(key)) mvc.putAttribute(key, d.get(key)); } Collection<Genotype> gs = mvc.getGenotypes().values(); mvc.clearGenotypes(); for (Genotype g : gs) { MutableGenotype mg = new MutableGenotype(g); mg.clearAttributes(); if (keysToPreserve != null) for (String key : keysToPreserve) if (g.hasAttribute(key)) mg.putAttribute(key, g.getAttribute(key)); mvc.addGenotype(mg); } return mvc; } public enum GenotypeMergeType { UNIQUIFY, PRIORITIZE, UNSORTED, REQUIRE_UNIQUE } public enum FilteredRecordMergeType { KEEP_IF_ANY_UNFILTERED, KEEP_IF_ALL_UNFILTERED } /** * Performs a master merge on the VCs. Here there is a master input [contains all of the information] and many * VCs containing partial, extra genotype information which should be added to the master. For example, * we scatter out the phasing algorithm over some samples in the master, producing a minimal VCF with phasing * information per genotype. The master merge will add the PQ information from each genotype record, where * appropriate, to the master VC. * * @param unsortedVCs collection of VCs * @param masterName name of master VC * @return master-merged VC */ public static VariantContext masterMerge(Collection<VariantContext> unsortedVCs, String masterName) { VariantContext master = findMaster(unsortedVCs, masterName); Map<String, Genotype> genotypes = master.getGenotypes(); for (Genotype g : genotypes.values()) { genotypes.put(g.getSampleName(), new MutableGenotype(g)); } Map<String, Object> masterAttributes = new HashMap<String, Object>(master.getAttributes()); for (VariantContext vc : unsortedVCs) { if (!vc.getSource().equals(masterName)) { for (Genotype g : vc.getGenotypes().values()) { MutableGenotype masterG = (MutableGenotype) genotypes.get(g.getSampleName()); for (Map.Entry<String, Object> attr : g.getAttributes().entrySet()) { if (!masterG.hasAttribute(attr.getKey())) { //System.out.printf("Adding GT attribute %s to masterG %s, new %s%n", attr, masterG, g); masterG.putAttribute(attr.getKey(), attr.getValue()); } } if (masterG.isPhased() != g.isPhased()) { if (masterG.sameGenotype(g)) { // System.out.printf("Updating phasing %s to masterG %s, new %s%n", g.isPhased(), masterG, g); masterG.setAlleles(g.getAlleles()); masterG.setPhase(g.isPhased()); } //else System.out.println("WARNING: Not updating phase, since genotypes differ between master file and auxiliary info file!"); } // if ( MathUtils.compareDoubles(masterG.getNegLog10PError(), g.getNegLog10PError()) != 0 ) { // System.out.printf("Updating GQ %s to masterG %s, new %s%n", g.getNegLog10PError(), masterG, g); // masterG.setNegLog10PError(g.getNegLog10PError()); // } } for (Map.Entry<String, Object> attr : vc.getAttributes().entrySet()) { if (!masterAttributes.containsKey(attr.getKey())) { //System.out.printf("Adding VC attribute %s to master %s, new %s%n", attr, master, vc); masterAttributes.put(attr.getKey(), attr.getValue()); } } } } return new VariantContext(master.getSource(), master.getChr(), master.getStart(), master.getEnd(), master.getAlleles(), genotypes, master.getNegLog10PError(), master.getFilters(), masterAttributes); } private static VariantContext findMaster(Collection<VariantContext> unsortedVCs, String masterName) { for (VariantContext vc : unsortedVCs) { if (vc.getSource().equals(masterName)) { return vc; } } throw new ReviewedStingException( String.format("Couldn't find master VCF %s at %s", masterName, unsortedVCs.iterator().next())); } public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection<VariantContext> unsortedVCs, byte refBase) { return simpleMerge(genomeLocParser, unsortedVCs, null, FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GenotypeMergeType.UNSORTED, false, false, refBase); } /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name * * @param genomeLocParser loc parser * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param inputRefBase the ref base * @return new VariantContext */ public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, boolean annotateOrigin, boolean printMessages, byte inputRefBase) { return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set", false, false); } /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name * * @param genomeLocParser loc parser * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param inputRefBase the ref base * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext */ public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey, boolean filteredAreUncalled, boolean mergeInfoWithMaxAC) { if (unsortedVCs == null || unsortedVCs.size() == 0) return null; if (annotateOrigin && priorityListOfVCs == null) throw new IllegalArgumentException( "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE) verifyUniqueSampleNames(unsortedVCs); List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary List<VariantContext> VCs = new ArrayList<VariantContext>(); for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if (!filteredAreUncalled || vc.isNotFiltered()) VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, inputRefBase, false)); } if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC VariantContext first = VCs.get(0); String name = first.getSource(); GenomeLoc loc = getLocation(genomeLocParser, first); Set<Allele> alleles = new TreeSet<Allele>(); Map<String, Genotype> genotypes = new TreeMap<String, Genotype>(); double negLog10PError = -1; Set<String> filters = new TreeSet<String>(); Map<String, Object> attributes = new TreeMap<String, Object>(); Set<String> inconsistentAttributes = new HashSet<String>(); String rsID = null; int depth = 0; int maxAC = -1; Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>(); VariantContext vcWithMaxAC = null; // counting the number of filtered and variant VCs int nFiltered = 0, nVariant = 0; Allele refAllele = determineReferenceAllele(VCs); boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches for (VariantContext vc : VCs) { if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) ) throw new ReviewedStingException( "BUG: attempting to merge VariantContexts with different start sites: first=" + first.toString() + " second=" + vc.toString()); if (getLocation(genomeLocParser, vc).size() > loc.size()) loc = getLocation(genomeLocParser, vc); // get the longest location nFiltered += vc.isFiltered() ? 1 : 0; nVariant += vc.isVariant() ? 1 : 0; AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); remapped = remapped || alleleMapping.needsRemapping(); alleles.addAll(alleleMapping.values()); mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); filters.addAll(vc.getFilters()); // // add attributes // // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY)); if (rsID == null && vc.hasID()) rsID = vc.getID(); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List<String> alleleCountArray = Arrays .asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1) .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); for (String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } else { final int ac = Integer.valueOf(rawAlleleCounts); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if (!inconsistentAttributes.contains(key)) { boolean alreadyFound = attributes.containsKey(key); Object boundValue = attributes.get(key); boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) { // we found the value but we're inconsistent, put it in the exclude list //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); inconsistentAttributes.add(key); attributes.remove(key); } else if (!alreadyFound || boundIsMissingValue) { // no value //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); attributes.put(key, p.getValue()); } } } } // take the VC with the maxAC and pull the attributes into a modifiable map if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); } // if at least one record was unfiltered and we want a union, clear all of the filters if (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) filters.clear(); // we care about where the call came from if (annotateOrigin) { String setValue; if (nFiltered == 0 && nVariant == priorityListOfVCs.size()) // nothing was unfiltered setValue = "Intersection"; else if (nFiltered == VCs.size()) // everything was filtered out setValue = "FilteredInAll"; else if (nVariant == 0) // everyone was reference setValue = "ReferenceInAll"; else { // we are filtered in some subset List<String> s = new ArrayList<String>(); for (VariantContext vc : VCs) if (vc.isVariant()) s.add(vc.isFiltered() ? "filterIn" + vc.getSource() : vc.getSource()); setValue = Utils.join("-", s); } if (setKey != null) { attributes.put(setKey, setValue); if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource()); } } } if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); if (rsID != null) attributes.put(VariantContext.ID_KEY, rsID); VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); // Trim the padded bases of all alleles if necessary merged = AbstractVCFCodec.createVariantContextWithTrimmedAlleles(merged); if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged); return merged; } public static Map<VariantContext.Type, List<VariantContext>> separateVariantContextsByType( Collection<VariantContext> VCs) { HashMap<VariantContext.Type, List<VariantContext>> mappedVCs = new HashMap<VariantContext.Type, List<VariantContext>>(); for (VariantContext vc : VCs) { if (!mappedVCs.containsKey(vc.getType())) mappedVCs.put(vc.getType(), new ArrayList<VariantContext>()); mappedVCs.get(vc.getType()).add(vc); } return mappedVCs; } private static class AlleleMapper { private VariantContext vc = null; private Map<Allele, Allele> map = null; public AlleleMapper(VariantContext vc) { this.vc = vc; } public AlleleMapper(Map<Allele, Allele> map) { this.map = map; } public boolean needsRemapping() { return this.map != null; } public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); } public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } public List<Allele> remap(List<Allele> as) { List<Allele> newAs = new ArrayList<Allele>(); for (Allele a : as) { //System.out.printf(" Remapping %s => %s%n", a, remap(a)); newAs.add(remap(a)); } return newAs; } } static private void verifyUniqueSampleNames(Collection<VariantContext> unsortedVCs) { Set<String> names = new HashSet<String>(); for (VariantContext vc : unsortedVCs) { for (String name : vc.getSampleNames()) { //System.out.printf("Checking %s %b%n", name, names.contains(name)); if (names.contains(name)) throw new UserException( "REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name); } names.addAll(vc.getSampleNames()); } } static private Allele determineReferenceAllele(List<VariantContext> VCs) { Allele ref = null; for (VariantContext vc : VCs) { Allele myRef = vc.getReference(); if (ref == null || ref.length() < myRef.length()) ref = myRef; else if (ref.length() == myRef.length() && !ref.equals(myRef)) throw new UserException.BadInput(String.format( "The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); } return ref; } static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) { if (refAllele.equals(vc.getReference())) return new AlleleMapper(vc); else { // we really need to do some work. The refAllele is the longest reference allele seen at this // start site. So imagine it is: // // refAllele: ACGTGA // myRef: ACGT // myAlt: - // // We need to remap all of the alleles in vc to include the extra GA so that // myRef => refAllele and myAlt => GA // Allele myRef = vc.getReference(); if (refAllele.length() <= myRef.length()) throw new ReviewedStingException("BUG: myRef=" + myRef + " is longer than refAllele=" + refAllele); byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); // System.out.printf("Remapping allele at %s%n", vc); // System.out.printf("ref %s%n", refAllele); // System.out.printf("myref %s%n", myRef ); // System.out.printf("extrabases %s%n", new String(extraBases)); Map<Allele, Allele> map = new HashMap<Allele, Allele>(); for (Allele a : vc.getAlleles()) { if (a.isReference()) map.put(a, refAllele); else { Allele extended = Allele.extend(a, extraBases); for (Allele b : allAlleles) if (extended.equals(b)) extended = b; // System.out.printf(" Extending %s => %s%n", a, extended); map.put(a, extended); } } // debugging // System.out.printf("mapping %s%n", map); return new AlleleMapper(map); } } static class CompareByPriority implements Comparator<VariantContext>, Serializable { List<String> priorityListOfVCs; public CompareByPriority(List<String> priorityListOfVCs) { this.priorityListOfVCs = priorityListOfVCs; } private int getIndex(VariantContext vc) { int i = priorityListOfVCs.indexOf(vc.getSource()); if (i == -1) throw new UserException.BadArgumentValue(Utils.join(",", priorityListOfVCs), "Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); return i; } public int compare(VariantContext vc1, VariantContext vc2) { return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); } } public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption) { if (mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); if (priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED) return new ArrayList<VariantContext>(unsortedVCs); else { ArrayList<VariantContext> sorted = new ArrayList<VariantContext>(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); return sorted; } } private static void mergeGenotypes(Map<String, Genotype> mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { for (Genotype g : oneVC.getGenotypes().values()) { String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); if (!mergedGenotypes.containsKey(name)) { // only add if the name is new Genotype newG = g; if (uniqifySamples || alleleMapping.needsRemapping()) { MutableGenotype mutG = new MutableGenotype(name, g); if (alleleMapping.needsRemapping()) mutG.setAlleles(alleleMapping.remap(g.getAlleles())); newG = mutG; } mergedGenotypes.put(name, newG); } } } public static String mergedSampleName(String trackName, String sampleName, boolean uniqify) { return uniqify ? sampleName + "." + trackName : sampleName; } /** * Returns a context identical to this with the REF and ALT alleles reverse complemented. * * @param vc variant context * @return new vc */ public static VariantContext reverseComplement(VariantContext vc) { // create a mapping from original allele to reverse complemented allele HashMap<Allele, Allele> alleleMap = new HashMap<Allele, Allele>(vc.getAlleles().size()); for (Allele originalAllele : vc.getAlleles()) { Allele newAllele; if (originalAllele.isNoCall() || originalAllele.isNull()) newAllele = originalAllele; else newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); alleleMap.put(originalAllele, newAllele); } // create new Genotype objects Map<String, Genotype> newGenotypes = new HashMap<String, Genotype>(vc.getNSamples()); for (Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet()) { List<Allele> newAlleles = new ArrayList<Allele>(); for (Allele allele : genotype.getValue().getAlleles()) { Allele newAllele = alleleMap.get(allele); if (newAllele == null) newAllele = Allele.NO_CALL; newAlleles.add(newAllele); } newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); } return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); } public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set<String> allowedAttributes) { if (allowedAttributes == null) return vc; Map<String, Genotype> newGenotypes = new HashMap<String, Genotype>(vc.getNSamples()); for (Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet()) { Map<String, Object> attrs = new HashMap<String, Object>(); for (Map.Entry<String, Object> attr : genotype.getValue().getAttributes().entrySet()) { if (allowedAttributes.contains(attr.getKey())) attrs.put(attr.getKey(), attr.getValue()); } newGenotypes.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attrs)); } return VariantContext.modifyGenotypes(vc, newGenotypes); } public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { if (!context.isSNP() || !context.isBiallelic()) throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); } /** * If this is a BiAlleic SNP, is it a transition? */ public static boolean isTransition(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; } /** * If this is a BiAlleic SNP, is it a transversion? */ public static boolean isTransversion(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; } /** * create a genome location, given a variant context * @param genomeLocParser parser * @param vc the variant context * @return the genomeLoc */ public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser, VariantContext vc) { return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); } public abstract static class AlleleMergeRule { // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); public String toString() { return "all samples are mergeable"; } } // NOTE: returns null if vc1 and vc2 are not merged into a single MNP record public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) return null; // Check that it's logically possible to merge the VCs: if (!allSamplesAreMergeable(vc1, vc2)) return null; // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) return null; return reallyMergeIntoMNP(vc1, vc2, referenceFile); } private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { int startInter = vc1.getEnd() + 1; int endInter = vc2.getStart() - 1; byte[] intermediateBases = null; if (startInter <= endInter) { intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); StringUtil.toUpperCase(intermediateBases); } MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added Map<String, Genotype> mergedGenotypes = new HashMap<String, Genotype>(); for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) { String sample = gt1Entry.getKey(); Genotype gt1 = gt1Entry.getValue(); Genotype gt2 = vc2.getGenotype(sample); List<Allele> site1Alleles = gt1.getAlleles(); List<Allele> site2Alleles = gt2.getAlleles(); List<Allele> mergedAllelesForSample = new LinkedList<Allele>(); /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, we preserve phase information (if any) relative to whatever precedes vc1: */ Iterator<Allele> all2It = site2Alleles.iterator(); for (Allele all1 : site1Alleles) { Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); mergedAllelesForSample.add(mergedAllele); } double mergedGQ = Math.max(gt1.getNegLog10PError(), gt2.getNegLog10PError()); Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered Map<String, Object> mergedGtAttribs = new HashMap<String, Object>(); PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); if (phaseQual.PQ != null) mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); Genotype mergedGt = new Genotype(sample, mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); mergedGenotypes.put(sample, mergedGt); } String mergedName = VariantContextUtils.mergeVariantContextNames(vc1.getSource(), vc2.getSource()); double mergedNegLog10PError = Math.max(vc1.getNegLog10PError(), vc2.getNegLog10PError()); Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered Map<String, Object> mergedAttribs = VariantContextUtils.mergeVariantContextAttributes(vc1, vc2); VariantContext mergedVc = new VariantContext(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs); mergedAttribs = new HashMap<String, Object>(mergedVc.getAttributes()); VariantContextUtils.calculateChromosomeCounts(mergedVc, mergedAttribs, true); mergedVc = VariantContext.modifyAttributes(mergedVc, mergedAttribs); return mergedVc; } private static class AlleleOneAndTwo { private Allele all1; private Allele all2; public AlleleOneAndTwo(Allele all1, Allele all2) { this.all1 = all1; this.all2 = all2; } public int hashCode() { return all1.hashCode() + all2.hashCode(); } public boolean equals(Object other) { if (!(other instanceof AlleleOneAndTwo)) return false; AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); } } private static class MergedAllelesData { private Map<AlleleOneAndTwo, Allele> mergedAlleles; private byte[] intermediateBases; private int intermediateLength; public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo this.intermediateBases = intermediateBases; this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); } public Allele ensureMergedAllele(Allele all1, Allele all2) { return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor } private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); Allele mergedAllele = mergedAlleles.get(all12); if (mergedAllele == null) { byte[] bases1 = all1.getBases(); byte[] bases2 = all2.getBases(); byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length]; System.arraycopy(bases1, 0, mergedBases, 0, bases1.length); if (intermediateBases != null) System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); mergedAlleles.put(all12, mergedAllele); } return mergedAllele; } public Set<Allele> getAllMergedAlleles() { return new HashSet<Allele>(mergedAlleles.values()); } } private static String mergeVariantContextNames(String name1, String name2) { return name1 + "_" + name2; } private static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { Map<String, Object> mergedAttribs = new HashMap<String, Object>(); List<VariantContext> vcList = new LinkedList<VariantContext>(); vcList.add(vc1); vcList.add(vc2); String[] MERGE_OR_ATTRIBS = { VCFConstants.DBSNP_KEY }; for (String orAttrib : MERGE_OR_ATTRIBS) { boolean attribVal = false; for (VariantContext vc : vcList) { Boolean val = vc.getAttributeAsBooleanNoException(orAttrib); if (val != null) attribVal = (attribVal || val); if (attribVal) // already true, so no reason to continue: break; } mergedAttribs.put(orAttrib, attribVal); } // Merge ID fields: String iDVal = null; for (VariantContext vc : vcList) { String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY); if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { if (iDVal == null) iDVal = val; else iDVal += VCFConstants.ID_FIELD_SEPARATOR + val; } } if (iDVal != null) mergedAttribs.put(VariantContext.ID_KEY, iDVal); return mergedAttribs; } private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); if (!loc1.onSameContig(loc2)) throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); if (!loc1.isBefore(loc2)) throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); if (vc1.isFiltered() || vc2.isFiltered()) return false; if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets return false; if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) return false; return true; } private static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) { for (Map.Entry<String, Genotype> gtEntry : vc.getGenotypes().entrySet()) { Genotype gt = gtEntry.getValue(); if (gt.isNoCall() || gt.isFiltered()) return false; } return true; } // Assumes that vc1 and vc2 were already checked to have the same sample names: private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) { String sample = gt1Entry.getKey(); Genotype gt1 = gt1Entry.getValue(); Genotype gt2 = vc2.getGenotype(sample); if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom return false; } return true; } public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { if (gt1.getPloidy() != gt2.getPloidy()) return false; /* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard]. HOWEVER, EVEN if this is not the case, but gt1.isHom(), it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1. */ return (gt2.isPhased() || gt2.isHom() || gt1.isHom()); } private static class PhaseAndQuality { public boolean isPhased; public Double PQ = null; public PhaseAndQuality(Genotype gt) { this.isPhased = gt.isPhased(); if (this.isPhased) this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); } } // Assumes that alleleSegregationIsKnown(gt1, gt2): private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { if (gt2.isPhased() || gt2.isHom()) return new PhaseAndQuality(gt1); // maintain the phase of gt1 if (!gt1.isHom()) throw new ReviewedStingException( "alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()"); /* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype For example, if we're merging the third Genotype with the second one: 0/1 1|1 0/1 Then, we want to output: 0/1 1/2 */ return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()] } /* Checks if any sample has a MNP of ALT alleles (segregating together): [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] */ public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) { String sample = gt1Entry.getKey(); Genotype gt1 = gt1Entry.getValue(); Genotype gt2 = vc2.getGenotype(sample); List<Allele> site1Alleles = gt1.getAlleles(); List<Allele> site2Alleles = gt2.getAlleles(); Iterator<Allele> all2It = site2Alleles.iterator(); for (Allele all1 : site1Alleles) { Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate return true; } } return false; } /* Checks if all samples are consistent in their haplotypes: [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] */ public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>(); Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>(); // Note the segregation of the alleles for the reference genome: allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). for (Map.Entry<String, Genotype> gt1Entry : vc1.getGenotypes().entrySet()) { String sample = gt1Entry.getKey(); Genotype gt1 = gt1Entry.getValue(); Genotype gt2 = vc2.getGenotype(sample); List<Allele> site1Alleles = gt1.getAlleles(); List<Allele> site2Alleles = gt2.getAlleles(); Iterator<Allele> all2It = site2Alleles.iterator(); for (Allele all1 : site1Alleles) { Allele all2 = all2It.next(); Allele all1To2 = allele1ToAllele2.get(all1); if (all1To2 == null) allele1ToAllele2.put(all1, all2); else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 return false; Allele all2To1 = allele2ToAllele1.get(all2); if (all2To1 == null) allele2ToAllele1.put(all2, all1); else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 return false; } } return true; } }