htsjdk.variant.variantcontext.VariantContextUtils.java Source code

Java tutorial

Introduction

Here is the source code for htsjdk.variant.variantcontext.VariantContextUtils.java

Source

/*
* Copyright (c) 2012 The Broad Institute
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package htsjdk.variant.variantcontext;

import htsjdk.samtools.util.Lazy;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class VariantContextUtils {
    private static Set<String> MISSING_KEYS_WARNED_ABOUT = new HashSet<String>();

    /** Use a {@link Lazy} {@link JexlEngine} instance to avoid class-loading issues. (Applications that access this class are otherwise
     * forced to build a {@link JexlEngine} instance, which depends on some apache logging libraries that mightn't be packaged.) */
    final public static Lazy<JexlEngine> engine = new Lazy<JexlEngine>(new Lazy.LazyInitializer<JexlEngine>() {
        @Override
        public JexlEngine make() {
            final JexlEngine jexl = new JexlEngine();
            jexl.setSilent(false); // will throw errors now for selects that don't evaluate properly
            jexl.setLenient(false);
            jexl.setDebug(false);
            return jexl;
        }
    });
    private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false;

    /**
     * Computes the alternate allele frequency at the provided {@link VariantContext} by dividing its "AN" by its "AC".
     * @param vc The variant whose alternate allele frequency is computed
     * @return The alternate allele frequency in [0, 1]
     * @throws AssertionError When either annotation is missing, or when the compuated frequency is outside the expected range
     */
    public static double calculateAltAlleleFrequency(final VariantContext vc) {
        if (!vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) || !vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
            throw new AssertionError(String.format(
                    "Cannot compute the provided variant's alt allele frequency because it does not have both %s and %s annotations: %s",
                    VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, vc));
        final double altAlleleCount = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
        final double totalCount = vc.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0);
        final double aaf = altAlleleCount / totalCount;
        if (aaf > 1 || aaf < 0)
            throw new AssertionError(String
                    .format("Expected a minor allele frequency in the range [0, 1], but got %s. vc=%s", aaf, vc));
        return aaf;
    }

    /**
     * Update the attributes of the attributes map given the VariantContext to reflect the
     * proper chromosome-based VCF tags
     *
     * @param vc          the VariantContext
     * @param attributes  the attributes map to populate; must not be null; may contain old values
     * @param removeStaleValues should we remove stale values from the mapping?
     * @return the attributes map provided as input, returned for programming convenience
     */
    public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes,
            boolean removeStaleValues) {
        return calculateChromosomeCounts(vc, attributes, removeStaleValues, new HashSet<String>(0));
    }

    /**
     * Update the attributes of the attributes map given the VariantContext to reflect the
     * proper chromosome-based VCF tags
     *
     * @param vc          the VariantContext
     * @param attributes  the attributes map to populate; must not be null; may contain old values
     * @param removeStaleValues should we remove stale values from the mapping?
     * @param founderIds - Set of founders Ids to take into account. AF and FC will be calculated over the founders.
     *                  If empty or null, counts are generated for all samples as unrelated individuals
     * @return the attributes map provided as input, returned for programming convenience
     */
    public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes,
            boolean removeStaleValues, final Set<String> founderIds) {
        final int AN = vc.getCalledChrCount();

        // if everyone is a no-call, remove the old attributes if requested
        if (AN == 0 && removeStaleValues) {
            if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY))
                attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
            if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY))
                attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
            if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY))
                attributes.remove(VCFConstants.ALLELE_NUMBER_KEY);
            return attributes;
        }

        if (vc.hasGenotypes()) {
            attributes.put(VCFConstants.ALLELE_NUMBER_KEY, AN);

            // if there are alternate alleles, record the relevant tags
            if (!vc.getAlternateAlleles().isEmpty()) {
                ArrayList<Double> alleleFreqs = new ArrayList<Double>();
                ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
                ArrayList<Integer> foundersAlleleCounts = new ArrayList<Integer>();
                double totalFoundersChromosomes = (double) vc.getCalledChrCount(founderIds);
                int foundersAltChromosomes;
                for (Allele allele : vc.getAlternateAlleles()) {
                    foundersAltChromosomes = vc.getCalledChrCount(allele, founderIds);
                    alleleCounts.add(vc.getCalledChrCount(allele));
                    foundersAlleleCounts.add(foundersAltChromosomes);
                    if (AN == 0) {
                        alleleFreqs.add(0.0);
                    } else {
                        final Double freq = (double) foundersAltChromosomes / totalFoundersChromosomes;
                        alleleFreqs.add(freq);
                    }
                }

                attributes.put(VCFConstants.ALLELE_COUNT_KEY,
                        alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
                attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY,
                        alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
            } else {
                // if there's no alt AC and AF shouldn't be present
                attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
                attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
            }
        }

        return attributes;
    }

    /**
     * Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
     * chromosome-based VCF tags based on the current VC produced by builder.make()
     *
     * @param builder     the VariantContextBuilder we are updating
     * @param removeStaleValues should we remove stale values from the mapping?
     */
    public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) {
        VariantContext vc = builder.make();
        builder.attributes(calculateChromosomeCounts(vc, new HashMap<>(vc.getAttributes()), removeStaleValues,
                new HashSet<>(0)));
    }

    /**
     * Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
     * chromosome-based VCF tags based on the current VC produced by builder.make()
     *
     * @param builder     the VariantContextBuilder we are updating
     * @param founderIds - Set of founders to take into account. AF and FC will be calculated over the founders only.
     *                   If empty or null, counts are generated for all samples as unrelated individuals
     * @param removeStaleValues should we remove stale values from the mapping?
     */
    public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues,
            final Set<String> founderIds) {
        VariantContext vc = builder.make();
        builder.attributes(
                calculateChromosomeCounts(vc, new HashMap<>(vc.getAttributes()), removeStaleValues, founderIds));
    }

    public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
        VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field);
        if (metaData == null)
            metaData = header.getInfoHeaderLine(field);
        if (metaData == null) {
            if (ASSUME_MISSING_FIELDS_ARE_STRINGS) {
                if (!MISSING_KEYS_WARNED_ABOUT.contains(field)) {
                    MISSING_KEYS_WARNED_ABOUT.add(field);
                    if (GeneralUtils.DEBUG_MODE_ENABLED)
                        System.err.println("Field " + field
                                + " missing from VCF header, assuming it is an unbounded string type");
                }
                return new VCFInfoHeaderLine(field, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String,
                        "Auto-generated string header for " + field);
            } else
                throw new TribbleException(
                        "Fully decoding VariantContext requires header line for all fields, but none was found for "
                                + field);
        }
        return metaData;
    }

    /**
     * A simple but common wrapper for matching {@link VariantContext} objects using JEXL expressions
     */
    public static class JexlVCMatchExp {
        public String name;
        public Expression exp;

        /**
         * Create a new matcher expression with name and JEXL expression exp
         * @param name name
         * @param exp  expression
         * @throws IllegalArgumentException if either argument is {@code null}
         */
        public JexlVCMatchExp(String name, Expression exp) {
            if (name == null) {
                throw new IllegalArgumentException("Cannot create JexlVCMatchExp with null name.");
            }
            if (exp == null) {
                throw new IllegalArgumentException("Cannot create JexlVCMatchExp with null expression.");
            }

            this.name = name;
            this.exp = exp;
        }
    }

    /**
     * Method for creating JexlVCMatchExp from input walker arguments names and exps.  These two arrays contain
     * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
     * a list of JexlVCMatchExp, in order, that correspond to the names and exps.  These are suitable input to
     * match() below.
     *
     * @param names names
     * @param exps  expressions
     * @return list of matches
     */
    public static List<JexlVCMatchExp> initializeMatchExps(String[] names, String[] exps) {
        if (names == null || exps == null)
            throw new IllegalArgumentException("BUG: neither names nor exps can be null: names "
                    + Arrays.toString(names) + " exps=" + Arrays.toString(exps));

        if (names.length != exps.length)
            throw new IllegalArgumentException(
                    "Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names)
                            + " exps=" + Arrays.toString(exps));

        Map<String, String> map = new HashMap<String, String>();
        for (int i = 0; i < names.length; i++) {
            map.put(names[i], exps[i]);
        }

        return VariantContextUtils.initializeMatchExps(map);
    }

    /**
     * Method for creating JexlVCMatchExp from input walker arguments names and exps.  These two lists contain
     * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
     * a list of JexlVCMatchExp, in order, that correspond to the names and exps.  These are suitable input to
     * match() below.
     *
     * @param names names
     * @param exps  expressions
     * @return list of matches
     */
    public static List<JexlVCMatchExp> initializeMatchExps(List<String> names, List<String> exps) {
        String[] nameArray = new String[names.size()];
        String[] expArray = new String[exps.size()];
        return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray));
    }

    /**
     * Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps.  These two arrays contain
     * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
     * a list of JexlVCMatchExp, in order, that correspond to the names and exps.  These are suitable input to
     * match() below.
     *
     * @param names_and_exps mapping of names to expressions
     * @return list of matches
     */
    public static List<JexlVCMatchExp> initializeMatchExps(Map<String, String> names_and_exps) {
        List<JexlVCMatchExp> exps = new ArrayList<>();

        for (Map.Entry<String, String> elt : names_and_exps.entrySet()) {
            String name = elt.getKey();
            String expStr = elt.getValue();

            if (name == null || expStr == null)
                throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr);
            try {
                Expression exp = engine.get().createExpression(expStr);
                exps.add(new JexlVCMatchExp(name, exp));
            } catch (Exception e) {
                throw new IllegalArgumentException("Argument " + name + "has a bad value. Invalid expression used ("
                        + expStr + "). Please see the JEXL docs for correct syntax.");
            }
        }

        return exps;
    }

    /**
     * Returns true if {@code exp} match {@code vc}.
     * See {@link #match(VariantContext, Collection)} for full docs.
     * @param vc    variant context
     * @param exp   expression
     * @return      true if there is a match
     */
    public static boolean match(VariantContext vc, JexlVCMatchExp exp) {
        return match(vc, Collections.singletonList(exp)).get(exp);
    }

    /**
     * Matches each {@link JexlVCMatchExp} exp against the data contained in {@code vc},
     * and returns a map from these expressions to {@code true} (if they matched) or {@code false} (if they didn't).
     * This the best way to apply JEXL expressions to {@link VariantContext} records.
     * Use the various {@code initializeMatchExps()}'s to create the list of {@link JexlVCMatchExp} expressions.
     *
     * @param vc    variant context
     * @param exps  expressions
     * @return      true if there is a match
     */
    public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Collection<JexlVCMatchExp> exps) {
        return new JEXLMap(exps, vc);
    }

    /**
     * Returns true if {@code exp} match {@code vc}, {@code g}.
     * See {@link #match(VariantContext, Genotype, Collection)} for full docs.
     * @param vc    variant context
     * @param g     genotype
     * @param exp   expression
     * @return      true if there is a match
     */
    public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) {
        return match(vc, g, Collections.singletonList(exp)).get(exp);
    }

    /**
     * Matches each {@link JexlVCMatchExp} exp against the data contained in {@code vc}, {@code g},
     * and returns a map from these expressions to {@code true} (if they matched) or {@code false} (if they didn't).
     * This the best way to apply JEXL expressions to {@link VariantContext} records.
     * Use the various {@code initializeMatchExps()}'s to create the list of {@link JexlVCMatchExp} expressions.
     *
     * @param vc    variant context
     * @param g     genotype
     * @param exps  expressions
     * @return      true if there is a match
     */
    public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Genotype g,
            Collection<JexlVCMatchExp> exps) {
        return new JEXLMap(exps, vc, g);
    }

    /**
     * Answers if the provided variant is transitional (otherwise, it's transversional).
     * Transitions:
     * A->G
     * G->A
     * C->T
     * T->C
     * <p/>
     * Transversions:
     * A->C
     * A->T
     * C->A
     * C->G
     * G->C
     * G->T
     * T->A
     * T->G
     *
     * @param vc a biallelic polymorphic SNP
     * @return true if a transition and false if transversion
     * @throws IllegalArgumentException if vc is monomorphic, not a SNP or not bi-allelic.
        
      */
    static public boolean isTransition(final VariantContext vc) throws IllegalArgumentException {
        final byte refAllele = vc.getReference().getBases()[0];
        final Collection<Allele> altAlleles = vc.getAlternateAlleles();

        if (vc.getType() == VariantContext.Type.NO_VARIATION) {
            throw new IllegalArgumentException("Variant context is monomorphic: " + vc.toString());
        }

        if (vc.getType() != VariantContext.Type.SNP) {
            throw new IllegalArgumentException("Variant context is not a SNP: " + vc.toString());
        }

        if (altAlleles.size() != 1) {
            throw new IllegalArgumentException(
                    "Expected exactly 1 alternative Allele. Found: " + altAlleles.size());
        }

        final Byte altAllele = altAlleles.iterator().next().getBases()[0];

        return (refAllele == 'A' && altAllele == 'G') || (refAllele == 'G' && altAllele == 'A')
                || (refAllele == 'C' && altAllele == 'T') || (refAllele == 'T' && altAllele == 'C');
    }

    /**
     * Returns a newly allocated VC that is the same as VC, but without genotypes
     * @param vc  variant context
     * @return  new VC without genotypes
     */
    public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
        return new VariantContextBuilder(vc).noGenotypes().make();
    }

    /**
     * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
     * @param vcs  collection of VCs
     * @return new VCs without genotypes
     */
    public static Collection<VariantContext> sitesOnlyVariantContexts(Collection<VariantContext> vcs) {
        List<VariantContext> r = new ArrayList<VariantContext>();
        for (VariantContext vc : vcs)
            r.add(sitesOnlyVariantContext(vc));
        return r;
    }

    public static int getSize(VariantContext vc) {
        return vc.getEnd() - vc.getStart() + 1;
    }

    public static Set<String> genotypeNames(final Collection<Genotype> genotypes) {
        final Set<String> names = new HashSet<String>(genotypes.size());
        for (final Genotype g : genotypes)
            names.add(g.getSampleName());
        return names;
    }

    /**
     * Compute the end position for this VariantContext from the alleles themselves
     *
     * In the trivial case this is a single BP event and end = start (open intervals)
     * In general the end is start + ref length - 1, handling the case where ref length == 0
     * However, if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases
     *
     * @param alleles the list of alleles to consider.  The reference allele must be the first one
     * @param start the known start position of this event
     * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic.  Can be -1
     *                              if no is expected but will throw an error if one is found
     * @return this builder
     */
    public static int computeEndFromAlleles(final List<Allele> alleles, final int start,
            final int endForSymbolicAlleles) {
        final Allele ref = alleles.get(0);

        if (ref.isNonReference())
            throw new IllegalStateException("computeEndFromAlleles requires first allele to be reference");

        if (VariantContext.hasSymbolicAlleles(alleles)) {
            if (endForSymbolicAlleles == -1)
                throw new IllegalStateException(
                        "computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided");
            return endForSymbolicAlleles;
        } else {
            return start + Math.max(ref.length() - 1, 0);
        }
    }

}