org.icgc.dcc.release.job.annotate.converter.SnpEffVCFToICGCConverter.java Source code

Introduction

Here is the source code for org.icgc.dcc.release.job.annotate.converter.SnpEffVCFToICGCConverter.java
Source

/*
 * Copyright (c) 2014 The Ontario Institute for Cancer Research. All rights reserved.                             
 *                                                                                                               
 * This program and the accompanying materials are made available under the terms of the GNU Public License v3.0.
 * You should have received a copy of the GNU General Public License along with                                  
 * this program. If not, see <http://www.gnu.org/licenses/>.                                                     
 *                                                                                                               
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY                           
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES                          
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT                           
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                                
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED                          
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;                               
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER                              
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN                         
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.icgc.dcc.release.job.annotate.converter;

import static com.google.common.base.Strings.isNullOrEmpty;
import static lombok.AccessLevel.PRIVATE;
import static org.icgc.dcc.common.core.model.SpecialValue.NO_VALUE;
import static org.icgc.dcc.release.job.annotate.model.ParseNotification.WARNING_REF_DOES_NOT_MATCH_GENOME;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import lombok.Getter;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.val;
import lombok.extern.slf4j.Slf4j;

import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.icgc.dcc.release.job.annotate.model.AnnotatedFileType;
import org.icgc.dcc.release.job.annotate.model.SecondaryEntity;
import org.icgc.dcc.release.job.annotate.model.SnpEffect;
import org.icgc.dcc.release.job.annotate.parser.SnpEffectParser;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;

/**
 * Converts snpEff annotated variants to a list of {@link SecondaryEntity}
 */
@Slf4j
@RequiredArgsConstructor
public class SnpEffVCFToICGCConverter {

    public static final String INFO_EFF_FIELD = "EFF";
    private static final String INFO_PRIM_FIELD = "PRIM";

    /**
     * proteinDomainAffected does not have data to be populated. See SsmSecondaryRecord class for more details
     */
    private static final String PROTEIN_AFFECTED_DOMAIN_VALUE = null;
    private static final String MISSING_DATA = (String) NO_VALUE;
    private static final String SSM_NOTE = MISSING_DATA;

    @NonNull
    private final String geneBuildVersion;

    public List<SecondaryEntity> convert(VariantContext variant, AnnotatedFileType fileType) {
        if (!variant.hasAttribute(INFO_PRIM_FIELD)) {
            log.warn("The unique identificator is missing in variant {}", variant);

            return Collections.emptyList();
        }

        val effects = retrieveUniqueCancerEffects(variant);

        return createSecondaryEntities(effects, geneBuildVersion, variant);
    }

    private static List<SecondaryEntity> createSecondaryEntities(Collection<SnpEffect> effects,
            String geneBuildVersion, VariantContext variant) {
        val result = new ImmutableList.Builder<SecondaryEntity>();

        for (val effect : effects) {
            result.add(createSecondaryEntity(effect, geneBuildVersion,
                    getObservationId(variant.getAttribute(INFO_PRIM_FIELD))));
        }

        return result.build();
    }

    private static String getObservationId(Object observationId) {
        @SuppressWarnings("unchecked")
        val observationIdList = (observationId instanceof List) ? (List<String>) observationId
                : Arrays.asList((String) observationId);

        return observationIdList.get(0);
    }

    private static SecondaryEntity createSecondaryEntity(SnpEffect effect, String genBuildVersion, String id) {
        return SecondaryEntity.builder().consequenceType(getValue(effect.getConsequenceType().getConsequenceName()))
                .aaMutation(getValue(effect.getAminoAcidChange())).cdsMutation(getValue(effect.getCodonChange()))
                .proteinDomainAffected(getValue(PROTEIN_AFFECTED_DOMAIN_VALUE))
                .geneAffected(getValue(effect.getGeneName())).transcriptAffected(getValue(effect.getTranscriptID()))
                .geneBuildVersion(genBuildVersion).note(SSM_NOTE).observationId(id).build();
    }

    private static String getValue(String input) {
        return (isNullOrEmpty(input)) ? MISSING_DATA : input;
    }

    /**
     * Parses {@code variant} and returns a collections of unique effects sorted by priority and limited by the most
     * important.
     */
    private static Collection<SnpEffect> retrieveUniqueCancerEffects(VariantContext variant) {
        log.debug("Processing variant: {}", variant);
        if (!variant.hasAttribute(INFO_EFF_FIELD)) {
            log.warn("No snpEff annotation found in variant. Skipping. Variant: {}", variant);

            return Collections.emptyList();
        }

        val effects = extractEffects(variant);
        val result = new ImmutableList.Builder<SnpEffect>();

        for (val effectAnnotation : effects) {
            result.addAll(filterMalformedEffects(SnpEffectParser.parse(effectAnnotation), variant));
        }

        return filterEffects(variant, result.build());
    }

    /**
     * Filters out malformed {@link SnpEffect}s.
     * 
     * @return a list of valid effects
     */
    private static List<SnpEffect> filterMalformedEffects(List<SnpEffect> effects, VariantContext variant) {
        val result = new ImmutableList.Builder<SnpEffect>();

        for (val effect : effects) {
            if (isValidEffect(effect, variant)) {
                result.add(effect);
            }
        }

        return result.build();
    }

    private static boolean isValidEffect(SnpEffect effect, VariantContext variant) {
        if (effect.hasError()) {
            // FIXME: [DCC-2578] Confirm with Junjun when a malformed effect can be still added to a ssm_s.txt.
            if (effect.containsAnyError(WARNING_REF_DOES_NOT_MATCH_GENOME)) {
                log.error("Skipping malformed effect: '{}'", effect);

                return false;
            }
            // TODO: Change to other level once this information is required for data mining etc.
            log.debug("Adding effect with warning or error: '{}'", effect);
        }

        return true;
    }

    /**
     * Retrieves all annotated variants from the INFO field. Returns as a list of individual annotations. Each list entity
     * is an unparsed (not broken down to fields) snpEff annotation.
     */
    @SuppressWarnings("unchecked")
    private static List<String> extractEffects(VariantContext variant) {
        val allEffects = variant.getAttribute(INFO_EFF_FIELD);

        return (allEffects instanceof List) ? (List<String>) allEffects : Arrays.asList((String) allEffects);
    }

    /**
     * Groups annotated effects at the transcript or gene level if the transcript one is not available. Selects one effect
     * from the group based on the {@link ConsequenceType}<br>
     * <br>
     * Skips variations which are not not mutations. E.g. controlSample equal to tumourSample (same nucleotide mutated
     * into the same nucleotide).
     * @param variant - annotated variant
     * @param individualEffects - effects retrieved from the {@code vc}
     * @return effects sorted by priority and limited by the most important
     * @see <a href="https://wiki.oicr.on.ca/x/sg6RAw">Effect selection criterias</a>
     */
    private static Collection<SnpEffect> filterEffects(VariantContext variant, List<SnpEffect> individualEffects) {
        val controlGenotype = parseGenotype(variant, SampleType.CONTROL_SAMPLE.getName());
        val tumourGenotype = parseGenotype(variant, SampleType.DONOR_SAMPLE.getName());

        if (!isMutation(controlGenotype, tumourGenotype)) {
            log.warn("No mutations found based on the genotype info. Variant: {}", variant);

            return Collections.emptyList();
        }

        val predictedGenotypeNumber = predictGenotypeNumber(controlGenotype, tumourGenotype);
        val uniqueEffects = filterMatchingEffects(individualEffects, predictedGenotypeNumber);
        val transcriptIdMap = groupEffects(uniqueEffects);
        val result = getFilteredEffects(transcriptIdMap);

        if (result.isEmpty()) {
            log.warn("No effect found for this mutation. Skipping. Variant: {}", variant);
        }

        return result;
    }

    /**
     * Checks if {@code mutationFrom} and {@code mutationTo} represent a mutation.<br>
     * <br>
     * It is not a mutation if both arguments the same. There is no mutation in representation A>A. Must be something like
     * A>G. What means allele A mutated to allele G.
     */
    // TODO: move this to ssm_s to vcf generation class. It's more efficient to check this before the annotation step.
    private static boolean isMutation(String mutationFrom, String mutationTo) {
        return !mutationFrom.equals(mutationTo);
    }

    /**
     * Gets genotype (control or sample) from {@code variant} by {@code genotypeName}. Transforms its name to numeric
     * format which is generated by snpEff. The genotype number corresponds to the allele position in REF or ALT fields in
     * the VCF file. The method returns a single position number of this genotype.
     * 
     * We know that left and right sides of a genotype are always equal, because this is enforced by the submission
     * validator and it looks like "0/0  1/1". So, the method will return either "0" or "1" depending on the
     * {@code genotypeName}
     */
    private static String parseGenotype(VariantContext variant, String genotypeName) {
        val genotype = variant.getGenotype(genotypeName);
        checkPhasing(genotype);

        return transformGenotypeNotion(variant, genotype).get(0);
    }

    /**
     * Converts {@code genotype}, which looks like [T/T] to VCF format. E.g. 1/1
     */
    private static List<String> transformGenotypeNotion(VariantContext variant, Genotype genotype) {
        val result = new ImmutableList.Builder<String>();
        val leftAlleleIndex = 0;
        val rightAlleleIndex = 1;
        result.add(String.valueOf(variant.getAlleleIndex(genotype.getAllele(leftAlleleIndex))));
        result.add(String.valueOf(variant.getAlleleIndex(genotype.getAllele(rightAlleleIndex))));

        return result.build();
    }

    private static void checkPhasing(Genotype genotype) {
        if (genotype.isPhased()) {
            throw new UnsupportedOperationException(
                    "Phased genotype is not currently supported. Genotype: " + genotype);
        }
    }

    /**
     * Returns a set of effects sorted by priority and limited by 1
     */
    private static Set<SnpEffect> getFilteredEffects(Multimap<String, SnpEffect> transcriptIdMap) {
        val result = new ImmutableSet.Builder<SnpEffect>();
        for (val transcriptId : transcriptIdMap.keySet()) {
            val effectGroup = Lists.newArrayList(transcriptIdMap.get(transcriptId));
            // order by importance
            Collections.sort(effectGroup, Collections.reverseOrder());
            // limit by 1
            result.add(effectGroup.get(0));
        }

        return result.build();
    }

    /**
     * Groups {@code effects} by transcript or by gene affected if transcript is unavailable.
     */
    private static Multimap<String, SnpEffect> groupEffects(Collection<SnpEffect> effects) {
        Multimap<String, SnpEffect> transcriptIdMap = Multimaps.index(effects, new Function<SnpEffect, String>() {

            @Override
            public String apply(@NonNull SnpEffect item) {
                return !item.getTranscriptID().isEmpty() ? item.getTranscriptID() : item.getGeneName();
            }

        });

        return transcriptIdMap;
    }

    /**
     * Filters {@code individualEffects}. Returns only those that whose {@code cancerID} matches {@code mutationSet}
     */
    private static Collection<SnpEffect> filterMatchingEffects(List<SnpEffect> individualEffects,
            final String mutationSet) {
        Collection<SnpEffect> uniqueEffects = Collections2.filter(individualEffects, new Predicate<SnpEffect>() {

            @Override
            public boolean apply(SnpEffect effect) {
                if (effect != null && mutationSet.equals(effect.getCancerID())) {
                    return true;
                }

                return false;
            }

        });
        return uniqueEffects;
    }

    /**
     * Predicts what Genotype_Number should be valid for the {@code control} / {@code tumour} combination
     */
    private static String predictGenotypeNumber(String control, String tumour) {
        val separator = "-";
        val referenceAllele = "0";

        if (control.equals(referenceAllele)) {
            return tumour;
        } else {
            return tumour + separator + control;
        }
    }

    @Getter
    @RequiredArgsConstructor(access = PRIVATE)
    public static enum SampleType {
        CONTROL_SAMPLE("Patient_01_Germline"), DONOR_SAMPLE("Patient_01_Somatic");

        @NonNull
        private final String name;

    }

}