org.opencb.opencga.storage.hadoop.variant.converters.HBaseToVariantConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.opencb.opencga.storage.hadoop.variant.converters.HBaseToVariantConverter.java

Source

/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.hadoop.variant.converters;

import com.google.common.collect.BiMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.client.Result;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.models.variant.protobuf.VariantProto;
import org.opencb.biodata.models.variant.stats.VariantStats;
import org.opencb.biodata.tools.variant.converters.Converter;
import org.opencb.biodata.tools.variant.merge.VariantMerger;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.adaptors.VariantField;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableHelper;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableStudyRow;
import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseStudyConfigurationManager;
import org.opencb.opencga.storage.hadoop.variant.converters.annotation.HBaseToVariantAnnotationConverter;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.converters.stats.HBaseToVariantStatsConverter;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.SampleList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.Map.Entry;

/**
 * Converts rows of the HBase variants table into {@link Variant} objects.
 *
 * <p>A row can be supplied as an HBase {@link Result}, as a JDBC {@link ResultSet} whose columns are named
 * after {@link VariantPhoenixHelper.VariantColumn}, or as an already parsed {@link VariantTableStudyRow}.
 * Every {@link VariantTableStudyRow} becomes one {@link StudyEntry} with two sample fields
 * ({@link VariantMerger#GT_KEY} and {@link VariantMerger#GENOTYPE_FILTER_KEY}). Samples with no genotype
 * stored in the row are reported as {@link VariantTableStudyRow#HOM_REF}.
 *
 * <p>While converting, the pass count and the hom-ref count stored in the row are compared against the values
 * recomputed from the row content. A mismatch is logged as a warning, or raised as an
 * {@link IllegalStateException} when {@link #setFailOnWrongVariants(boolean)} is enabled.
 *
 * <p>NOTE(review): each instance keeps an unsynchronized per-study cache of sample positions, so instances are
 * presumably not meant to be shared between threads -- confirm against callers.
 *
 * Created on 20/11/15.
 *
 * @author Jacobo Coll &lt;jacobo167@gmail.com&gt;
 */
public class HBaseToVariantConverter implements Converter<Result, Variant> {

    private static final Logger logger = LoggerFactory.getLogger(HBaseToVariantConverter.class);

    /** Shared by every converter instance: when true, inconsistent rows raise an exception instead of a warning. */
    private static boolean failOnWrongVariants = false; //FIXME

    private final StudyConfigurationManager scm;
    private final HBaseToVariantAnnotationConverter annotationConverter;
    private final HBaseToVariantStatsConverter statsConverter;
    private final GenomeHelper genomeHelper;
    /** Options used for every study configuration lookup: read only and cached. */
    private final QueryOptions scmOptions = new QueryOptions(StudyConfigurationManager.READ_ONLY, true)
            .append(StudyConfigurationManager.CACHED, true);
    /** Cache of the returned samples position, indexed by study id. Filled lazily. */
    private final Map<Integer, LinkedHashMap<String, Integer>> returnedSamplesPositionMap = new HashMap<>();

    private List<String> returnedSamples = null;

    private boolean studyNameAsStudyId = false;
    private boolean mutableSamplesPosition = true;
    private boolean failOnEmptyVariants = false;
    private boolean simpleGenotypes = false;
    private Set<VariantField> variantFields = null;

    /**
     * Creates a converter that reads the study configurations from the output table of the given helper.
     *
     * @param variantTableHelper Helper providing the output table name and the Hadoop configuration
     * @throws IOException if the {@link HBaseStudyConfigurationManager} can not be created
     */
    public HBaseToVariantConverter(VariantTableHelper variantTableHelper) throws IOException {
        this(variantTableHelper, new HBaseStudyConfigurationManager(variantTableHelper.getOutputTableAsString(),
                variantTableHelper.getConf(), new ObjectMap()));
    }

    /**
     * Creates a converter using an explicit study configuration manager.
     *
     * @param genomeHelper Helper used to parse row keys and columns
     * @param scm          Source of the {@link StudyConfiguration} of each converted study
     */
    public HBaseToVariantConverter(GenomeHelper genomeHelper, StudyConfigurationManager scm) {
        this.genomeHelper = genomeHelper;
        this.scm = scm;
        this.annotationConverter = new HBaseToVariantAnnotationConverter(genomeHelper);
        this.statsConverter = new HBaseToVariantStatsConverter(genomeHelper);
    }

    /**
     * Sets the samples to include in the converted study entries.
     *
     * <p>The sample positions are cached per study the first time a study is converted, so this method has no
     * effect on studies that were already converted by this instance.
     *
     * @param returnedSamples Sample names to return. {@code null} is forwarded as-is to
     *                        {@code StudyConfiguration.getReturnedSamplesPosition} (presumably meaning
     *                        "all indexed samples" -- confirm against that method)
     * @return this
     */
    public HBaseToVariantConverter setReturnedSamples(List<String> returnedSamples) {
        this.returnedSamples = returnedSamples;
        return this;
    }

    /**
     * Sets the variant fields to return. The value is also forwarded to the annotation converter.
     *
     * @param fields Fields to return
     * @return this
     */
    public HBaseToVariantConverter setReturnedFields(Set<VariantField> fields) {
        variantFields = fields;
        annotationConverter.setReturnedFields(fields);
        return this;
    }

    /**
     * Selects the identifier used for the generated study entries.
     *
     * @param studyNameAsStudyId {@code true} to use the study name, {@code false} to use the numeric study id
     * @return this
     */
    public HBaseToVariantConverter setStudyNameAsStudyId(boolean studyNameAsStudyId) {
        this.studyNameAsStudyId = studyNameAsStudyId;
        return this;
    }

    /**
     * Selects whether every study entry gets its own copy of the samples position map.
     *
     * @param mutableSamplesPosition {@code true} to hand a fresh copy to each study entry, {@code false} to
     *                               share the cached map between all the study entries of the same study
     * @return this
     */
    public HBaseToVariantConverter setMutableSamplesPosition(boolean mutableSamplesPosition) {
        this.mutableSamplesPosition = mutableSamplesPosition;
        return this;
    }

    /**
     * Selects whether converting a variant without rows (or without studies) is an error.
     *
     * @param failOnEmptyVariants {@code true} to throw an {@link IllegalStateException} on empty variants
     * @return this
     */
    public HBaseToVariantConverter setFailOnEmptyVariants(boolean failOnEmptyVariants) {
        this.failOnEmptyVariants = failOnEmptyVariants;
        return this;
    }

    /**
     * Selects whether complex genotypes holding a comma separated list are reduced to their first element.
     *
     * @param simpleGenotypes {@code true} to return only the first genotype of each list
     * @return this
     */
    public HBaseToVariantConverter setSimpleGenotypes(boolean simpleGenotypes) {
        this.simpleGenotypes = simpleGenotypes;
        return this;
    }

    /**
     * Converts an HBase result into a variant, including its annotation, stats and study entries.
     *
     * @param result HBase row
     * @return Converted variant
     */
    @Override
    public Variant convert(Result result) {
        VariantAnnotation annotation = annotationConverter.convert(result);
        Map<Integer, Map<Integer, VariantStats>> stats = statsConverter.convert(result);
        return convert(genomeHelper.extractVariantFromVariantRowKey(result.getRow()),
                VariantTableStudyRow.parse(result, genomeHelper), stats, annotation);
    }

    /**
     * Converts the current row of a result set into a variant.
     *
     * @param resultSet Result set positioned on the row to convert
     * @return Converted variant
     * @throws SQLException if any column can not be read
     */
    public Variant convert(ResultSet resultSet) throws SQLException {
        Variant variant = new Variant(resultSet.getString(VariantPhoenixHelper.VariantColumn.CHROMOSOME.column()),
                resultSet.getInt(VariantPhoenixHelper.VariantColumn.POSITION.column()),
                resultSet.getString(VariantPhoenixHelper.VariantColumn.REFERENCE.column()),
                resultSet.getString(VariantPhoenixHelper.VariantColumn.ALTERNATE.column()));
        String type = resultSet.getString(VariantPhoenixHelper.VariantColumn.TYPE.column());
        if (StringUtils.isNotBlank(type)) {
            variant.setType(VariantType.valueOf(type));
        }
        try {
            Map<Integer, Map<Integer, VariantStats>> stats = statsConverter.convert(resultSet);
            VariantAnnotation annotation = annotationConverter.convert(resultSet);
            return convert(variant, VariantTableStudyRow.parse(variant, resultSet, genomeHelper), stats,
                    annotation);
        } catch (RuntimeException e) {
            // Only adds the failing variant to the log; the exception itself is propagated untouched.
            logger.error("Fail to parse variant: {}", variant);
            throw e;
        }
    }

    /**
     * Converts a single study row into a variant without stats and with an empty annotation.
     *
     * @param row Study row
     * @return Converted variant
     */
    public Variant convert(VariantTableStudyRow row) {
        return convert(new Variant(row.getChromosome(), row.getPos(), row.getRef(), row.getAlt()),
                Collections.singletonList(row), Collections.emptyMap(), null);

    }

    /**
     * Completes the given variant with one study entry per row, the annotation and the variant id.
     *
     * @param variant    Variant to fill. Modified and returned
     * @param rows       One row per study
     * @param stats      Stats indexed by study id and then by cohort id
     * @param annotation Variant annotation. If {@code null}, an annotation with no consequence types is used
     * @return The same variant instance received
     * @throws IllegalStateException if a study is not found, or if the variant is empty and
     *                               {@code failOnEmptyVariants} is set, or if a row is inconsistent and
     *                               {@code failOnWrongVariants} is set
     */
    protected Variant convert(Variant variant, List<VariantTableStudyRow> rows,
            Map<Integer, Map<Integer, VariantStats>> stats, VariantAnnotation annotation) {
        if (annotation == null) {
            annotation = new VariantAnnotation();
            annotation.setConsequenceTypes(Collections.emptyList());
        }
        if (failOnEmptyVariants && rows.isEmpty()) {
            throw new IllegalStateException("No Row columns supplied for row " + variant);
        }
        for (VariantTableStudyRow row : rows) {
            variant.addStudyEntry(buildStudyEntry(variant, row, stats));
        }
        variant.setAnnotation(annotation);
        // Prefer the id coming from the annotation. Fall back to the variant string representation.
        if (StringUtils.isNotEmpty(annotation.getId())) {
            variant.setId(annotation.getId());
        } else {
            variant.setId(variant.toString());
        }
        if (failOnEmptyVariants && variant.getStudies().isEmpty()) {
            throw new IllegalStateException("No Studies registered for variant!!! " + variant);
        }
        return variant;
    }

    /**
     * Builds the study entry of one study row: samples data, file attributes, secondary alternates and stats.
     *
     * @param variant Variant being converted
     * @param row     Study row
     * @param stats   Stats indexed by study id and then by cohort id
     * @return New study entry
     */
    private StudyEntry buildStudyEntry(Variant variant, VariantTableStudyRow row,
            Map<Integer, Map<Integer, VariantStats>> stats) {
        StudyConfiguration studyConfiguration = getStudyConfiguration(row.getStudyId());

        // It may happen that the study is not loaded yet or no samples are required,
        // so an empty samples position map is not treated as an error.
        LinkedHashMap<String, Integer> returnedSamplesPosition = getReturnedSamplesPosition(studyConfiguration);
        if (mutableSamplesPosition) {
            returnedSamplesPosition = new LinkedHashMap<>(returnedSamplesPosition);
        }

        int loadedSamplesSize = StudyConfiguration.getIndexedSamples(studyConfiguration).size();

        // A new list per study entry, as the entry keeps the reference.
        List<String> format = Arrays.asList(VariantMerger.GT_KEY, VariantMerger.GENOTYPE_FILTER_KEY);
        int ftIdx = format.indexOf(VariantMerger.GENOTYPE_FILTER_KEY);

        Map<String, String> attributesMap = new HashMap<>();
        calculatePassCallRates(row, attributesMap, loadedSamplesSize);

        @SuppressWarnings("unchecked")
        List<String>[] samplesDataArray = new List[returnedSamplesPosition.size()];
        // Ids of every sample with a genotype in this row, either returned or not.
        Set<Integer> sampleWithVariant = new HashSet<>();
        BiMap<Integer, String> mapSampleIds = studyConfiguration.getSampleIds().inverse();

        fillGenotypes(row, returnedSamplesPosition, mapSampleIds, samplesDataArray, sampleWithVariant);
        List<AlternateCoordinate> secondaryAlternates = readSecondaryAlternates(variant, row);
        fillComplexGenotypes(row, returnedSamplesPosition, mapSampleIds, samplesDataArray, sampleWithVariant);
        fillGaps(samplesDataArray);
        int nonPassSamples = applyNonPassFilters(row, returnedSamplesPosition, mapSampleIds, samplesDataArray,
                ftIdx);

        checkPassCount(row, loadedSamplesSize - nonPassSamples, loadedSamplesSize);
        checkHomRefCount(variant, row, loadedSamplesSize - sampleWithVariant.size(), samplesDataArray.length);

        List<List<String>> samplesData = Arrays.asList(samplesDataArray);

        StudyEntry studyEntry;
        if (studyNameAsStudyId) {
            studyEntry = new StudyEntry(studyConfiguration.getStudyName());
        } else {
            studyEntry = new StudyEntry(Integer.toString(studyConfiguration.getStudyId()));
        }
        studyEntry.setSortedSamplesPosition(returnedSamplesPosition);
        studyEntry.setSamplesData(samplesData);
        studyEntry.setFormat(format);
        studyEntry.setFiles(Collections.singletonList(new FileEntry("", "", attributesMap)));
        studyEntry.setSecondaryAlternates(secondaryAlternates);
        addStats(studyEntry, studyConfiguration, stats);
        return studyEntry;
    }

    /**
     * Reads the configuration of a study, failing if the study does not exist.
     *
     * @param studyId Study id
     * @return Study configuration
     * @throws IllegalStateException if no study is found
     */
    private StudyConfiguration getStudyConfiguration(Integer studyId) {
        QueryResult<StudyConfiguration> queryResult = scm.getStudyConfiguration(studyId, scmOptions);
        if (queryResult.getResult().isEmpty()) {
            throw new IllegalStateException("No study found for study ID: " + studyId);
        }
        return queryResult.first();
    }

    /**
     * Stores the genotype of every returned sample listed in the genotype columns of the row.
     *
     * @param row                     Study row
     * @param returnedSamplesPosition Position of each returned sample
     * @param mapSampleIds            Sample names indexed by sample id
     * @param samplesData             Output array, indexed by sample position
     * @param sampleWithVariant       Output set with the id of every sample found, returned or not
     */
    private void fillGenotypes(VariantTableStudyRow row, LinkedHashMap<String, Integer> returnedSamplesPosition,
            BiMap<Integer, String> mapSampleIds, List<String>[] samplesData, Set<Integer> sampleWithVariant) {
        for (String genotype : row.getGenotypes()) {
            sampleWithVariant.addAll(row.getSampleIds(genotype));
            if (genotype.equals(VariantTableStudyRow.OTHER)) {
                continue; // skip OTHER -> see Complex type
            }
            for (Integer sampleId : row.getSampleIds(genotype)) {
                Integer samplePosition = getSamplePosition(returnedSamplesPosition, mapSampleIds, sampleId);
                if (samplePosition == null) {
                    continue; // Sample may not be required. Ignore this sample.
                }
                samplesData[samplePosition] = Arrays.asList(genotype, VariantMerger.PASS_VALUE);
            }
        }
    }

    /**
     * Converts the secondary alternates of the row, defaulting unset coordinates to the ones of the variant.
     *
     * @param variant Variant providing the default chromosome, start and end
     * @param row     Study row
     * @return Secondary alternates. Empty if the row has none
     */
    private List<AlternateCoordinate> readSecondaryAlternates(Variant variant, VariantTableStudyRow row) {
        List<VariantProto.AlternateCoordinate> protoAlternates =
                row.getComplexVariant().getSecondaryAlternatesList();
        int secondaryAlternatesCount = row.getComplexVariant().getSecondaryAlternatesCount();
        List<AlternateCoordinate> secondaryAlternates = new ArrayList<>(secondaryAlternatesCount);
        if (secondaryAlternatesCount > 0) {
            for (VariantProto.AlternateCoordinate altcoord : protoAlternates) {
                VariantType vart = VariantType.valueOf(altcoord.getType().name());
                // An empty chromosome or a zero start/end is taken as "same as the main variant".
                String chr = StringUtils.isEmpty(altcoord.getChromosome()) ? variant.getChromosome()
                        : altcoord.getChromosome();
                Integer start = altcoord.getStart() == 0 ? variant.getStart() : altcoord.getStart();
                Integer end = altcoord.getEnd() == 0 ? variant.getEnd() : altcoord.getEnd();
                String reference = StringUtils.isEmpty(altcoord.getReference()) ? "" : altcoord.getReference();
                String alternate = StringUtils.isEmpty(altcoord.getAlternate()) ? "" : altcoord.getAlternate();
                secondaryAlternates.add(new AlternateCoordinate(chr, start, end, reference, alternate, vart));
            }
        }
        return secondaryAlternates;
    }

    /**
     * Stores the genotype of every returned sample listed in the complex variant section of the row.
     *
     * @param row                     Study row
     * @param returnedSamplesPosition Position of each returned sample
     * @param mapSampleIds            Sample names indexed by sample id
     * @param samplesData             Output array, indexed by sample position
     * @param sampleWithVariant       Output set with the id of every sample found, returned or not
     */
    private void fillComplexGenotypes(VariantTableStudyRow row,
            LinkedHashMap<String, Integer> returnedSamplesPosition, BiMap<Integer, String> mapSampleIds,
            List<String>[] samplesData, Set<Integer> sampleWithVariant) {
        for (Entry<Integer, String> entry : row.getComplexVariant().getSampleToGenotype().entrySet()) {
            sampleWithVariant.add(entry.getKey());
            Integer samplePosition = getSamplePosition(returnedSamplesPosition, mapSampleIds, entry.getKey());
            if (samplePosition == null) {
                continue; // Sample may not be required. Ignore this sample.
            }
            String genotype = entry.getValue();
            String returnedGenotype;
            // FIXME: Decide what to do with lists of genotypes
            if (simpleGenotypes) {
                returnedGenotype = getSimpleGenotype(genotype);
                logger.debug("Return simplified genotype: {} -> {}", genotype, returnedGenotype);
            } else {
                returnedGenotype = genotype;
            }
            samplesData[samplePosition] = Arrays.asList(returnedGenotype, VariantMerger.PASS_VALUE);
        }
    }

    /**
     * Sets a HOM_REF genotype with a PASS filter on every sample without data.
     *
     * @param samplesData Array to complete, indexed by sample position
     */
    private void fillGaps(List<String>[] samplesData) {
        for (int i = 0; i < samplesData.length; i++) {
            if (samplesData[i] == null) {
                samplesData[i] = Arrays.asList(VariantTableStudyRow.HOM_REF, VariantMerger.PASS_VALUE);
            }
        }
    }

    /**
     * Overwrites the filter field of the returned samples that have a non PASS filter in the row.
     *
     * <p>Requires every position of {@code samplesData} to be filled already.
     *
     * @param row                     Study row
     * @param returnedSamplesPosition Position of each returned sample
     * @param mapSampleIds            Sample names indexed by sample id
     * @param samplesData             Array to update, indexed by sample position
     * @param ftIdx                   Index of the filter field within each sample data list
     * @return Sum of the sample ids counts of all the non PASS filters, returned or not
     */
    private int applyNonPassFilters(VariantTableStudyRow row,
            LinkedHashMap<String, Integer> returnedSamplesPosition, BiMap<Integer, String> mapSampleIds,
            List<String>[] samplesData, int ftIdx) {
        int nonPassSamples = 0;
        for (Entry<String, SampleList> entry : row.getComplexFilter().getFilterNonPass().entrySet()) {
            String filterString = entry.getKey();
            nonPassSamples += entry.getValue().getSampleIdsCount();
            for (Integer id : entry.getValue().getSampleIdsList()) {
                Integer samplePosition = getSamplePosition(returnedSamplesPosition, mapSampleIds, id);
                if (samplePosition == null) {
                    continue; // Sample may not be required. Ignore this sample.
                }
                samplesData[samplePosition].set(ftIdx, filterString);
            }
        }
        return nonPassSamples;
    }

    /**
     * Reports the row as wrong if its stored pass count differs from the recomputed one.
     *
     * @param row               Study row
     * @param passCount         Loaded samples minus the samples with a non PASS filter
     * @param loadedSamplesSize Number of loaded samples in the study
     */
    private void checkPassCount(VariantTableStudyRow row, int passCount, int loadedSamplesSize) {
        if (passCount != row.getPassCount()) {
            String message = String.format(
                    "Error parsing variant %s. Pass count %s does not match filter fill count: %s using %s loaded samples.",
                    row.toString(), row.getPassCount(), passCount, loadedSamplesSize);
            wrongVariant(message);
        }
    }

    /**
     * Reports the row as wrong if its stored hom-ref count differs from the recomputed one.
     *
     * @param variant       Variant being converted. Only used in the message
     * @param row           Study row
     * @param homRefCount   Loaded samples minus the samples with any genotype in the row
     * @param samplesNumber Number of returned samples. Only used in the message
     */
    private void checkHomRefCount(Variant variant, VariantTableStudyRow row, int homRefCount, int samplesNumber) {
        if (homRefCount != row.getHomRefCount()) {
            StringBuilder message = new StringBuilder()
                    .append("Wrong number of HomRef samples for variant ").append(variant)
                    .append(". Got ").append(homRefCount)
                    .append(", expect ").append(row.getHomRefCount())
                    .append(". Samples number: ").append(samplesNumber).append(" , ")
                    .append("'").append(VariantTableStudyRow.HOM_REF).append("':").append(row.getHomRefCount())
                    .append(" , ");
            for (String studyColumn : VariantTableStudyRow.GENOTYPE_COLUMNS) {
                message.append("'").append(studyColumn).append("':").append(row.getSampleIds(studyColumn))
                        .append(" , ");
            }
            wrongVariant(message.toString());
        }
    }

    /**
     * Adds to the study entry the stats of the study, if any, indexed by cohort name.
     *
     * @param studyEntry         Study entry to update
     * @param studyConfiguration Configuration of the study, used to translate cohort ids into names
     * @param stats              Stats indexed by study id and then by cohort id
     */
    private void addStats(StudyEntry studyEntry, StudyConfiguration studyConfiguration,
            Map<Integer, Map<Integer, VariantStats>> stats) {
        Map<Integer, VariantStats> convertedStatsMap = stats.get(studyConfiguration.getStudyId());
        if (convertedStatsMap == null) {
            return;
        }
        BiMap<Integer, String> cohortNames = studyConfiguration.getCohortIds().inverse();
        Map<String, VariantStats> statsMap = new HashMap<>(convertedStatsMap.size());
        for (Entry<Integer, VariantStats> entry : convertedStatsMap.entrySet()) {
            statsMap.put(cohortNames.get(entry.getKey()), entry.getValue());
        }
        studyEntry.setStats(statsMap);
    }

    /**
     * Adds the pass and call counts of the row, and the rates derived from them, to the file attributes.
     *
     * <p>The rates are plain floating point divisions by {@code loadedSamplesSize}, so they are written as
     * {@code NaN} or {@code Infinity} when the study has no loaded samples.
     *
     * @param row               Study row
     * @param attributesMap     Attributes to update
     * @param loadedSamplesSize Number of loaded samples in the study
     */
    private void calculatePassCallRates(VariantTableStudyRow row, Map<String, String> attributesMap,
            int loadedSamplesSize) {
        attributesMap.put("PASS", row.getPassCount().toString());
        attributesMap.put("CALL", row.getCallCount().toString());
        double passRate = row.getPassCount().doubleValue() / loadedSamplesSize;
        double callRate = row.getCallCount().doubleValue() / loadedSamplesSize;
        double opr = passRate * callRate;
        attributesMap.put("PR", String.valueOf(passRate));
        attributesMap.put("CR", String.valueOf(callRate));
        attributesMap.put("OPR", String.valueOf(opr)); // OVERALL pass rate
        attributesMap.put("NS", String.valueOf(loadedSamplesSize)); // Number of Samples
    }

    /**
     * Reduces a comma separated list of genotypes to its first element.
     *
     * <p>Uses {@code indexOf} instead of {@code split}, as {@code split} drops trailing empty strings and
     * returns an empty array for inputs made only of commas.
     *
     * @param genotype Genotype, or comma separated list of genotypes
     * @return Text before the first comma, or the whole value if there is no comma
     */
    private String getSimpleGenotype(String genotype) {
        int comma = genotype.indexOf(',');
        return comma < 0 ? genotype : genotype.substring(0, comma);
    }

    /**
     * Reports an inconsistent row: throws if {@code failOnWrongVariants} is set, logs a warning otherwise.
     *
     * @param message Description of the problem
     */
    private void wrongVariant(String message) {
        if (failOnWrongVariants) {
            throw new IllegalStateException(message);
        } else {
            logger.warn(message);
        }
    }

    /**
     * Gets the position of a sample in the returned samples.
     *
     * @param returnedSamplesPosition Position of each returned sample
     * @param mapSampleIds            Sample names indexed by sample id
     * @param sampleId                Sample id
     * @return Sample position, or {@code null} if the sample is not returned
     */
    private Integer getSamplePosition(LinkedHashMap<String, Integer> returnedSamplesPosition,
            BiMap<Integer, String> mapSampleIds, Integer sampleId) {
        return returnedSamplesPosition.get(mapSampleIds.get(sampleId));
    }

    /**
     * Creates a SORTED MAP with the required samples position.
     *
     * <p>The map is computed once per study id and cached in this converter.
     *
     * @param studyConfiguration Study Configuration
     * @return Sorted linked hash map
     */
    private LinkedHashMap<String, Integer> getReturnedSamplesPosition(StudyConfiguration studyConfiguration) {
        return returnedSamplesPositionMap.computeIfAbsent(studyConfiguration.getStudyId(),
                id -> StudyConfiguration.getReturnedSamplesPosition(
                        studyConfiguration, returnedSamples == null ? null : new LinkedHashSet<>(returnedSamples),
                        StudyConfiguration::getIndexedSamples));
    }

    /**
     * @return {@code true} if inconsistent rows raise an exception instead of a warning
     */
    public static boolean isFailOnWrongVariants() {
        return failOnWrongVariants;
    }

    /**
     * Selects, for every converter instance, whether inconsistent rows raise an exception.
     *
     * @param b {@code true} to throw an {@link IllegalStateException}, {@code false} to log a warning
     */
    public static void setFailOnWrongVariants(boolean b) {
        failOnWrongVariants = b;
    }
}