org.opencb.opencga.storage.mongodb.variant.DBObjectToSamplesConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.opencb.opencga.storage.mongodb.variant.DBObjectToSamplesConverter.java

Source

/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.mongodb.variant;

import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;

import java.util.*;
import java.util.stream.Collectors;

import org.opencb.biodata.models.feature.Genotype;
import org.opencb.biodata.models.variant.VariantSourceEntry;

import org.opencb.datastore.core.QueryOptions;
import org.opencb.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.adaptors.VariantSourceDBAdaptor;
import org.slf4j.LoggerFactory;

import static org.opencb.opencga.storage.mongodb.variant.DBObjectToVariantSourceEntryConverter.FILEID_FIELD;
import static org.opencb.opencga.storage.mongodb.variant.DBObjectToVariantSourceEntryConverter.GENOTYPES_FIELD;
import static org.opencb.opencga.storage.mongodb.variant.DBObjectToVariantSourceEntryConverter.STUDYID_FIELD;

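/**
 * Converts between the genotypes document stored in MongoDB for one study
 * and a map of sample data indexed by sample name.
 *
 * @author Cristina Yenyxe Gonzalez Garcia <cyenyxe@ebi.ac.uk>
 */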
public class DBObjectToSamplesConverter /*implements ComplexTypeConverter<VariantSourceEntry, DBObject>*/ {

    // Key of the genotypes document under which samples with an unknown genotype are stored.
    public static final String UNKNOWN_GENOTYPE = "?/?";

    // Registered study configurations, indexed by study id.
    private final Map<Integer, StudyConfiguration> studyConfigurations;
    // Per-study cache of the map sample name -> sample id, already filtered by "returnedSamples".
    // Do not use directly, entries can be null or missing. Use "getIndexedSamplesIdMap()"
    private final Map<Integer, Map<String, Integer>> __studySamplesId;
    // Per-study cache of the inverse map, sample id -> sample name.
    // Do not use directly, entries can be null or missing. Use "getIndexedIdSamplesMap()"
    private final Map<Integer, Map<Integer, String>> __studyIdSamples;
    // Default genotypes of each study, indexed by study id.
    private final Map<Integer, Set<String>> studyDefaultGenotypeSet;
    // Sample names, or sample ids as strings, to return. An empty set means no filtering.
    private Set<String> returnedSamples;
    // Optional fallback used to read the sample names when a study configuration is not found.
    private VariantSourceDBAdaptor sourceDbAdaptor;
    // Optional manager used to look up study configurations that are not registered yet.
    private StudyConfigurationManager studyConfigurationManager;
    // Genotype returned instead of UNKNOWN_GENOTYPE when reading. Null means no replacement.
    private String returnedUnknownGenotype;

    public static final org.slf4j.Logger logger = LoggerFactory
            .getLogger(DBObjectToSamplesConverter.class.getName());

    /**
     * Creates an empty converter: no study configurations, no sample filter,
     * no study configuration manager and no replacement for unknown genotypes.
     * The other constructors call this one before applying their own setup.
     **/
    DBObjectToSamplesConverter() {
        this.studyConfigurations = new HashMap<>();
        this.studyDefaultGenotypeSet = new HashMap<>();
        this.__studySamplesId = new HashMap<>();
        this.__studyIdSamples = new HashMap<>();
        this.returnedSamples = new HashSet<>();
        this.returnedUnknownGenotype = null;
        this.studyConfigurationManager = null;
    }

    /**
     * Create a converter from DBObject to a Map of samples, providing the list
     * of sample names.
     *
     * @param samples The list of samples, if any
     * @param defaultGenotype
     */
    public DBObjectToSamplesConverter(int studyId, List<String> samples, String defaultGenotype) {
        this();
        setSamples(studyId, samples);
        studyConfigurations.get(studyId).getAttributes().put(MongoDBVariantStorageManager.DEFAULT_GENOTYPE,
                Collections.singleton(defaultGenotype));
        studyDefaultGenotypeSet.put(studyId, Collections.singleton(defaultGenotype));
    }

    /**
     * Creates a converter that resolves study configurations on demand.
     *
     * @param studyConfigurationManager Used to look up the configuration of studies that are not registered yet
     * @param variantSourceDBAdaptor    Used to read the sample names when no study configuration is found
     */
    public DBObjectToSamplesConverter(StudyConfigurationManager studyConfigurationManager,
            VariantSourceDBAdaptor variantSourceDBAdaptor) {
        this();
        this.studyConfigurationManager = studyConfigurationManager;
        this.sourceDbAdaptor = variantSourceDBAdaptor;
    }

    /**
     * Creates a converter for a single study configuration.
     * Delegates to the list based constructor with a singleton list.
     *
     * @param studyConfiguration Study configuration to register
     */
    public DBObjectToSamplesConverter(StudyConfiguration studyConfiguration) {
        this(Collections.singletonList(studyConfiguration));
    }

    /**
     * Creates a converter and registers every given study configuration,
     * in the order of the list.
     *
     * @param studyConfigurations Study configurations to register
     */
    public DBObjectToSamplesConverter(List<StudyConfiguration> studyConfigurations) {
        this();
        for (StudyConfiguration configuration : studyConfigurations) {
            addStudyConfiguration(configuration);
        }
    }

    //    @Override
    /**
     * Converts the genotypes of one study, as stored in MongoDB, into a map of
     * sample data indexed by sample name. Each sample data map contains, at
     * most, the "GT" field.
     *
     * Every returned sample starts with the first default genotype of the
     * study (replaced by the "returned unknown genotype" if that default is
     * {@link #UNKNOWN_GENOTYPE}). Then each entry of the genotypes document,
     * such as
     * "0|1" : [ 41, 311, 342, 358, 881, 898, 903 ]
     * overwrites the genotype of the samples whose ids are listed.
     *
     * If the study is not registered, its configuration is looked up first,
     * provided a study configuration manager is available.
     *
     * @param object  Document containing the genotypes field (and the file id, used as lookup fallback)
     * @param studyId Identifier of the study the document belongs to
     * @return Sample data by sample name. Empty if the study is unknown or has no samples to return
     */
    public Map<String, Map<String, String>> convertToDataModelType(DBObject object, int studyId) {
        // Samples not set as constructor argument, need to query
        if (!studyConfigurations.containsKey(studyId) && studyConfigurationManager != null) {
            loadStudyConfiguration(object, studyId);
        }
        if (!studyConfigurations.containsKey(studyId)) {
            return Collections.emptyMap();
        }

        Map<String, Integer> sampleIds = getIndexedSamplesIdMap(studyId);
        if (sampleIds == null || sampleIds.isEmpty()) {
            return Collections.emptyMap();
        }

        BasicDBObject mongoGenotypes = (BasicDBObject) object.get(GENOTYPES_FIELD);

        // One entry per returned sample, in the iteration order of the samples map
        Map<String, Map<String, String>> samplesData = new LinkedHashMap<>();
        for (String sampleName : sampleIds.keySet()) {
            samplesData.put(sampleName, new HashMap<>(1)); //size == 1, only will contain GT field
        }

        // All the samples are initialized with the most common genotype
        Set<String> defaultGenotypes = studyDefaultGenotypeSet.get(studyId);
        String mostCommonGtString = defaultGenotypes.isEmpty() ? null : defaultGenotypes.iterator().next();
        if (UNKNOWN_GENOTYPE.equals(mostCommonGtString)) {
            mostCommonGtString = returnedUnknownGenotype;
        }
        if (mostCommonGtString != null) {
            for (Map<String, String> sampleData : samplesData.values()) {
                sampleData.put("GT", mostCommonGtString);
            }
        }

        // Without a genotypes field there is nothing to overwrite: every sample keeps the default
        if (mongoGenotypes == null) {
            return samplesData;
        }

        // Loop through the non-most common genotypes, and set their value
        // for the samples whose ids are listed in the array
        @SuppressWarnings("unchecked") // The accessor returns a map from sample id to sample name
        Map<Integer, String> idSamples = getIndexedIdSamplesMap(studyId);
        for (Map.Entry<String, Object> dbo : mongoGenotypes.entrySet()) {
            final String genotype;
            if (UNKNOWN_GENOTYPE.equals(dbo.getKey())) {
                // Unknown genotypes are only reported if a replacement was requested and
                // that replacement is not already covered by the default genotypes
                if (returnedUnknownGenotype == null || defaultGenotypes.contains(returnedUnknownGenotype)) {
                    continue;
                }
                genotype = returnedUnknownGenotype;
            } else {
                genotype = genotypeToDataModelType(dbo.getKey());
            }

            @SuppressWarnings("unchecked") // Each genotype is stored with the list of its sample ids
            List<Integer> samplesWithGenotype = (List<Integer>) dbo.getValue();
            for (Integer sampleId : samplesWithGenotype) {
                // Ids of samples that are not returned are ignored
                if (idSamples.containsKey(sampleId)) {
                    samplesData.get(idSamples.get(sampleId)).put("GT", genotype);
                }
            }
        }

        return samplesData;
    }

    /**
     * Registers the configuration of a study that is not known yet. Asks the
     * study configuration manager first and, if it has no configuration for
     * the study, falls back to the sample names of the file referenced by the
     * document, read through the variant source adaptor (if any).
     *
     * Must only be called when "studyConfigurationManager" is not null.
     */
    @SuppressWarnings("unchecked")
    private void loadStudyConfiguration(DBObject object, int studyId) {
        QueryResult<StudyConfiguration> queryResult = studyConfigurationManager.getStudyConfiguration(studyId,
                null);
        StudyConfiguration studyConfiguration = queryResult.first();
        if (studyConfiguration != null) {
            addStudyConfiguration(studyConfiguration);
            return;
        }

        logger.warn(
                "DBObjectToSamplesConverter.convertToDataModelType StudyConfiguration {studyId: {}} not found! Looking for VariantSource",
                studyId);
        if (sourceDbAdaptor == null) {
            return;
        }

        QueryResult samplesBySource = sourceDbAdaptor
                .getSamplesBySource(object.get(FILEID_FIELD).toString(), null);
        if (samplesBySource.getResult().isEmpty()) {
            logger.warn(
                    "DBObjectToSamplesConverter.convertToDataModelType VariantSource not found! Can't read sample names");
        } else {
            setSamples(studyId, (List<String>) samplesBySource.getResult().get(0));
        }
    }

    //    @Override
    /**
     * Converts the sample data of one study into the genotypes document stored
     * in MongoDB. Samples are classified by the value of their "GT" field, and
     * each genotype is stored with the list of ids of the samples showing it.
     * The default genotypes of the study are not stored. Example from 1000G:
     * "0|1" : [ 41, 311, 342, 358, 881, 898, 903 ],
     * "1|0" : [ 262, 290, 300, 331, 343, 369, 374, 391, 879, 918, 930 ]
     *
     * @param object  Sample data indexed by sample name
     * @param studyId Identifier of a registered study
     * @return Document with one field per non-default genotype
     */
    public DBObject convertToStorageType(Map<String, Map<String, String>> object, int studyId) {
        Map<String, Integer> sampleIds = studyConfigurations.get(studyId).getSampleIds();

        // Classify samples by genotype. Samples without "GT" are left out.
        // NOTE(review): a sample name missing from the study configuration adds a null id to the list.
        Map<Genotype, List<Integer>> samplesByGenotype = new HashMap<>();
        for (Map.Entry<String, Map<String, String>> sample : object.entrySet()) {
            String gt = sample.getValue().get("GT");
            if (gt == null) {
                continue;
            }
            samplesByGenotype.computeIfAbsent(new Genotype(gt), key -> new ArrayList<>())
                    .add(sampleIds.get(sample.getKey()));
        }

        Set<Genotype> defaultGenotypes = new HashSet<>();
        for (String defaultGt : studyDefaultGenotypeSet.get(studyId)) {
            defaultGenotypes.add(new Genotype(defaultGt));
        }

        // Only the genotypes that differ from the defaults are written
        BasicDBObject mongoSamples = new BasicDBObject();
        for (Map.Entry<Genotype, List<Integer>> classified : samplesByGenotype.entrySet()) {
            if (defaultGenotypes.contains(classified.getKey())) {
                continue;
            }
            mongoSamples.append(genotypeToStorageType(classified.getKey().toString()), classified.getValue());
        }
        return mongoSamples;
    }

    /**
     * Registers a study built only from a list of sample names. Each sample
     * gets its position in the list as id. The study is created with an empty
     * name and empty maps for the remaining constructor arguments.
     *
     * @param studyId Identifier of the study
     * @param samples Sample names, may be null (the study is registered without samples)
     */
    public void setSamples(int studyId, List<String> samples) {
        final Map<String, Integer> sampleIds;
        if (samples == null) {
            sampleIds = new HashMap<>(0);
        } else {
            sampleIds = new HashMap<>(samples.size());
            int position = 0;
            for (String sampleName : samples) {
                sampleIds.put(sampleName, position++);
            }
        }

        addStudyConfiguration(new StudyConfiguration(studyId, "",
                Collections.<String, Integer>emptyMap(), sampleIds, Collections.<String, Integer>emptyMap(),
                Collections.<Integer, Set<Integer>>emptyMap()));
    }

    /**
     * Sets the samples to return when reading, and discards the cached
     * per-study sample maps so they are rebuilt with the new filter.
     *
     * @param returnedSamples Sample names, or sample ids as strings. An empty set,
     *                        or null, means that all the samples are returned
     */
    public void setReturnedSamples(Set<String> returnedSamples) {
        // A null filter would break the next read, so it is treated as "no filter"
        this.returnedSamples = returnedSamples == null ? Collections.<String>emptySet() : returnedSamples;
        __studyIdSamples.clear();
        __studySamplesId.clear();
    }

    /**
     * Registers a study configuration, replacing any previous one with the
     * same study id. The cached sample maps of the study are reset, and the
     * default genotypes are read from the DEFAULT_GENOTYPE attribute: either
     * directly as a Set or, otherwise, as a list of strings.
     *
     * @param studyConfiguration Study configuration to register
     */
    public void addStudyConfiguration(StudyConfiguration studyConfiguration) {
        final Integer studyId = studyConfiguration.getStudyId();
        this.studyConfigurations.put(studyId, studyConfiguration);
        // Null entries force the sample maps to be rebuilt on next use
        this.__studyIdSamples.put(studyId, null);
        this.__studySamplesId.put(studyId, null);

        Set defaultGenotypes = studyConfiguration.getAttributes().get(MongoDBVariantStorageManager.DEFAULT_GENOTYPE,
                Set.class);
        if (defaultGenotypes == null) {
            List<String> genotypeList = studyConfiguration.getAttributes()
                    .getAsStringList(MongoDBVariantStorageManager.DEFAULT_GENOTYPE);
            switch (genotypeList.size()) {
                case 0:
                    defaultGenotypes = Collections.<String>emptySet();
                    break;
                case 1:
                    defaultGenotypes = Collections.singleton(genotypeList.get(0));
                    break;
                default:
                    // Keeps the order of the list, the first one is the most common genotype
                    defaultGenotypes = new LinkedHashSet<>(genotypeList);
                    break;
            }
        }
        this.studyDefaultGenotypeSet.put(studyId, defaultGenotypes);
    }

    /**
     * @return Genotype returned instead of UNKNOWN_GENOTYPE when converting to
     *         the data model, or null if no replacement is set
     */
    public String getReturnedUnknownGenotype() {
        return returnedUnknownGenotype;
    }

    /**
     * Sets the genotype returned instead of UNKNOWN_GENOTYPE when converting
     * to the data model. With null, the samples stored under UNKNOWN_GENOTYPE
     * are skipped and an UNKNOWN_GENOTYPE default is not applied.
     *
     * @param returnedUnknownGenotype Replacement genotype, may be null
     */
    public void setReturnedUnknownGenotype(String returnedUnknownGenotype) {
        this.returnedUnknownGenotype = returnedUnknownGenotype;
    }

    /**
     * Lazy usage of idSamplesMap. Only inverts the samples map of the study
     * if it is not cached yet.
     *
     * @param studyId Identifier of a registered study
     * @return Inverse of the returned samples map: from
     *         {@code Map<SampleName, SampleId>} to {@code Map<SampleId, SampleName>}
     * @throws IllegalStateException if two samples share the same id, so the map can not be inverted
     */
    private Map<Integer, String> getIndexedIdSamplesMap(int studyId) {
        Map<String, Integer> sampleIds = getIndexedSamplesIdMap(studyId);

        Map<Integer, String> idSamples = this.__studyIdSamples.get(studyId);
        if (idSamples == null) {
            idSamples = StudyConfiguration.inverseMap(sampleIds);
            this.__studyIdSamples.put(studyId, idSamples);
        }

        // Repeated ids collapse into a single entry when inverting
        if (sampleIds.size() != idSamples.size()) {
            throw new IllegalStateException("Invalid sample ids map. SampleIDs must be unique.");
        }
        return idSamples;
    }

    /**
     * Lazy usage of loaded samplesIdMap. Builds, and caches, the map from
     * sample name to sample id of the indexed samples of the study. If there
     * are "returnedSamples", only those samples are kept, preserving the
     * iteration order of the indexed samples.
     *
     * @param studyId Identifier of a registered study
     * @return Map from sample name to sample id
     **/
    private Map<String, Integer> getIndexedSamplesIdMap(int studyId) {
        Map<String, Integer> sampleIds = this.__studySamplesId.get(studyId);
        if (sampleIds != null) {
            return sampleIds;
        }

        sampleIds = StudyConfiguration.getIndexedSamples(studyConfigurations.get(studyId));
        if (returnedSamples != null && !returnedSamples.isEmpty()) {
            // LinkedHashMap: filtering must not shuffle the samples
            Map<String, Integer> filtered = new LinkedHashMap<>();
            for (Map.Entry<String, Integer> sample : sampleIds.entrySet()) {
                //ReturnedSamples could be sampleNames or sampleIds as a string
                if (returnedSamples.contains(sample.getKey())
                        || returnedSamples.contains(sample.getValue().toString())) {
                    filtered.put(sample.getKey(), sample.getValue());
                }
            }
            sampleIds = filtered;
        }
        this.__studySamplesId.put(studyId, sampleIds);
        return sampleIds;
    }

    /**
     * Reads the samples stored in the legacy, non compressed, layout: the
     * genotypes field contains one sub-document per sample, whose fields are
     * the sample data.
     *
     * @param object Document with the file id, the study id and the genotypes field
     * @return Source entry for that file and study, with the data of every sample
     */
    private VariantSourceEntry getLegacyNoncompressedSamples(BasicDBObject object) {
        String fileId = object.get(FILEID_FIELD).toString();
        String studyId = object.get(STUDYID_FIELD).toString();
        VariantSourceEntry variantSourceEntry = new VariantSourceEntry(fileId, studyId);

        BasicDBObject mongoGenotypes = (BasicDBObject) object.get(GENOTYPES_FIELD);
        for (Map.Entry<String, Object> sample : mongoGenotypes.entrySet()) {
            DBObject sampleData = (DBObject) sample.getValue();
            variantSourceEntry.addSampleData(sample.getKey(), sampleData.toMap());
        }
        return variantSourceEntry;
    }

    /**
     * Writes the samples in the legacy, non compressed, layout: one
     * sub-document per sample, containing all the fields of its sample data.
     *
     * @param object Source entry whose samples data is converted
     * @return Document indexed by sample name
     */
    private DBObject getLegacyNoncompressedSamples(VariantSourceEntry object) {
        BasicDBObject mongoSamples = new BasicDBObject();
        object.getSamplesData().forEach((sampleName, fields) -> {
            BasicDBObject sampleData = new BasicDBObject();
            fields.forEach((field, value) -> sampleData.put(field, value));
            mongoSamples.put(sampleName, sampleData);
        });
        return mongoSamples;
    }

    /**
     * Translates a genotype from its storage form into the data model form,
     * replacing every "-1" with ".".
     *
     * @param genotype Genotype as stored, not null
     * @return Genotype as used in the data model
     */
    public static String genotypeToDataModelType(String genotype) {
        final String storedMissingAllele = "-1";
        final String modelMissingAllele = ".";
        return genotype.replace(storedMissingAllele, modelMissingAllele);
    }

    /**
     * Translates a genotype from the data model form into its storage form,
     * replacing every "." with "-1".
     *
     * @param genotype Genotype as used in the data model, not null
     * @return Genotype as stored
     */
    public static String genotypeToStorageType(String genotype) {
        final String modelMissingAllele = ".";
        final String storedMissingAllele = "-1";
        return genotype.replace(modelMissingAllele, storedMissingAllele);
    }

}