org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantAnnotationConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantAnnotationConverter.java

Source

/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.mongodb.variant.converters;

import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang3.StringUtils;
import org.bson.Document;
import org.opencb.biodata.models.variant.annotation.ConsequenceTypeMappings;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.commons.datastore.core.ComplexTypeConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantConverter.ANNOTATION_FIELD;

/**
 * Created by jacobo on 13/01/15.
 */
public class DocumentToVariantAnnotationConverter extends AbstractDocumentConverter
        implements ComplexTypeConverter<VariantAnnotation, Document> {

    public static final String ANNOT_ID_FIELD = "id";

    public static final String GENE_SO_FIELD = "_gn_so";

    public static final String CONSEQUENCE_TYPE_FIELD = "ct";
    public static final String CT_GENE_NAME_FIELD = "gn";
    public static final String CT_ENSEMBL_GENE_ID_FIELD = "ensg";
    public static final String CT_ENSEMBL_TRANSCRIPT_ID_FIELD = "enst";
    public static final String CT_RELATIVE_POS_FIELD = "relPos";
    public static final String CT_CODON_FIELD = "codon";
    public static final String CT_STRAND_FIELD = "strand";
    public static final String CT_BIOTYPE_FIELD = "bt";
    public static final String CT_EXON_NUMBER_FIELD = "exn";
    public static final String CT_TRANSCRIPT_ANNOT_FLAGS = "flags";
    public static final String CT_C_DNA_POSITION_FIELD = "cDnaPos";
    public static final String CT_CDS_POSITION_FIELD = "cdsPos";
    public static final String CT_AA_POSITION_FIELD = "aaPos";
    public static final String CT_AA_REFERENCE_FIELD = "aaRef";
    public static final String CT_AA_ALTERNATE_FIELD = "aaAlt";
    public static final String CT_SO_ACCESSION_FIELD = "so";
    public static final String CT_PROTEIN_KEYWORDS = "kw";
    public static final String CT_PROTEIN_SUBSTITUTION_SCORE_FIELD = "ps_score";
    public static final String CT_PROTEIN_POLYPHEN_FIELD = "polyphen";
    public static final String CT_PROTEIN_SIFT_FIELD = "sift";
    public static final String CT_PROTEIN_FEATURE_FIELD = "pd";
    public static final String CT_PROTEIN_FEATURE_ID_FIELD = "id";
    public static final String CT_PROTEIN_FEATURE_START_FIELD = "start";
    public static final String CT_PROTEIN_FEATURE_END_FIELD = "end";
    public static final String CT_PROTEIN_FEATURE_TYPE_FIELD = "type";
    public static final String CT_PROTEIN_FEATURE_DESCRIPTION_FIELD = "desc";
    public static final String CT_PROTEIN_UNIPROT_ACCESSION = "uni_a";
    public static final String CT_PROTEIN_UNIPROT_NAME = "uni_n";
    public static final String CT_PROTEIN_UNIPROT_VARIANT_ID = "uni_var";

    public static final String XREFS_FIELD = "xrefs";
    public static final String XREF_ID_FIELD = "id";
    public static final String XREF_SOURCE_FIELD = "src";

    public static final String POPULATION_FREQUENCIES_FIELD = "popFq";
    public static final String POPULATION_FREQUENCY_STUDY_FIELD = "study";
    public static final String POPULATION_FREQUENCY_POP_FIELD = "pop";
    public static final String POPULATION_FREQUENCY_REFERENCE_ALLELE_FIELD = "ref";
    public static final String POPULATION_FREQUENCY_ALTERNATE_ALLELE_FIELD = "alt";
    public static final String POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD = "refFq";
    public static final String POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD = "altFq";

    public static final String CONSERVED_REGION_SCORE_FIELD = "cr_score";
    public static final String CONSERVED_REGION_PHYLOP_FIELD = "cr_phylop";
    public static final String CONSERVED_REGION_PHASTCONS_FIELD = "cr_phastcons";
    public static final String CONSERVED_REGION_GERP_FIELD = "cr_gerp";

    public static final String GENE_TRAIT_FIELD = "gn_trait";
    public static final String GENE_TRAIT_ID_FIELD = "id";
    public static final String GENE_TRAIT_NAME_FIELD = "name";
    public static final String GENE_TRAIT_HPO_FIELD = "hpo";
    public static final String GENE_TRAIT_SCORE_FIELD = "sc";
    //    public static final String GENE_TRAIT_PUBMEDS_FIELD = "nPubmed";
    public static final String GENE_TRAIT_TYPES_FIELD = "types";
    //    public static final String GENE_TRAIT_SOURCES_FIELD = "srcs";
    public static final String GENE_TRAIT_SOURCE_FIELD = "src";

    public static final String DRUG_FIELD = "drug";
    public static final String DRUG_NAME_FIELD = "dn";
    public static final String DRUG_GENE_FIELD = CT_GENE_NAME_FIELD;
    public static final String DRUG_SOURCE_FIELD = "src";
    public static final String DRUG_STUDY_TYPE_FIELD = "st";

    public static final String SCORE_SCORE_FIELD = "sc";
    public static final String SCORE_SOURCE_FIELD = "src";
    public static final String SCORE_DESCRIPTION_FIELD = "desc";

    public static final String CLINICAL_DATA_FIELD = "clinical";
    public static final String CLINICAL_COSMIC_FIELD = "cosmic";
    public static final String CLINICAL_GWAS_FIELD = "gwas";
    public static final String CLINICAL_CLINVAR_FIELD = "clinvar";

    public static final String FUNCTIONAL_SCORE = "fn_score";
    public static final String FUNCTIONAL_CADD_RAW_FIELD = "fn_cadd_r";
    public static final String FUNCTIONAL_CADD_SCALED_FIELD = "fn_cadd_s";

    public static final String DEFAULT_STRAND_VALUE = "+";
    public static final String DEFAULT_DRUB_SOURCE = "dgidb";

    public static final Map<String, String> SCORE_FIELD_MAP;

    private final ObjectMapper jsonObjectMapper;
    private final ObjectWriter writer;

    protected static Logger logger = LoggerFactory.getLogger(DocumentToVariantAnnotationConverter.class);

    public static final String POLYPHEN = "polyphen";
    public static final String SIFT = "sift";

    public static final String PHAST_CONS = "phastCons";
    public static final String PHYLOP = "phylop";
    public static final String GERP = "gerp";

    public static final String CADD_SCALED = "cadd_scaled";
    public static final String CADD_RAW = "cadd_raw";

    static {
        Map<String, String> scoreFieldMap = new HashMap<>(7);
        scoreFieldMap.put(SIFT, ANNOTATION_FIELD + "." + CONSEQUENCE_TYPE_FIELD + "." + CT_PROTEIN_SIFT_FIELD);
        scoreFieldMap.put(POLYPHEN,
                ANNOTATION_FIELD + "." + CONSEQUENCE_TYPE_FIELD + "." + CT_PROTEIN_POLYPHEN_FIELD);
        scoreFieldMap.put(PHAST_CONS, ANNOTATION_FIELD + "." + CONSERVED_REGION_PHASTCONS_FIELD);
        scoreFieldMap.put(PHYLOP, ANNOTATION_FIELD + "." + CONSERVED_REGION_PHYLOP_FIELD);
        scoreFieldMap.put(GERP, ANNOTATION_FIELD + "." + CONSERVED_REGION_GERP_FIELD);
        scoreFieldMap.put(CADD_SCALED, ANNOTATION_FIELD + "." + FUNCTIONAL_CADD_SCALED_FIELD);
        scoreFieldMap.put(CADD_RAW, ANNOTATION_FIELD + "." + FUNCTIONAL_CADD_RAW_FIELD);
        SCORE_FIELD_MAP = Collections.unmodifiableMap(scoreFieldMap);
    }

    public DocumentToVariantAnnotationConverter() {
        jsonObjectMapper = new ObjectMapper();
        jsonObjectMapper.configure(SerializationFeature.WRITE_NULL_MAP_VALUES, false);
        jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
        writer = jsonObjectMapper.writer();
    }

    @Override
    public VariantAnnotation convertToDataModelType(Document object) {
        return convertToDataModelType(object, null);
    }

    public VariantAnnotation convertToDataModelType(Document object, Document customAnnotation) {
        VariantAnnotation va = new VariantAnnotation();

        //ConsequenceType
        List<ConsequenceType> consequenceTypes = new LinkedList<>();
        Object cts = object.get(CONSEQUENCE_TYPE_FIELD);
        if (cts != null && cts instanceof List) {
            for (Object o : ((List) cts)) {
                if (o instanceof Document) {
                    Document ct = (Document) o;

                    //SO accession name
                    List<String> soAccessionNames = new LinkedList<>();
                    if (ct.containsKey(CT_SO_ACCESSION_FIELD)) {
                        if (ct.get(CT_SO_ACCESSION_FIELD) instanceof List) {
                            List<Integer> list = (List) ct.get(CT_SO_ACCESSION_FIELD);
                            for (Integer so : list) {
                                soAccessionNames.add(ConsequenceTypeMappings.accessionToTerm.get(so));
                            }
                        } else {
                            soAccessionNames.add(
                                    ConsequenceTypeMappings.accessionToTerm.get(ct.get(CT_SO_ACCESSION_FIELD)));
                        }
                    }

                    //ProteinSubstitutionScores
                    List<Score> proteinSubstitutionScores = new LinkedList<>();
                    if (ct.containsKey(CT_PROTEIN_SUBSTITUTION_SCORE_FIELD)) {
                        List<Document> list = (List) ct.get(CT_PROTEIN_SUBSTITUTION_SCORE_FIELD);
                        for (Document document : list) {
                            proteinSubstitutionScores.add(buildScore(document));
                        }
                    }
                    addScore(ct, proteinSubstitutionScores, POLYPHEN, CT_PROTEIN_POLYPHEN_FIELD);
                    addScore(ct, proteinSubstitutionScores, SIFT, CT_PROTEIN_SIFT_FIELD);

                    List<ProteinFeature> features = new ArrayList<>();
                    if (ct.containsKey(CT_PROTEIN_FEATURE_FIELD)) {
                        List<Document> featureDocuments = (List) ct.get(CT_PROTEIN_FEATURE_FIELD);
                        for (Document featureDocument : featureDocuments) {
                            features.add(
                                    new ProteinFeature(getDefault(featureDocument, CT_PROTEIN_FEATURE_ID_FIELD, ""),
                                            getDefault(featureDocument, CT_PROTEIN_FEATURE_START_FIELD, 0),
                                            getDefault(featureDocument, CT_PROTEIN_FEATURE_END_FIELD, 0),
                                            getDefault(featureDocument, CT_PROTEIN_FEATURE_TYPE_FIELD, ""),
                                            getDefault(featureDocument, CT_PROTEIN_FEATURE_DESCRIPTION_FIELD, "")));
                        }
                    }

                    ProteinVariantAnnotation proteinVariantAnnotation = buildProteinVariantAnnotation(
                            getDefault(ct, CT_PROTEIN_UNIPROT_ACCESSION, null),
                            getDefault(ct, CT_PROTEIN_UNIPROT_NAME, null), getDefault(ct, CT_AA_POSITION_FIELD, 0),
                            getDefault(ct, CT_AA_REFERENCE_FIELD, ""), getDefault(ct, CT_AA_ALTERNATE_FIELD, ""),
                            getDefault(ct, CT_PROTEIN_UNIPROT_VARIANT_ID, null), proteinSubstitutionScores,
                            getDefault(ct, CT_PROTEIN_KEYWORDS, Collections.emptyList()), features);
                    consequenceTypes.add(buildConsequenceType(getDefault(ct, CT_GENE_NAME_FIELD, ""),
                            getDefault(ct, CT_ENSEMBL_GENE_ID_FIELD, ""),
                            getDefault(ct, CT_ENSEMBL_TRANSCRIPT_ID_FIELD, ""),
                            getDefault(ct, CT_STRAND_FIELD, "+"), getDefault(ct, CT_BIOTYPE_FIELD, ""),
                            getDefault(ct, CT_EXON_NUMBER_FIELD, 0),
                            getDefault(ct, CT_TRANSCRIPT_ANNOT_FLAGS, Collections.emptyList()),
                            getDefault(ct, CT_C_DNA_POSITION_FIELD, 0), getDefault(ct, CT_CDS_POSITION_FIELD, 0),
                            getDefault(ct, CT_CODON_FIELD, ""), soAccessionNames, proteinVariantAnnotation));
                }
            }

        }
        va.setConsequenceTypes(consequenceTypes);

        //Conserved Region Scores
        List<Score> conservedRegionScores = new LinkedList<>();
        if (object.containsKey(CONSERVED_REGION_SCORE_FIELD)) {
            List<Document> list = (List) object.get(CONSERVED_REGION_SCORE_FIELD);
            for (Document dbObject : list) {
                conservedRegionScores.add(buildScore(dbObject));
            }
        }
        addScore(object, conservedRegionScores, PHAST_CONS, CONSERVED_REGION_PHASTCONS_FIELD);
        addScore(object, conservedRegionScores, PHYLOP, CONSERVED_REGION_PHYLOP_FIELD);
        addScore(object, conservedRegionScores, GERP, CONSERVED_REGION_GERP_FIELD);
        va.setConservation(conservedRegionScores);

        //Population frequencies
        List<PopulationFrequency> populationFrequencies = new LinkedList<>();
        if (object.containsKey(POPULATION_FREQUENCIES_FIELD)) {
            List<Document> list = (List) object.get(POPULATION_FREQUENCIES_FIELD);
            for (Document dbObject : list) {
                populationFrequencies
                        .add(new PopulationFrequency(getDefault(dbObject, POPULATION_FREQUENCY_STUDY_FIELD, ""),
                                getDefault(dbObject, POPULATION_FREQUENCY_POP_FIELD, ""),
                                getDefault(dbObject, POPULATION_FREQUENCY_REFERENCE_ALLELE_FIELD, ""),
                                getDefault(dbObject, POPULATION_FREQUENCY_ALTERNATE_ALLELE_FIELD, ""),
                                (float) getDefault(dbObject, POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD, -1.0),
                                (float) getDefault(dbObject, POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD, -1.0),
                                -1.0f, -1.0f, -1.0f));
            }
        }
        va.setPopulationFrequencies(populationFrequencies);

        // Gene trait association
        List<GeneTraitAssociation> geneTraitAssociations = new LinkedList<>();
        if (object.containsKey(GENE_TRAIT_FIELD)) {
            List<Document> list = (List) object.get(GENE_TRAIT_FIELD);
            for (Document document : list) {
                geneTraitAssociations.add(new GeneTraitAssociation(getDefault(document, GENE_TRAIT_ID_FIELD, ""),
                        getDefault(document, GENE_TRAIT_NAME_FIELD, ""),
                        getDefault(document, GENE_TRAIT_HPO_FIELD, ""),
                        (float) getDefault(document, GENE_TRAIT_SCORE_FIELD, 0F), 0, //getDefault(document, GENE_TRAIT_PUBMEDS_FIELD, 0),
                        getDefault(document, GENE_TRAIT_TYPES_FIELD, Collections.emptyList()),
                        Collections.emptyList(), getDefault(document, GENE_TRAIT_SOURCE_FIELD, "")));
            }
        }
        va.setGeneTraitAssociation(geneTraitAssociations);

        // Drug-Gene Interactions
        List<GeneDrugInteraction> drugs = new LinkedList<>();
        if (object.containsKey(DRUG_FIELD)) {
            List<Document> list = (List) object.get(DRUG_FIELD);
            for (Document dbObject : list) {
                //drugs.add(dbObject.toMap());
                drugs.add(new GeneDrugInteraction(getDefault(dbObject, DRUG_GENE_FIELD, ""),
                        getDefault(dbObject, DRUG_NAME_FIELD, ""),
                        getDefault(dbObject, DRUG_SOURCE_FIELD, DEFAULT_DRUB_SOURCE),
                        getDefault(dbObject, DRUG_STUDY_TYPE_FIELD, ""), "")); // "convertToStorageType" stores the study_type within the
                // DRUG_SOURCE_FIELD

            }
        }
        va.setGeneDrugInteraction(drugs);

        //XREfs
        Object xrs = object.get(XREFS_FIELD);
        if (xrs != null && xrs instanceof List) {
            List<Xref> xrefs = new LinkedList<>();
            for (Object o : (List) xrs) {
                if (o instanceof Document) {
                    Document xref = (Document) o;

                    xrefs.add(new Xref((String) xref.get(XREF_ID_FIELD), (String) xref.get(XREF_SOURCE_FIELD)));
                }
            }
            va.setXrefs(xrefs);
        }

        //Functional score
        List<Score> functionalScore = new LinkedList<>();
        if (object.containsKey(FUNCTIONAL_SCORE)) {
            List<Document> scores = object.get(FUNCTIONAL_SCORE, List.class);
            for (Document document : scores) {
                functionalScore.add(buildScore(document));
            }
        }
        addScore(object, functionalScore, CADD_SCALED, FUNCTIONAL_CADD_SCALED_FIELD);
        addScore(object, functionalScore, CADD_RAW, FUNCTIONAL_CADD_RAW_FIELD);
        va.setFunctionalScore(functionalScore);

        //Clinical Data
        if (object.containsKey(CLINICAL_DATA_FIELD)) {
            va.setVariantTraitAssociation(parseClinicalData((Document) object.get(CLINICAL_DATA_FIELD)));
        }

        if (customAnnotation != null) {
            Map<String, AdditionalAttribute> additionalAttributes = convertAdditionalAttributesToDataModelType(
                    customAnnotation);
            va.setAdditionalAttributes(additionalAttributes);
        }

        return va;
    }

    public void addScore(Document object, List<Score> functionalScore, String source, String key) {
        if (object.containsKey(key)) {
            Document document = (Document) object.get(key);
            functionalScore.add(buildScore(source, document));
        }
    }

    private Score buildScore(Document document) {
        return buildScore("", document);
    }

    private Score buildScore(String source, Document document) {
        return new Score(getDefault(document, SCORE_SCORE_FIELD, 0.0),
                getDefault(document, SCORE_SOURCE_FIELD, source),
                getDefault(document, SCORE_DESCRIPTION_FIELD, ""));
    }

    private ConsequenceType buildConsequenceType(String geneName, String ensemblGeneId, String ensemblTranscriptId,
            String strand, String biotype, Integer exonNumber, List<String> transcriptAnnotationFlags,
            Integer cDnaPosition, Integer cdsPosition, String codon, List<String> soNameList,
            ProteinVariantAnnotation proteinVariantAnnotation) {
        List<SequenceOntologyTerm> soTerms = new ArrayList<>(soNameList.size());
        for (String soName : soNameList) {
            soTerms.add(new SequenceOntologyTerm(ConsequenceTypeMappings.getSoAccessionString(soName), soName));
        }
        return new ConsequenceType(geneName, ensemblGeneId, ensemblTranscriptId, strand, biotype, exonNumber,
                transcriptAnnotationFlags, cDnaPosition, cdsPosition, codon, proteinVariantAnnotation, soTerms);
    }

    private ProteinVariantAnnotation buildProteinVariantAnnotation(String uniprotAccession, String uniprotName,
            int aaPosition, String aaReference, String aaAlternate, String uniprotVariantId,
            List<Score> proteinSubstitutionScores, List<String> keywords, List<ProteinFeature> features) {
        if (areAllEmpty(uniprotAccession, uniprotName, aaPosition, aaReference, aaAlternate, uniprotVariantId,
                proteinSubstitutionScores, keywords, features)) {
            return null;
        } else {
            return new ProteinVariantAnnotation(uniprotAccession, uniprotName, aaPosition, aaReference, aaAlternate,
                    uniprotVariantId, null, proteinSubstitutionScores, keywords, features);
        }
    }

    private VariantTraitAssociation parseClinicalData(Document clinicalData) {
        if (clinicalData != null) {
            int size = 0;
            VariantTraitAssociation variantTraitAssociation = new VariantTraitAssociation();
            List cosmicDBList = (List) clinicalData.get(CLINICAL_COSMIC_FIELD);
            if (cosmicDBList != null) {
                List<Cosmic> cosmicList = new ArrayList<>(cosmicDBList.size());
                for (Object object : cosmicDBList) {
                    cosmicList.add(jsonObjectMapper.convertValue(object, Cosmic.class));
                }
                size += cosmicList.size();
                variantTraitAssociation.setCosmic(cosmicList);
            }
            List gwasDBList = (List) clinicalData.get(CLINICAL_GWAS_FIELD);
            if (gwasDBList != null) {
                List<Gwas> gwasList = new ArrayList<>(gwasDBList.size());
                for (Object object : gwasDBList) {
                    gwasList.add(jsonObjectMapper.convertValue(object, Gwas.class));
                }
                size += gwasList.size();
                variantTraitAssociation.setGwas(gwasList);
            }
            List clinvarDBList = (List) clinicalData.get(CLINICAL_CLINVAR_FIELD);
            if (clinvarDBList != null) {
                List<ClinVar> clinvarList = new ArrayList<>(clinvarDBList.size());
                for (Object object : clinvarDBList) {
                    clinvarList.add(jsonObjectMapper.convertValue(object, ClinVar.class));
                }
                size += clinvarList.size();
                variantTraitAssociation.setClinvar(clinvarList);
            }
            if (size > 0) {
                return variantTraitAssociation;
            } else {
                return null;
            }
        }

        return null;

    }

    public Map<String, AdditionalAttribute> convertAdditionalAttributesToDataModelType(Document customAnnotation) {
        Map<String, AdditionalAttribute> attributeMap = new HashMap<>();
        for (String key : customAnnotation.keySet()) {
            Document document = customAnnotation.get(key, Document.class);
            HashMap<String, String> map = new HashMap<>();
            document.forEach((k, value) -> map.put(k, value.toString()));
            AdditionalAttribute attribute = new AdditionalAttribute(map);
            attributeMap.put(key, attribute);
        }
        return attributeMap;
    }

    @Override
    public Document convertToStorageType(VariantAnnotation variantAnnotation) {
        Document document = new Document();
        Set<Document> xrefs = new HashSet<>();
        List<Document> cts = new LinkedList<>();

        //Annotation ID
        document.put(ANNOT_ID_FIELD, "?");

        //Variant ID
        if (variantAnnotation.getId() != null && !variantAnnotation.getId().isEmpty()) {
            xrefs.add(convertXrefToStorage(variantAnnotation.getId(), "dbSNP"));
        }

        //ConsequenceType
        if (variantAnnotation.getConsequenceTypes() != null) {
            Set<String> gnSo = new HashSet<>();
            List<ConsequenceType> consequenceTypes = variantAnnotation.getConsequenceTypes();
            for (ConsequenceType consequenceType : consequenceTypes) {
                Document ct = new Document();

                putNotNull(ct, CT_GENE_NAME_FIELD, consequenceType.getGeneName());
                putNotNull(ct, CT_ENSEMBL_GENE_ID_FIELD, consequenceType.getEnsemblGeneId());
                putNotNull(ct, CT_ENSEMBL_TRANSCRIPT_ID_FIELD, consequenceType.getEnsemblTranscriptId());
                //                putNotNull(ct, RELATIVE_POS_FIELD, consequenceType.getRelativePosition());
                putNotNull(ct, CT_CODON_FIELD, consequenceType.getCodon());
                putNotDefault(ct, CT_STRAND_FIELD, consequenceType.getStrand(), DEFAULT_STRAND_VALUE);
                putNotNull(ct, CT_BIOTYPE_FIELD, consequenceType.getBiotype());
                putNotNull(ct, CT_EXON_NUMBER_FIELD, consequenceType.getExonNumber());
                putNotNull(ct, CT_TRANSCRIPT_ANNOT_FLAGS, consequenceType.getTranscriptAnnotationFlags());
                putNotNull(ct, CT_C_DNA_POSITION_FIELD, consequenceType.getCdnaPosition());
                putNotNull(ct, CT_CDS_POSITION_FIELD, consequenceType.getCdsPosition());

                if (consequenceType.getSequenceOntologyTerms() != null) {
                    List<Integer> soAccession = new LinkedList<>();
                    for (SequenceOntologyTerm entry : consequenceType.getSequenceOntologyTerms()) {
                        soAccession.add(ConsequenceTypeMappings.termToAccession.get(entry.getName()));
                    }
                    putNotNull(ct, CT_SO_ACCESSION_FIELD, soAccession);

                    for (Integer so : soAccession) {
                        if (StringUtils.isNotEmpty(consequenceType.getGeneName())) {
                            gnSo.add(buildGeneSO(consequenceType.getGeneName(), so));
                        }
                        if (StringUtils.isNotEmpty(consequenceType.getEnsemblGeneId())) {
                            gnSo.add(buildGeneSO(consequenceType.getEnsemblGeneId(), so));
                        }
                        if (StringUtils.isNotEmpty(consequenceType.getEnsemblTranscriptId())) {
                            gnSo.add(buildGeneSO(consequenceType.getEnsemblTranscriptId(), so));
                        }
                        if (consequenceType.getProteinVariantAnnotation() != null) {
                            if (StringUtils.isNotEmpty(
                                    consequenceType.getProteinVariantAnnotation().getUniprotAccession())) {
                                gnSo.add(buildGeneSO(
                                        consequenceType.getProteinVariantAnnotation().getUniprotAccession(), so));
                            }
                            if (StringUtils
                                    .isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotName())) {
                                gnSo.add(buildGeneSO(consequenceType.getProteinVariantAnnotation().getUniprotName(),
                                        so));
                            }
                        }
                    }
                }
                //Protein annotation
                if (consequenceType.getProteinVariantAnnotation() != null) {
                    putNotNull(ct, CT_AA_POSITION_FIELD,
                            consequenceType.getProteinVariantAnnotation().getPosition());
                    putNotNull(ct, CT_AA_REFERENCE_FIELD,
                            consequenceType.getProteinVariantAnnotation().getReference());
                    putNotNull(ct, CT_AA_ALTERNATE_FIELD,
                            consequenceType.getProteinVariantAnnotation().getAlternate());
                    putNotNull(ct, CT_PROTEIN_UNIPROT_ACCESSION,
                            consequenceType.getProteinVariantAnnotation().getUniprotAccession());
                    putNotNull(ct, CT_PROTEIN_UNIPROT_NAME,
                            consequenceType.getProteinVariantAnnotation().getUniprotName());
                    putNotNull(ct, CT_PROTEIN_UNIPROT_VARIANT_ID,
                            consequenceType.getProteinVariantAnnotation().getUniprotVariantId());
                    //Protein substitution region score
                    if (consequenceType.getProteinVariantAnnotation().getSubstitutionScores() != null) {
                        List<Document> proteinSubstitutionScores = new LinkedList<>();
                        for (Score score : consequenceType.getProteinVariantAnnotation().getSubstitutionScores()) {
                            if (score != null) {
                                if (score.getSource().equals(POLYPHEN)) {
                                    putNotNull(ct, CT_PROTEIN_POLYPHEN_FIELD, convertScoreToStorageNoSource(score));
                                } else if (score.getSource().equals(SIFT)) {
                                    putNotNull(ct, CT_PROTEIN_SIFT_FIELD, convertScoreToStorageNoSource(score));
                                } else {
                                    proteinSubstitutionScores.add(convertScoreToStorage(score));
                                }
                            }
                        }
                        putNotNull(ct, CT_PROTEIN_SUBSTITUTION_SCORE_FIELD, proteinSubstitutionScores);
                    }
                    putNotNull(ct, CT_PROTEIN_KEYWORDS,
                            consequenceType.getProteinVariantAnnotation().getKeywords());

                    List<ProteinFeature> features = consequenceType.getProteinVariantAnnotation().getFeatures();
                    if (features != null) {
                        List<Document> documentFeatures = new ArrayList<>(features.size());
                        for (ProteinFeature feature : features) {
                            Document documentFeature = new Document();
                            putNotNull(documentFeature, CT_PROTEIN_FEATURE_ID_FIELD, feature.getId());
                            putNotNull(documentFeature, CT_PROTEIN_FEATURE_START_FIELD, feature.getStart());
                            putNotNull(documentFeature, CT_PROTEIN_FEATURE_END_FIELD, feature.getEnd());
                            putNotNull(documentFeature, CT_PROTEIN_FEATURE_TYPE_FIELD, feature.getType());
                            putNotNull(documentFeature, CT_PROTEIN_FEATURE_DESCRIPTION_FIELD,
                                    feature.getDescription());
                            documentFeatures.add(documentFeature);
                        }
                        putNotNull(ct, CT_PROTEIN_FEATURE_FIELD, documentFeatures);
                    }

                    if (StringUtils
                            .isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotAccession())) {
                        xrefs.add(convertXrefToStorage(
                                consequenceType.getProteinVariantAnnotation().getUniprotAccession(), "UniProt"));
                    }
                    if (StringUtils.isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotName())) {
                        xrefs.add(convertXrefToStorage(
                                consequenceType.getProteinVariantAnnotation().getUniprotName(), "UniProt"));
                    }
                    if (StringUtils
                            .isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotVariantId())) {
                        xrefs.add(convertXrefToStorage(
                                consequenceType.getProteinVariantAnnotation().getUniprotVariantId(), "UniProt"));
                    }
                }

                cts.add(ct);

                if (StringUtils.isNotEmpty(consequenceType.getGeneName())) {
                    xrefs.add(convertXrefToStorage(consequenceType.getGeneName(), "HGNC"));
                }
                if (StringUtils.isNotEmpty(consequenceType.getEnsemblGeneId())) {
                    xrefs.add(convertXrefToStorage(consequenceType.getEnsemblGeneId(), "ensemblGene"));
                }
                if (StringUtils.isNotEmpty(consequenceType.getEnsemblTranscriptId())) {
                    xrefs.add(convertXrefToStorage(consequenceType.getEnsemblTranscriptId(), "ensemblTranscript"));
                }

            }
            putNotNull(document, GENE_SO_FIELD, gnSo);
            putNotNull(document, CONSEQUENCE_TYPE_FIELD, cts);
        }

        //Conserved region score
        if (variantAnnotation.getConservation() != null) {
            List<Document> conservedRegionScores = new LinkedList<>();
            for (Score score : variantAnnotation.getConservation()) {
                if (score != null) {
                    if (score.getSource().equals(PHYLOP)) {
                        putNotNull(document, CONSERVED_REGION_PHYLOP_FIELD, convertScoreToStorageNoSource(score));
                    } else if (score.getSource().equals(PHAST_CONS)) {
                        putNotNull(document, CONSERVED_REGION_PHASTCONS_FIELD,
                                convertScoreToStorageNoSource(score));
                    } else if (score.getSource().equals(GERP)) {
                        putNotNull(document, CONSERVED_REGION_GERP_FIELD, convertScoreToStorageNoSource(score));
                    } else {
                        conservedRegionScores.add(convertScoreToStorage(score));
                    }
                }
            }
            putNotNull(document, CONSERVED_REGION_SCORE_FIELD, conservedRegionScores);
        }

        // Gene trait association
        if (variantAnnotation.getGeneTraitAssociation() != null) {
            List<Document> geneTraitAssociations = new LinkedList<>();
            for (GeneTraitAssociation geneTraitAssociation : variantAnnotation.getGeneTraitAssociation()) {
                if (geneTraitAssociation != null) {
                    Document d = new Document();
                    putNotNull(d, GENE_TRAIT_ID_FIELD, geneTraitAssociation.getId());
                    putNotNull(d, GENE_TRAIT_NAME_FIELD, geneTraitAssociation.getName());
                    putNotNull(d, GENE_TRAIT_SCORE_FIELD, geneTraitAssociation.getScore());
                    putNotNull(d, GENE_TRAIT_HPO_FIELD, geneTraitAssociation.getHpo());
                    if (StringUtils.isNotEmpty(geneTraitAssociation.getHpo())) {
                        xrefs.add(convertXrefToStorage(geneTraitAssociation.getHpo(), "HPO"));
                    }
                    //                    putNotNull(d, GENE_TRAIT_PUBMEDS_FIELD, geneTraitAssociation.getNumberOfPubmeds());
                    putNotNull(d, GENE_TRAIT_TYPES_FIELD, geneTraitAssociation.getAssociationTypes());
                    //                    putNotNull(d, GENE_TRAIT_SOURCES_FIELD, geneTraitAssociation.getSources());
                    putNotNull(d, GENE_TRAIT_SOURCE_FIELD, geneTraitAssociation.getSource());

                    geneTraitAssociations.add(d);
                }
            }
            putNotNull(document, GENE_TRAIT_FIELD, geneTraitAssociations);
        }

        //Population frequencies
        if (variantAnnotation.getPopulationFrequencies() != null) {
            List<Document> populationFrequencies = new LinkedList<>();
            for (PopulationFrequency populationFrequency : variantAnnotation.getPopulationFrequencies()) {
                if (populationFrequency != null) {
                    populationFrequencies.add(convertPopulationFrequencyToStorage(populationFrequency));
                }
            }
            putNotNull(document, POPULATION_FREQUENCIES_FIELD, populationFrequencies);
        }

        // Drug-Gene Interactions
        if (variantAnnotation.getGeneDrugInteraction() != null) {
            List<Document> drugGeneInteractions = new LinkedList<>();
            List<GeneDrugInteraction> geneDrugInteractionList = variantAnnotation.getGeneDrugInteraction();
            if (geneDrugInteractionList != null) {
                for (GeneDrugInteraction geneDrugInteraction : geneDrugInteractionList) {
                    Document drugDbObject = new Document(DRUG_GENE_FIELD, geneDrugInteraction.getGeneName());
                    putNotNull(drugDbObject, DRUG_NAME_FIELD, geneDrugInteraction.getDrugName());
                    putNotDefault(drugDbObject, DRUG_SOURCE_FIELD, geneDrugInteraction.getSource(),
                            DEFAULT_DRUB_SOURCE);
                    putNotNull(drugDbObject, DRUG_STUDY_TYPE_FIELD, geneDrugInteraction.getStudyType());
                    drugGeneInteractions.add(drugDbObject);
                }
            }
            putNotNull(document, DRUG_FIELD, drugGeneInteractions);
        }

        //XREFs
        if (variantAnnotation.getXrefs() != null) {
            for (Xref xref : variantAnnotation.getXrefs()) {
                xrefs.add(convertXrefToStorage(xref.getId(), xref.getSource()));
            }
        }
        putNotNull(document, XREFS_FIELD, xrefs);

        //Functional score
        if (variantAnnotation.getFunctionalScore() != null) {
            List<Document> scores = new ArrayList<>(variantAnnotation.getFunctionalScore().size());
            for (Score score : variantAnnotation.getFunctionalScore()) {
                if (score != null) {
                    if (score.getSource().equals(CADD_RAW)) {
                        putNotNull(document, FUNCTIONAL_CADD_RAW_FIELD, convertScoreToStorageNoSource(score));
                    } else if (score.getSource().equals(CADD_SCALED)) {
                        putNotNull(document, FUNCTIONAL_CADD_SCALED_FIELD, convertScoreToStorageNoSource(score));
                    } else {
                        scores.add(convertScoreToStorage(score));
                    }
                }
            }
            putNotNull(document, FUNCTIONAL_SCORE, scores);
        }

        //Clinical Data
        Document clinicalDocument = new Document();
        if (variantAnnotation.getVariantTraitAssociation() != null) {
            putNotNull(clinicalDocument, CLINICAL_COSMIC_FIELD,
                    generateClinicalDBList(variantAnnotation.getVariantTraitAssociation().getCosmic()));
            if (variantAnnotation.getVariantTraitAssociation().getCosmic() != null) {
                variantAnnotation.getVariantTraitAssociation().getCosmic().stream().map(Cosmic::getMutationId)
                        .filter(StringUtils::isNotEmpty)
                        .forEach(mutationId -> xrefs.add(convertXrefToStorage(mutationId, "COSMIC")));
            }
            putNotNull(clinicalDocument, CLINICAL_GWAS_FIELD,
                    generateClinicalDBList(variantAnnotation.getVariantTraitAssociation().getGwas()));
            putNotNull(clinicalDocument, CLINICAL_CLINVAR_FIELD,
                    generateClinicalDBList(variantAnnotation.getVariantTraitAssociation().getClinvar()));
            if (variantAnnotation.getVariantTraitAssociation().getClinvar() != null) {
                variantAnnotation.getVariantTraitAssociation().getClinvar().stream().map(ClinVar::getAccession)
                        .filter(StringUtils::isNotEmpty)
                        .forEach(accession -> xrefs.add(convertXrefToStorage(accession, "ClinVar")));
            }
        }
        if (!clinicalDocument.isEmpty()) {
            document.put(CLINICAL_DATA_FIELD, clinicalDocument);
        }

        return document;
    }

    public static String buildGeneSO(String gene, Integer so) {
        return gene == null ? null : gene + '_' + so;
    }

    private <T> List<Document> generateClinicalDBList(List<T> objectList) {
        if (objectList != null) {
            List<Document> list = new ArrayList<>(objectList.size());
            for (T object : objectList) {
                try {
                    if (object instanceof GenericRecord) {
                        list.add(Document.parse(object.toString()));
                    } else {
                        list.add(Document.parse(writer.writeValueAsString(object)));
                    }
                } catch (JsonProcessingException e) {
                    e.printStackTrace();
                    logger.error("Error serializing Clinical Data " + object.getClass(), e);
                }
            }
            return list;
        }
        return null;
    }

    public Document convertScoreToStorageNoSource(Score score) {
        return convertScoreToStorage(score.getScore(), null, score.getDescription());
    }

    private Document convertScoreToStorage(Score score) {
        return convertScoreToStorage(score.getScore(), score.getSource(), score.getDescription());
    }

    private Document convertScoreToStorage(double score, String source, String description) {
        Document dbObject = new Document(SCORE_SCORE_FIELD, score);
        putNotNull(dbObject, SCORE_SOURCE_FIELD, source);
        putNotNull(dbObject, SCORE_DESCRIPTION_FIELD, description);
        return dbObject;
    }

    private Document convertPopulationFrequencyToStorage(PopulationFrequency populationFrequency) {
        Document dbObject = new Document(POPULATION_FREQUENCY_STUDY_FIELD, populationFrequency.getStudy());
        putNotNull(dbObject, POPULATION_FREQUENCY_POP_FIELD, populationFrequency.getPopulation());
        putNotNull(dbObject, POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD,
                populationFrequency.getRefAlleleFreq());
        putNotNull(dbObject, POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD,
                populationFrequency.getAltAlleleFreq());
        return dbObject;
    }

    private Document convertXrefToStorage(String id, String source) {
        Document dbObject = new Document(XREF_ID_FIELD, id);
        dbObject.put(XREF_SOURCE_FIELD, source);
        return dbObject;
    }

    public Document convertToStorageType(Map<String, AdditionalAttribute> attributes) {
        Document document = new Document();
        attributes.forEach((key, attribute) -> {
            document.put(key, convertToStorageType(attribute));
        });
        return document;
    }

    public static Document convertToStorageType(AdditionalAttribute attribute) {
        Document document = new Document();
        document.putAll(attribute.getAttribute());
        return document;
    }

}