Java tutorial
/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.mongodb.variant.converters; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; import com.fasterxml.jackson.databind.SerializationFeature; import org.apache.avro.generic.GenericRecord; import org.apache.commons.lang3.StringUtils; import org.bson.Document; import org.opencb.biodata.models.variant.annotation.ConsequenceTypeMappings; import org.opencb.biodata.models.variant.avro.*; import org.opencb.commons.datastore.core.ComplexTypeConverter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; import static org.opencb.opencga.storage.mongodb.variant.converters.DocumentToVariantConverter.ANNOTATION_FIELD; /** * Created by jacobo on 13/01/15. */ public class DocumentToVariantAnnotationConverter extends AbstractDocumentConverter implements ComplexTypeConverter<VariantAnnotation, Document> { public static final String ANNOT_ID_FIELD = "id"; public static final String GENE_SO_FIELD = "_gn_so"; public static final String CONSEQUENCE_TYPE_FIELD = "ct"; public static final String CT_GENE_NAME_FIELD = "gn"; public static final String CT_ENSEMBL_GENE_ID_FIELD = "ensg"; public static final String CT_ENSEMBL_TRANSCRIPT_ID_FIELD = "enst"; public static final String CT_RELATIVE_POS_FIELD = "relPos"; public static final String CT_CODON_FIELD = "codon"; public static final String CT_STRAND_FIELD = "strand"; public static final String CT_BIOTYPE_FIELD = "bt"; public static final String CT_EXON_NUMBER_FIELD = "exn"; public static final String CT_TRANSCRIPT_ANNOT_FLAGS = "flags"; public static final String CT_C_DNA_POSITION_FIELD = "cDnaPos"; public static final String CT_CDS_POSITION_FIELD = "cdsPos"; public static final String CT_AA_POSITION_FIELD = "aaPos"; public static final String CT_AA_REFERENCE_FIELD = "aaRef"; public static final String CT_AA_ALTERNATE_FIELD = "aaAlt"; public static final String CT_SO_ACCESSION_FIELD = "so"; public static final String CT_PROTEIN_KEYWORDS = "kw"; public static final String CT_PROTEIN_SUBSTITUTION_SCORE_FIELD = "ps_score"; public static final String CT_PROTEIN_POLYPHEN_FIELD = "polyphen"; public static final String CT_PROTEIN_SIFT_FIELD = "sift"; public static final String CT_PROTEIN_FEATURE_FIELD = "pd"; public static final String CT_PROTEIN_FEATURE_ID_FIELD = "id"; public static final String CT_PROTEIN_FEATURE_START_FIELD = "start"; public static final String CT_PROTEIN_FEATURE_END_FIELD = "end"; public static final String CT_PROTEIN_FEATURE_TYPE_FIELD = "type"; public static final String CT_PROTEIN_FEATURE_DESCRIPTION_FIELD = "desc"; public static final String CT_PROTEIN_UNIPROT_ACCESSION = "uni_a"; public static final String CT_PROTEIN_UNIPROT_NAME = "uni_n"; public static final String CT_PROTEIN_UNIPROT_VARIANT_ID = "uni_var"; public static final String XREFS_FIELD = "xrefs"; public static final String XREF_ID_FIELD = "id"; public static final String XREF_SOURCE_FIELD = "src"; public static final String POPULATION_FREQUENCIES_FIELD = "popFq"; public static final String POPULATION_FREQUENCY_STUDY_FIELD = "study"; public static final String POPULATION_FREQUENCY_POP_FIELD = "pop"; public static final String POPULATION_FREQUENCY_REFERENCE_ALLELE_FIELD = "ref"; public static final String POPULATION_FREQUENCY_ALTERNATE_ALLELE_FIELD = "alt"; public static final String POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD = "refFq"; public static final String POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD = "altFq"; public static final String CONSERVED_REGION_SCORE_FIELD = "cr_score"; public static final String CONSERVED_REGION_PHYLOP_FIELD = "cr_phylop"; public static final String CONSERVED_REGION_PHASTCONS_FIELD = "cr_phastcons"; public static final String CONSERVED_REGION_GERP_FIELD = "cr_gerp"; public static final String GENE_TRAIT_FIELD = "gn_trait"; public static final String GENE_TRAIT_ID_FIELD = "id"; public static final String GENE_TRAIT_NAME_FIELD = "name"; public static final String GENE_TRAIT_HPO_FIELD = "hpo"; public static final String GENE_TRAIT_SCORE_FIELD = "sc"; // public static final String GENE_TRAIT_PUBMEDS_FIELD = "nPubmed"; public static final String GENE_TRAIT_TYPES_FIELD = "types"; // public static final String GENE_TRAIT_SOURCES_FIELD = "srcs"; public static final String GENE_TRAIT_SOURCE_FIELD = "src"; public static final String DRUG_FIELD = "drug"; public static final String DRUG_NAME_FIELD = "dn"; public static final String DRUG_GENE_FIELD = CT_GENE_NAME_FIELD; public static final String DRUG_SOURCE_FIELD = "src"; public static final String DRUG_STUDY_TYPE_FIELD = "st"; public static final String SCORE_SCORE_FIELD = "sc"; public static final String SCORE_SOURCE_FIELD = "src"; public static final String SCORE_DESCRIPTION_FIELD = "desc"; public static final String CLINICAL_DATA_FIELD = "clinical"; public static final String CLINICAL_COSMIC_FIELD = "cosmic"; public static final String CLINICAL_GWAS_FIELD = "gwas"; public static final String CLINICAL_CLINVAR_FIELD = "clinvar"; public static final String FUNCTIONAL_SCORE = "fn_score"; public static final String FUNCTIONAL_CADD_RAW_FIELD = "fn_cadd_r"; public static final String FUNCTIONAL_CADD_SCALED_FIELD = "fn_cadd_s"; public static final String DEFAULT_STRAND_VALUE = "+"; public static final String DEFAULT_DRUB_SOURCE = "dgidb"; public static final Map<String, String> SCORE_FIELD_MAP; private final ObjectMapper jsonObjectMapper; private final ObjectWriter writer; protected static Logger logger = LoggerFactory.getLogger(DocumentToVariantAnnotationConverter.class); public static final String POLYPHEN = "polyphen"; public static final String SIFT = "sift"; public static final String PHAST_CONS = "phastCons"; public static final String PHYLOP = "phylop"; public static final String GERP = "gerp"; public static final String CADD_SCALED = "cadd_scaled"; public static final String CADD_RAW = "cadd_raw"; static { Map<String, String> scoreFieldMap = new HashMap<>(7); scoreFieldMap.put(SIFT, ANNOTATION_FIELD + "." + CONSEQUENCE_TYPE_FIELD + "." + CT_PROTEIN_SIFT_FIELD); scoreFieldMap.put(POLYPHEN, ANNOTATION_FIELD + "." + CONSEQUENCE_TYPE_FIELD + "." + CT_PROTEIN_POLYPHEN_FIELD); scoreFieldMap.put(PHAST_CONS, ANNOTATION_FIELD + "." + CONSERVED_REGION_PHASTCONS_FIELD); scoreFieldMap.put(PHYLOP, ANNOTATION_FIELD + "." + CONSERVED_REGION_PHYLOP_FIELD); scoreFieldMap.put(GERP, ANNOTATION_FIELD + "." + CONSERVED_REGION_GERP_FIELD); scoreFieldMap.put(CADD_SCALED, ANNOTATION_FIELD + "." + FUNCTIONAL_CADD_SCALED_FIELD); scoreFieldMap.put(CADD_RAW, ANNOTATION_FIELD + "." + FUNCTIONAL_CADD_RAW_FIELD); SCORE_FIELD_MAP = Collections.unmodifiableMap(scoreFieldMap); } public DocumentToVariantAnnotationConverter() { jsonObjectMapper = new ObjectMapper(); jsonObjectMapper.configure(SerializationFeature.WRITE_NULL_MAP_VALUES, false); jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); writer = jsonObjectMapper.writer(); } @Override public VariantAnnotation convertToDataModelType(Document object) { return convertToDataModelType(object, null); } public VariantAnnotation convertToDataModelType(Document object, Document customAnnotation) { VariantAnnotation va = new VariantAnnotation(); //ConsequenceType List<ConsequenceType> consequenceTypes = new LinkedList<>(); Object cts = object.get(CONSEQUENCE_TYPE_FIELD); if (cts != null && cts instanceof List) { for (Object o : ((List) cts)) { if (o instanceof Document) { Document ct = (Document) o; //SO accession name List<String> soAccessionNames = new LinkedList<>(); if (ct.containsKey(CT_SO_ACCESSION_FIELD)) { if (ct.get(CT_SO_ACCESSION_FIELD) instanceof List) { List<Integer> list = (List) ct.get(CT_SO_ACCESSION_FIELD); for (Integer so : list) { soAccessionNames.add(ConsequenceTypeMappings.accessionToTerm.get(so)); } } else { soAccessionNames.add( ConsequenceTypeMappings.accessionToTerm.get(ct.get(CT_SO_ACCESSION_FIELD))); } } //ProteinSubstitutionScores List<Score> proteinSubstitutionScores = new LinkedList<>(); if (ct.containsKey(CT_PROTEIN_SUBSTITUTION_SCORE_FIELD)) { List<Document> list = (List) ct.get(CT_PROTEIN_SUBSTITUTION_SCORE_FIELD); for (Document document : list) { proteinSubstitutionScores.add(buildScore(document)); } } addScore(ct, proteinSubstitutionScores, POLYPHEN, CT_PROTEIN_POLYPHEN_FIELD); addScore(ct, proteinSubstitutionScores, SIFT, CT_PROTEIN_SIFT_FIELD); List<ProteinFeature> features = new ArrayList<>(); if (ct.containsKey(CT_PROTEIN_FEATURE_FIELD)) { List<Document> featureDocuments = (List) ct.get(CT_PROTEIN_FEATURE_FIELD); for (Document featureDocument : featureDocuments) { features.add( new ProteinFeature(getDefault(featureDocument, CT_PROTEIN_FEATURE_ID_FIELD, ""), getDefault(featureDocument, CT_PROTEIN_FEATURE_START_FIELD, 0), getDefault(featureDocument, CT_PROTEIN_FEATURE_END_FIELD, 0), getDefault(featureDocument, CT_PROTEIN_FEATURE_TYPE_FIELD, ""), getDefault(featureDocument, CT_PROTEIN_FEATURE_DESCRIPTION_FIELD, ""))); } } ProteinVariantAnnotation proteinVariantAnnotation = buildProteinVariantAnnotation( getDefault(ct, CT_PROTEIN_UNIPROT_ACCESSION, null), getDefault(ct, CT_PROTEIN_UNIPROT_NAME, null), getDefault(ct, CT_AA_POSITION_FIELD, 0), getDefault(ct, CT_AA_REFERENCE_FIELD, ""), getDefault(ct, CT_AA_ALTERNATE_FIELD, ""), getDefault(ct, CT_PROTEIN_UNIPROT_VARIANT_ID, null), proteinSubstitutionScores, getDefault(ct, CT_PROTEIN_KEYWORDS, Collections.emptyList()), features); consequenceTypes.add(buildConsequenceType(getDefault(ct, CT_GENE_NAME_FIELD, ""), getDefault(ct, CT_ENSEMBL_GENE_ID_FIELD, ""), getDefault(ct, CT_ENSEMBL_TRANSCRIPT_ID_FIELD, ""), getDefault(ct, CT_STRAND_FIELD, "+"), getDefault(ct, CT_BIOTYPE_FIELD, ""), getDefault(ct, CT_EXON_NUMBER_FIELD, 0), getDefault(ct, CT_TRANSCRIPT_ANNOT_FLAGS, Collections.emptyList()), getDefault(ct, CT_C_DNA_POSITION_FIELD, 0), getDefault(ct, CT_CDS_POSITION_FIELD, 0), getDefault(ct, CT_CODON_FIELD, ""), soAccessionNames, proteinVariantAnnotation)); } } } va.setConsequenceTypes(consequenceTypes); //Conserved Region Scores List<Score> conservedRegionScores = new LinkedList<>(); if (object.containsKey(CONSERVED_REGION_SCORE_FIELD)) { List<Document> list = (List) object.get(CONSERVED_REGION_SCORE_FIELD); for (Document dbObject : list) { conservedRegionScores.add(buildScore(dbObject)); } } addScore(object, conservedRegionScores, PHAST_CONS, CONSERVED_REGION_PHASTCONS_FIELD); addScore(object, conservedRegionScores, PHYLOP, CONSERVED_REGION_PHYLOP_FIELD); addScore(object, conservedRegionScores, GERP, CONSERVED_REGION_GERP_FIELD); va.setConservation(conservedRegionScores); //Population frequencies List<PopulationFrequency> populationFrequencies = new LinkedList<>(); if (object.containsKey(POPULATION_FREQUENCIES_FIELD)) { List<Document> list = (List) object.get(POPULATION_FREQUENCIES_FIELD); for (Document dbObject : list) { populationFrequencies .add(new PopulationFrequency(getDefault(dbObject, POPULATION_FREQUENCY_STUDY_FIELD, ""), getDefault(dbObject, POPULATION_FREQUENCY_POP_FIELD, ""), getDefault(dbObject, POPULATION_FREQUENCY_REFERENCE_ALLELE_FIELD, ""), getDefault(dbObject, POPULATION_FREQUENCY_ALTERNATE_ALLELE_FIELD, ""), (float) getDefault(dbObject, POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD, -1.0), (float) getDefault(dbObject, POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD, -1.0), -1.0f, -1.0f, -1.0f)); } } va.setPopulationFrequencies(populationFrequencies); // Gene trait association List<GeneTraitAssociation> geneTraitAssociations = new LinkedList<>(); if (object.containsKey(GENE_TRAIT_FIELD)) { List<Document> list = (List) object.get(GENE_TRAIT_FIELD); for (Document document : list) { geneTraitAssociations.add(new GeneTraitAssociation(getDefault(document, GENE_TRAIT_ID_FIELD, ""), getDefault(document, GENE_TRAIT_NAME_FIELD, ""), getDefault(document, GENE_TRAIT_HPO_FIELD, ""), (float) getDefault(document, GENE_TRAIT_SCORE_FIELD, 0F), 0, //getDefault(document, GENE_TRAIT_PUBMEDS_FIELD, 0), getDefault(document, GENE_TRAIT_TYPES_FIELD, Collections.emptyList()), Collections.emptyList(), getDefault(document, GENE_TRAIT_SOURCE_FIELD, ""))); } } va.setGeneTraitAssociation(geneTraitAssociations); // Drug-Gene Interactions List<GeneDrugInteraction> drugs = new LinkedList<>(); if (object.containsKey(DRUG_FIELD)) { List<Document> list = (List) object.get(DRUG_FIELD); for (Document dbObject : list) { //drugs.add(dbObject.toMap()); drugs.add(new GeneDrugInteraction(getDefault(dbObject, DRUG_GENE_FIELD, ""), getDefault(dbObject, DRUG_NAME_FIELD, ""), getDefault(dbObject, DRUG_SOURCE_FIELD, DEFAULT_DRUB_SOURCE), getDefault(dbObject, DRUG_STUDY_TYPE_FIELD, ""), "")); // "convertToStorageType" stores the study_type within the // DRUG_SOURCE_FIELD } } va.setGeneDrugInteraction(drugs); //XREfs Object xrs = object.get(XREFS_FIELD); if (xrs != null && xrs instanceof List) { List<Xref> xrefs = new LinkedList<>(); for (Object o : (List) xrs) { if (o instanceof Document) { Document xref = (Document) o; xrefs.add(new Xref((String) xref.get(XREF_ID_FIELD), (String) xref.get(XREF_SOURCE_FIELD))); } } va.setXrefs(xrefs); } //Functional score List<Score> functionalScore = new LinkedList<>(); if (object.containsKey(FUNCTIONAL_SCORE)) { List<Document> scores = object.get(FUNCTIONAL_SCORE, List.class); for (Document document : scores) { functionalScore.add(buildScore(document)); } } addScore(object, functionalScore, CADD_SCALED, FUNCTIONAL_CADD_SCALED_FIELD); addScore(object, functionalScore, CADD_RAW, FUNCTIONAL_CADD_RAW_FIELD); va.setFunctionalScore(functionalScore); //Clinical Data if (object.containsKey(CLINICAL_DATA_FIELD)) { va.setVariantTraitAssociation(parseClinicalData((Document) object.get(CLINICAL_DATA_FIELD))); } if (customAnnotation != null) { Map<String, AdditionalAttribute> additionalAttributes = convertAdditionalAttributesToDataModelType( customAnnotation); va.setAdditionalAttributes(additionalAttributes); } return va; } public void addScore(Document object, List<Score> functionalScore, String source, String key) { if (object.containsKey(key)) { Document document = (Document) object.get(key); functionalScore.add(buildScore(source, document)); } } private Score buildScore(Document document) { return buildScore("", document); } private Score buildScore(String source, Document document) { return new Score(getDefault(document, SCORE_SCORE_FIELD, 0.0), getDefault(document, SCORE_SOURCE_FIELD, source), getDefault(document, SCORE_DESCRIPTION_FIELD, "")); } private ConsequenceType buildConsequenceType(String geneName, String ensemblGeneId, String ensemblTranscriptId, String strand, String biotype, Integer exonNumber, List<String> transcriptAnnotationFlags, Integer cDnaPosition, Integer cdsPosition, String codon, List<String> soNameList, ProteinVariantAnnotation proteinVariantAnnotation) { List<SequenceOntologyTerm> soTerms = new ArrayList<>(soNameList.size()); for (String soName : soNameList) { soTerms.add(new SequenceOntologyTerm(ConsequenceTypeMappings.getSoAccessionString(soName), soName)); } return new ConsequenceType(geneName, ensemblGeneId, ensemblTranscriptId, strand, biotype, exonNumber, transcriptAnnotationFlags, cDnaPosition, cdsPosition, codon, proteinVariantAnnotation, soTerms); } private ProteinVariantAnnotation buildProteinVariantAnnotation(String uniprotAccession, String uniprotName, int aaPosition, String aaReference, String aaAlternate, String uniprotVariantId, List<Score> proteinSubstitutionScores, List<String> keywords, List<ProteinFeature> features) { if (areAllEmpty(uniprotAccession, uniprotName, aaPosition, aaReference, aaAlternate, uniprotVariantId, proteinSubstitutionScores, keywords, features)) { return null; } else { return new ProteinVariantAnnotation(uniprotAccession, uniprotName, aaPosition, aaReference, aaAlternate, uniprotVariantId, null, proteinSubstitutionScores, keywords, features); } } private VariantTraitAssociation parseClinicalData(Document clinicalData) { if (clinicalData != null) { int size = 0; VariantTraitAssociation variantTraitAssociation = new VariantTraitAssociation(); List cosmicDBList = (List) clinicalData.get(CLINICAL_COSMIC_FIELD); if (cosmicDBList != null) { List<Cosmic> cosmicList = new ArrayList<>(cosmicDBList.size()); for (Object object : cosmicDBList) { cosmicList.add(jsonObjectMapper.convertValue(object, Cosmic.class)); } size += cosmicList.size(); variantTraitAssociation.setCosmic(cosmicList); } List gwasDBList = (List) clinicalData.get(CLINICAL_GWAS_FIELD); if (gwasDBList != null) { List<Gwas> gwasList = new ArrayList<>(gwasDBList.size()); for (Object object : gwasDBList) { gwasList.add(jsonObjectMapper.convertValue(object, Gwas.class)); } size += gwasList.size(); variantTraitAssociation.setGwas(gwasList); } List clinvarDBList = (List) clinicalData.get(CLINICAL_CLINVAR_FIELD); if (clinvarDBList != null) { List<ClinVar> clinvarList = new ArrayList<>(clinvarDBList.size()); for (Object object : clinvarDBList) { clinvarList.add(jsonObjectMapper.convertValue(object, ClinVar.class)); } size += clinvarList.size(); variantTraitAssociation.setClinvar(clinvarList); } if (size > 0) { return variantTraitAssociation; } else { return null; } } return null; } public Map<String, AdditionalAttribute> convertAdditionalAttributesToDataModelType(Document customAnnotation) { Map<String, AdditionalAttribute> attributeMap = new HashMap<>(); for (String key : customAnnotation.keySet()) { Document document = customAnnotation.get(key, Document.class); HashMap<String, String> map = new HashMap<>(); document.forEach((k, value) -> map.put(k, value.toString())); AdditionalAttribute attribute = new AdditionalAttribute(map); attributeMap.put(key, attribute); } return attributeMap; } @Override public Document convertToStorageType(VariantAnnotation variantAnnotation) { Document document = new Document(); Set<Document> xrefs = new HashSet<>(); List<Document> cts = new LinkedList<>(); //Annotation ID document.put(ANNOT_ID_FIELD, "?"); //Variant ID if (variantAnnotation.getId() != null && !variantAnnotation.getId().isEmpty()) { xrefs.add(convertXrefToStorage(variantAnnotation.getId(), "dbSNP")); } //ConsequenceType if (variantAnnotation.getConsequenceTypes() != null) { Set<String> gnSo = new HashSet<>(); List<ConsequenceType> consequenceTypes = variantAnnotation.getConsequenceTypes(); for (ConsequenceType consequenceType : consequenceTypes) { Document ct = new Document(); putNotNull(ct, CT_GENE_NAME_FIELD, consequenceType.getGeneName()); putNotNull(ct, CT_ENSEMBL_GENE_ID_FIELD, consequenceType.getEnsemblGeneId()); putNotNull(ct, CT_ENSEMBL_TRANSCRIPT_ID_FIELD, consequenceType.getEnsemblTranscriptId()); // putNotNull(ct, RELATIVE_POS_FIELD, consequenceType.getRelativePosition()); putNotNull(ct, CT_CODON_FIELD, consequenceType.getCodon()); putNotDefault(ct, CT_STRAND_FIELD, consequenceType.getStrand(), DEFAULT_STRAND_VALUE); putNotNull(ct, CT_BIOTYPE_FIELD, consequenceType.getBiotype()); putNotNull(ct, CT_EXON_NUMBER_FIELD, consequenceType.getExonNumber()); putNotNull(ct, CT_TRANSCRIPT_ANNOT_FLAGS, consequenceType.getTranscriptAnnotationFlags()); putNotNull(ct, CT_C_DNA_POSITION_FIELD, consequenceType.getCdnaPosition()); putNotNull(ct, CT_CDS_POSITION_FIELD, consequenceType.getCdsPosition()); if (consequenceType.getSequenceOntologyTerms() != null) { List<Integer> soAccession = new LinkedList<>(); for (SequenceOntologyTerm entry : consequenceType.getSequenceOntologyTerms()) { soAccession.add(ConsequenceTypeMappings.termToAccession.get(entry.getName())); } putNotNull(ct, CT_SO_ACCESSION_FIELD, soAccession); for (Integer so : soAccession) { if (StringUtils.isNotEmpty(consequenceType.getGeneName())) { gnSo.add(buildGeneSO(consequenceType.getGeneName(), so)); } if (StringUtils.isNotEmpty(consequenceType.getEnsemblGeneId())) { gnSo.add(buildGeneSO(consequenceType.getEnsemblGeneId(), so)); } if (StringUtils.isNotEmpty(consequenceType.getEnsemblTranscriptId())) { gnSo.add(buildGeneSO(consequenceType.getEnsemblTranscriptId(), so)); } if (consequenceType.getProteinVariantAnnotation() != null) { if (StringUtils.isNotEmpty( consequenceType.getProteinVariantAnnotation().getUniprotAccession())) { gnSo.add(buildGeneSO( consequenceType.getProteinVariantAnnotation().getUniprotAccession(), so)); } if (StringUtils .isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotName())) { gnSo.add(buildGeneSO(consequenceType.getProteinVariantAnnotation().getUniprotName(), so)); } } } } //Protein annotation if (consequenceType.getProteinVariantAnnotation() != null) { putNotNull(ct, CT_AA_POSITION_FIELD, consequenceType.getProteinVariantAnnotation().getPosition()); putNotNull(ct, CT_AA_REFERENCE_FIELD, consequenceType.getProteinVariantAnnotation().getReference()); putNotNull(ct, CT_AA_ALTERNATE_FIELD, consequenceType.getProteinVariantAnnotation().getAlternate()); putNotNull(ct, CT_PROTEIN_UNIPROT_ACCESSION, consequenceType.getProteinVariantAnnotation().getUniprotAccession()); putNotNull(ct, CT_PROTEIN_UNIPROT_NAME, consequenceType.getProteinVariantAnnotation().getUniprotName()); putNotNull(ct, CT_PROTEIN_UNIPROT_VARIANT_ID, consequenceType.getProteinVariantAnnotation().getUniprotVariantId()); //Protein substitution region score if (consequenceType.getProteinVariantAnnotation().getSubstitutionScores() != null) { List<Document> proteinSubstitutionScores = new LinkedList<>(); for (Score score : consequenceType.getProteinVariantAnnotation().getSubstitutionScores()) { if (score != null) { if (score.getSource().equals(POLYPHEN)) { putNotNull(ct, CT_PROTEIN_POLYPHEN_FIELD, convertScoreToStorageNoSource(score)); } else if (score.getSource().equals(SIFT)) { putNotNull(ct, CT_PROTEIN_SIFT_FIELD, convertScoreToStorageNoSource(score)); } else { proteinSubstitutionScores.add(convertScoreToStorage(score)); } } } putNotNull(ct, CT_PROTEIN_SUBSTITUTION_SCORE_FIELD, proteinSubstitutionScores); } putNotNull(ct, CT_PROTEIN_KEYWORDS, consequenceType.getProteinVariantAnnotation().getKeywords()); List<ProteinFeature> features = consequenceType.getProteinVariantAnnotation().getFeatures(); if (features != null) { List<Document> documentFeatures = new ArrayList<>(features.size()); for (ProteinFeature feature : features) { Document documentFeature = new Document(); putNotNull(documentFeature, CT_PROTEIN_FEATURE_ID_FIELD, feature.getId()); putNotNull(documentFeature, CT_PROTEIN_FEATURE_START_FIELD, feature.getStart()); putNotNull(documentFeature, CT_PROTEIN_FEATURE_END_FIELD, feature.getEnd()); putNotNull(documentFeature, CT_PROTEIN_FEATURE_TYPE_FIELD, feature.getType()); putNotNull(documentFeature, CT_PROTEIN_FEATURE_DESCRIPTION_FIELD, feature.getDescription()); documentFeatures.add(documentFeature); } putNotNull(ct, CT_PROTEIN_FEATURE_FIELD, documentFeatures); } if (StringUtils .isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotAccession())) { xrefs.add(convertXrefToStorage( consequenceType.getProteinVariantAnnotation().getUniprotAccession(), "UniProt")); } if (StringUtils.isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotName())) { xrefs.add(convertXrefToStorage( consequenceType.getProteinVariantAnnotation().getUniprotName(), "UniProt")); } if (StringUtils .isNotEmpty(consequenceType.getProteinVariantAnnotation().getUniprotVariantId())) { xrefs.add(convertXrefToStorage( consequenceType.getProteinVariantAnnotation().getUniprotVariantId(), "UniProt")); } } cts.add(ct); if (StringUtils.isNotEmpty(consequenceType.getGeneName())) { xrefs.add(convertXrefToStorage(consequenceType.getGeneName(), "HGNC")); } if (StringUtils.isNotEmpty(consequenceType.getEnsemblGeneId())) { xrefs.add(convertXrefToStorage(consequenceType.getEnsemblGeneId(), "ensemblGene")); } if (StringUtils.isNotEmpty(consequenceType.getEnsemblTranscriptId())) { xrefs.add(convertXrefToStorage(consequenceType.getEnsemblTranscriptId(), "ensemblTranscript")); } } putNotNull(document, GENE_SO_FIELD, gnSo); putNotNull(document, CONSEQUENCE_TYPE_FIELD, cts); } //Conserved region score if (variantAnnotation.getConservation() != null) { List<Document> conservedRegionScores = new LinkedList<>(); for (Score score : variantAnnotation.getConservation()) { if (score != null) { if (score.getSource().equals(PHYLOP)) { putNotNull(document, CONSERVED_REGION_PHYLOP_FIELD, convertScoreToStorageNoSource(score)); } else if (score.getSource().equals(PHAST_CONS)) { putNotNull(document, CONSERVED_REGION_PHASTCONS_FIELD, convertScoreToStorageNoSource(score)); } else if (score.getSource().equals(GERP)) { putNotNull(document, CONSERVED_REGION_GERP_FIELD, convertScoreToStorageNoSource(score)); } else { conservedRegionScores.add(convertScoreToStorage(score)); } } } putNotNull(document, CONSERVED_REGION_SCORE_FIELD, conservedRegionScores); } // Gene trait association if (variantAnnotation.getGeneTraitAssociation() != null) { List<Document> geneTraitAssociations = new LinkedList<>(); for (GeneTraitAssociation geneTraitAssociation : variantAnnotation.getGeneTraitAssociation()) { if (geneTraitAssociation != null) { Document d = new Document(); putNotNull(d, GENE_TRAIT_ID_FIELD, geneTraitAssociation.getId()); putNotNull(d, GENE_TRAIT_NAME_FIELD, geneTraitAssociation.getName()); putNotNull(d, GENE_TRAIT_SCORE_FIELD, geneTraitAssociation.getScore()); putNotNull(d, GENE_TRAIT_HPO_FIELD, geneTraitAssociation.getHpo()); if (StringUtils.isNotEmpty(geneTraitAssociation.getHpo())) { xrefs.add(convertXrefToStorage(geneTraitAssociation.getHpo(), "HPO")); } // putNotNull(d, GENE_TRAIT_PUBMEDS_FIELD, geneTraitAssociation.getNumberOfPubmeds()); putNotNull(d, GENE_TRAIT_TYPES_FIELD, geneTraitAssociation.getAssociationTypes()); // putNotNull(d, GENE_TRAIT_SOURCES_FIELD, geneTraitAssociation.getSources()); putNotNull(d, GENE_TRAIT_SOURCE_FIELD, geneTraitAssociation.getSource()); geneTraitAssociations.add(d); } } putNotNull(document, GENE_TRAIT_FIELD, geneTraitAssociations); } //Population frequencies if (variantAnnotation.getPopulationFrequencies() != null) { List<Document> populationFrequencies = new LinkedList<>(); for (PopulationFrequency populationFrequency : variantAnnotation.getPopulationFrequencies()) { if (populationFrequency != null) { populationFrequencies.add(convertPopulationFrequencyToStorage(populationFrequency)); } } putNotNull(document, POPULATION_FREQUENCIES_FIELD, populationFrequencies); } // Drug-Gene Interactions if (variantAnnotation.getGeneDrugInteraction() != null) { List<Document> drugGeneInteractions = new LinkedList<>(); List<GeneDrugInteraction> geneDrugInteractionList = variantAnnotation.getGeneDrugInteraction(); if (geneDrugInteractionList != null) { for (GeneDrugInteraction geneDrugInteraction : geneDrugInteractionList) { Document drugDbObject = new Document(DRUG_GENE_FIELD, geneDrugInteraction.getGeneName()); putNotNull(drugDbObject, DRUG_NAME_FIELD, geneDrugInteraction.getDrugName()); putNotDefault(drugDbObject, DRUG_SOURCE_FIELD, geneDrugInteraction.getSource(), DEFAULT_DRUB_SOURCE); putNotNull(drugDbObject, DRUG_STUDY_TYPE_FIELD, geneDrugInteraction.getStudyType()); drugGeneInteractions.add(drugDbObject); } } putNotNull(document, DRUG_FIELD, drugGeneInteractions); } //XREFs if (variantAnnotation.getXrefs() != null) { for (Xref xref : variantAnnotation.getXrefs()) { xrefs.add(convertXrefToStorage(xref.getId(), xref.getSource())); } } putNotNull(document, XREFS_FIELD, xrefs); //Functional score if (variantAnnotation.getFunctionalScore() != null) { List<Document> scores = new ArrayList<>(variantAnnotation.getFunctionalScore().size()); for (Score score : variantAnnotation.getFunctionalScore()) { if (score != null) { if (score.getSource().equals(CADD_RAW)) { putNotNull(document, FUNCTIONAL_CADD_RAW_FIELD, convertScoreToStorageNoSource(score)); } else if (score.getSource().equals(CADD_SCALED)) { putNotNull(document, FUNCTIONAL_CADD_SCALED_FIELD, convertScoreToStorageNoSource(score)); } else { scores.add(convertScoreToStorage(score)); } } } putNotNull(document, FUNCTIONAL_SCORE, scores); } //Clinical Data Document clinicalDocument = new Document(); if (variantAnnotation.getVariantTraitAssociation() != null) { putNotNull(clinicalDocument, CLINICAL_COSMIC_FIELD, generateClinicalDBList(variantAnnotation.getVariantTraitAssociation().getCosmic())); if (variantAnnotation.getVariantTraitAssociation().getCosmic() != null) { variantAnnotation.getVariantTraitAssociation().getCosmic().stream().map(Cosmic::getMutationId) .filter(StringUtils::isNotEmpty) .forEach(mutationId -> xrefs.add(convertXrefToStorage(mutationId, "COSMIC"))); } putNotNull(clinicalDocument, CLINICAL_GWAS_FIELD, generateClinicalDBList(variantAnnotation.getVariantTraitAssociation().getGwas())); putNotNull(clinicalDocument, CLINICAL_CLINVAR_FIELD, generateClinicalDBList(variantAnnotation.getVariantTraitAssociation().getClinvar())); if (variantAnnotation.getVariantTraitAssociation().getClinvar() != null) { variantAnnotation.getVariantTraitAssociation().getClinvar().stream().map(ClinVar::getAccession) .filter(StringUtils::isNotEmpty) .forEach(accession -> xrefs.add(convertXrefToStorage(accession, "ClinVar"))); } } if (!clinicalDocument.isEmpty()) { document.put(CLINICAL_DATA_FIELD, clinicalDocument); } return document; } public static String buildGeneSO(String gene, Integer so) { return gene == null ? null : gene + '_' + so; } private <T> List<Document> generateClinicalDBList(List<T> objectList) { if (objectList != null) { List<Document> list = new ArrayList<>(objectList.size()); for (T object : objectList) { try { if (object instanceof GenericRecord) { list.add(Document.parse(object.toString())); } else { list.add(Document.parse(writer.writeValueAsString(object))); } } catch (JsonProcessingException e) { e.printStackTrace(); logger.error("Error serializing Clinical Data " + object.getClass(), e); } } return list; } return null; } public Document convertScoreToStorageNoSource(Score score) { return convertScoreToStorage(score.getScore(), null, score.getDescription()); } private Document convertScoreToStorage(Score score) { return convertScoreToStorage(score.getScore(), score.getSource(), score.getDescription()); } private Document convertScoreToStorage(double score, String source, String description) { Document dbObject = new Document(SCORE_SCORE_FIELD, score); putNotNull(dbObject, SCORE_SOURCE_FIELD, source); putNotNull(dbObject, SCORE_DESCRIPTION_FIELD, description); return dbObject; } private Document convertPopulationFrequencyToStorage(PopulationFrequency populationFrequency) { Document dbObject = new Document(POPULATION_FREQUENCY_STUDY_FIELD, populationFrequency.getStudy()); putNotNull(dbObject, POPULATION_FREQUENCY_POP_FIELD, populationFrequency.getPopulation()); putNotNull(dbObject, POPULATION_FREQUENCY_REFERENCE_FREQUENCY_FIELD, populationFrequency.getRefAlleleFreq()); putNotNull(dbObject, POPULATION_FREQUENCY_ALTERNATE_FREQUENCY_FIELD, populationFrequency.getAltAlleleFreq()); return dbObject; } private Document convertXrefToStorage(String id, String source) { Document dbObject = new Document(XREF_ID_FIELD, id); dbObject.put(XREF_SOURCE_FIELD, source); return dbObject; } public Document convertToStorageType(Map<String, AdditionalAttribute> attributes) { Document document = new Document(); attributes.forEach((key, attribute) -> { document.put(key, convertToStorageType(attribute)); }); return document; } public static Document convertToStorageType(AdditionalAttribute attribute) { Document document = new Document(); document.putAll(attribute.getAttribute()); return document; } }