Java tutorial
/* * Copyright 2015 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.biodata.tools.variant.converter; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.variant.annotation.VepParser; import org.opencb.biodata.models.feature.Genotype; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.VariantVcfFactory; import org.opencb.biodata.models.variant.avro.*; import org.opencb.biodata.models.variant.stats.VariantStats; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; import static org.opencb.biodata.models.variant.StudyEntry.isSamplesPositionMapSorted; /** * @author Pawan Pal & Kalyan * */ public class VariantContextToVariantConverter implements Converter<VariantContext, Variant>, Serializable { private final String studyId; private final String fileId; private LinkedHashMap<String, Integer> samplesPosition; private List<String> consequenceTypeFields; protected Logger logger = LoggerFactory.getLogger(this.getClass().toString()); VariantContextToVariantConverter() { this("", "", null); } @Deprecated public VariantContextToVariantConverter(String studyId, String fileId) { this(studyId, fileId, null); } public VariantContextToVariantConverter(String studyId, String fileId, List<String> samples) { this.studyId = studyId; this.fileId = fileId; // TODO this must be parsed from VCF header consequenceTypeFields = Arrays.asList(); samplesPosition = createSamplesPositionMap(samples); } private static LinkedHashMap<String, Integer> createSamplesPositionMap(List<String> samples) { if (samples == null) { return null; } LinkedHashMap<String, Integer> samplesPosition = new LinkedHashMap<>(); int position = 0; for (String sample : samples) { samplesPosition.put(sample, position++); } isSamplesPositionMapSorted(samplesPosition); return samplesPosition; } @Override public Variant convert(VariantContext variantContext) { return convert(variantContext, new Variant()); } /** * * @param variantContext * @param reuse an instance to reuse. * @return */ public Variant convert(VariantContext variantContext, Variant reuse) { Variant variant = reuse; variant.setChromosome(variantContext.getContig()); variant.setStart(variantContext.getStart()); variant.setEnd(variantContext.getEnd()); // Setting reference and alternate alleles variant.setReference(variantContext.getReference().getDisplayString()); List<Allele> alternateAlleleList = variantContext.getAlternateAlleles(); if (alternateAlleleList != null && !alternateAlleleList.isEmpty()) { variant.setAlternate(alternateAlleleList.get(0).toString()); } else { alternateAlleleList = Collections.emptyList(); variant.setAlternate(""); } //Do not need to store dot ID. It means that this variant does not have any ID String[] idsArray = variantContext.getID().split(VCFConstants.ID_FIELD_SEPARATOR); List<String> ids = new ArrayList<>(idsArray.length); for (String id : idsArray) { if (!id.equals(".")) { ids.add(id); } } variant.setIds(ids); variant.resetLength(); // variant.setLength(Math.max(variant.getReference().length(), variant.getAlternate().length())); // TODO Nacho please add CNV when symbolic final VariantType type; if (!variantContext.getType().equals(VariantContext.Type.NO_VARIATION)) { type = Variant.inferType(variant.getReference(), variant.getAlternate(), variant.getLength()); } else { type = VariantType.NO_VARIATION; } variant.setType(type); // variant.setType(getEnumFromString(VariantType.class, variantContext.getType().toString())); // VariantType variantType = getEnumFromString(VariantType.class, variantContext.getType().toString()); // switch (variantType) { // case SNP: // if (variant.getIds().isEmpty()) { // variant.setType(VariantType.SNV); // } else { // variant.setType(VariantType.SNP); // } // break; // case INDEL: // if (variant.getLength() > Variant.SV_THRESHOLD) { // if (variant.getReference().isEmpty()) { // variant.setType(VariantType.INSERTION); // } else if (variant.getAlternate().isEmpty()) { // variant.setType(VariantType.DELETION); // } else { // variant.setType(VariantType.SV); // } // } else { // variant.setType(VariantType.INDEL); // } // break; // default: // variant.setType(variantType); // } // variant.resetHGVS(); // set variantSourceEntry fields List<StudyEntry> studies = new ArrayList<>(); StudyEntry studyEntry = new StudyEntry(); // For time being setting the hard coded values for FileId and Study ID studyEntry.setStudyId(studyId); FileEntry fileEntry = new FileEntry(); fileEntry.setFileId(fileId); fileEntry.setCall(""); Map<String, String> attributes = new HashMap<>(); for (String key : variantContext.getAttributes().keySet()) { // Do not use "getAttributeAsString" for lists. // It will add brackets surrounding the values if (variantContext.getAttribute(key, "") instanceof List) { attributes.put(key, StringUtils.join(variantContext.getAttributeAsList(key), VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); } else { attributes.put(key, variantContext.getAttributeAsString(key, "")); } } // QUAL if (variantContext.getLog10PError() != VariantContext.NO_LOG10_PERROR) { attributes.put(VariantVcfFactory.QUAL, Double.toString(variantContext.getPhredScaledQual())); } // FILTER Set<String> filter = variantContext.getFiltersMaybeNull(); if (filter == null) { attributes.put(VariantVcfFactory.FILTER, VCFConstants.UNFILTERED); } else if (filter.isEmpty()) { attributes.put(VariantVcfFactory.FILTER, VCFConstants.PASSES_FILTERS_v4); } else { if (filter.size() == 1) { attributes.put(VariantVcfFactory.FILTER, filter.iterator().next()); } else { attributes.put(VariantVcfFactory.FILTER, filter.stream().sorted().collect(Collectors.joining(VCFConstants.FILTER_CODE_SEPARATOR))); } } fileEntry.setAttributes(attributes); studyEntry.setFiles(Collections.singletonList(fileEntry)); // We need to convert Allele object to String // We skip the first alternate allele since these are the secondaries List<AlternateCoordinate> secondaryAlternateList = new ArrayList<>( Math.max(alternateAlleleList.size() - 1, 0)); List<String> alternates = new ArrayList<>(alternateAlleleList.size()); if (alternateAlleleList.size() > 0) { alternates.add(alternateAlleleList.get(0).toString()); } for (int i = 1; i < alternateAlleleList.size(); i++) { String allele = alternateAlleleList.get(i).toString(); alternates.add(allele); secondaryAlternateList.add(new AlternateCoordinate(null, null, null, null, allele, variant.getType())); // secondaryAlternateList.add(new AlternateCoordinate(null, null, null, null, allele, variantType)); } studyEntry.setSecondaryAlternates(secondaryAlternateList); // set variant format // FIXME: This code is not respecting the original format order List<String> formatFields = new ArrayList<>(10); if (!variantContext.getGenotypes().isEmpty()) { htsjdk.variant.variantcontext.Genotype gt = variantContext.getGenotypes().get(0); //FullVCFCodec saves ALL the format fields in the ExtendedAttributes map. for (String key : gt.getExtendedAttributes().keySet()) { if (key.equals(VCFConstants.GENOTYPE_KEY)) { //GT must be the first one formatFields.add(0, key); } else { formatFields.add(key); } } } studyEntry.setFormat(formatFields); if (samplesPosition == null) { logger.warn("Using alphabetical order for samples position!"); samplesPosition = createSamplesPositionMap(variantContext.getSampleNamesOrderedByName()); } List<List<String>> sampleDataList = new ArrayList<>(samplesPosition.size()); for (String sampleName : samplesPosition.keySet()) { htsjdk.variant.variantcontext.Genotype genotype = variantContext.getGenotype(sampleName); List<String> sampleList = new ArrayList<>(formatFields.size()); for (String formatField : formatFields) { final String value; switch (formatField) { case VCFConstants.GENOTYPE_KEY: String genotypeValue; // if (variantType.equals(VariantType.SYMBOLIC)) { if (variant.getType().equals(VariantType.SYMBOLIC) || variant.getType().equals(VariantType.CNV)) { genotypeValue = genotype.getGenotypeString(false).replaceAll("\\*", ""); } else { genotypeValue = genotype.getGenotypeString(true); } // sometimes (FreeBayes) a single '.' is written for some samples if (genotypeValue.equals(".")) { value = "./."; } else { value = new Genotype(genotypeValue, variant.getReference(), alternates).toString(); } break; default: Object attribute = genotype.getAnyAttribute(formatField); if (attribute != null) { if (attribute instanceof Collection) { value = ((List<Object>) attribute).stream().map(Object::toString) .collect(Collectors.joining(",")); } else { value = attribute.toString(); } } else { //Can hts return null fields? //ABSOLUTELY, for missing values value = "."; } break; } sampleList.add(value); } sampleDataList.add(sampleList); } studyEntry.setSamplesData(sampleDataList); studyEntry.setSamplesPosition(samplesPosition); /* * set stats fields. Putting hard coded values for time * being as these value will not be getting from HTSJDK * currently. */ Map<String, VariantStats> stats = new HashMap<>(); //TODO: Call to the Variant Aggregated Stats Parser // stats.put( // "2", // setVariantStatsParams( // setVariantHardyWeinbergStatsParams(), // variantContext)); studyEntry.setStats(stats); studies.add(studyEntry); variant.setStudies(studies); // set VariantAnnotation parameters // TODO: Read annotation from info column if (consequenceTypeFields != null && !consequenceTypeFields.isEmpty()) { variant.setAnnotation(VepParser.parseInfoCsq(consequenceTypeFields, variantContext.getAttributes().get("CSQ").toString())); } return variant; } /** * method to set Consequence Type Parameters * @return consequenceTypeList */ private List<ConsequenceType> setConsequenceTypeParams() { List<ConsequenceType> consequenceTypeList = new ArrayList<>(); ConsequenceType consequenceType = new ConsequenceType(); consequenceType.setGeneName(null); consequenceType.setEnsemblGeneId(null); consequenceType.setEnsemblTranscriptId(null); consequenceType.setStrand(null); consequenceType.setBiotype(null); consequenceType.setCdnaPosition(null); consequenceType.setCdsPosition(null); consequenceType.setCodon(null); /* * set ExpressionValues list type parameter */ // List<ExpressionValue> expressionValueList = new ArrayList<>(); // ExpressionValue expressionValue = new ExpressionValue(); // expressionValue.setExpression(getEnumFromString(org.opencb.biodata.models.variant.avro.ExpressionCall.class, "UP")); /*expressionValue.setExperimentalFactor(null); expressionValue.setExperimentId(null); expressionValue.setExpression(null); expressionValue.setFactorValue(null); expressionValue.setPvalue(null); expressionValue.setTechnologyPlatform(null);*/ // expressionValueList.add(expressionValue); // consequenceType.setExpression(expressionValueList); /* * set ProteinSubstitutionScores list type parameter */ // List<Score> proteinSubstitutionScoreList = new ArrayList<>(); // Score score = new Score(null, null, null); // proteinSubstitutionScoreList.add(score); ProteinVariantAnnotation proteinVariantAnnotation = new ProteinVariantAnnotation(); proteinVariantAnnotation.setSubstitutionScores(Collections.emptyList()); consequenceType.setProteinVariantAnnotation(proteinVariantAnnotation); /* * set SoTerms list type parameter */ List<SequenceOntologyTerm> sequenceOntologyTerms = new ArrayList<>(); SequenceOntologyTerm sequenceOntologyTerm = new SequenceOntologyTerm(); sequenceOntologyTerm.setAccession(null); sequenceOntologyTerm.setName(null); sequenceOntologyTerms.add(sequenceOntologyTerm); consequenceType.setSequenceOntologyTerms(sequenceOntologyTerms); consequenceType.setStrand(null); /* * Add consequenceType final bean to list */ consequenceTypeList.add(consequenceType); return consequenceTypeList; } /** * method to set Population Frequency Parameters * @return populationFrequencyList */ private List<PopulationFrequency> setPopulationFrequencyParams() { List<PopulationFrequency> populationFrequencyList = new ArrayList<>(); PopulationFrequency populationFrequency = new PopulationFrequency(); populationFrequency.setAltAllele(null); populationFrequency.setAltAlleleFreq(null); populationFrequency.setAltHomGenotypeFreq(null); populationFrequency.setHetGenotypeFreq(null); populationFrequency.setPopulation(null); populationFrequency.setRefAllele(null); populationFrequency.setRefAlleleFreq(null); populationFrequency.setRefHomGenotypeFreq(null); populationFrequency.setStudy(null); // populationFrequency.setSuperPopulation(null); populationFrequencyList.add(populationFrequency); return populationFrequencyList; } /** * method to set Varaint Annotation Parameters * @return variantAnnotation */ private VariantAnnotation setVaraintAnnotationParams() { VariantAnnotation variantAnnotation = new VariantAnnotation(); /* * set AdditionalAttributes map type parameter */ Map<String, AdditionalAttribute> additionalAttributesMap = new HashMap<>(); //additionalAttributesMap.put(null, null); variantAnnotation.setAdditionalAttributes(additionalAttributesMap); /* * set AlternateAllele parameter */ variantAnnotation.setAlternate(null); /* * set CaddScore list type parameter */ // List<CaddScore> caddScoreList = new ArrayList<>(); // CaddScore caddScore = new CaddScore(); /*caddScore.setCScore(null); caddScore.setRawScore(null); caddScore.setTranscriptId(null);*/ // caddScoreList.add(caddScore); // variantAnnotation.setCaddScore(caddScoreList); /* * set Chromosome parameter */ variantAnnotation.setChromosome(null); /* * set Clinical map type parameter */ variantAnnotation.setVariantTraitAssociation( new VariantTraitAssociation(Arrays.asList(), Arrays.asList(), Arrays.asList())); /* * set ConsequenceTypes list type parameter */ variantAnnotation.setConsequenceTypes(setConsequenceTypeParams()); /* * set ConservationScores list type parameter */ List<Score> conservationScoreList = new ArrayList<>(); Score score = new Score(); /*score.setDescription(null); score.setScore(null); score.setSource(null); */ conservationScoreList.add(score); variantAnnotation.setConservation(conservationScoreList); // variantAnnotation.setEnd(0); /* * set GeneDrugInteraction map of list type parameter */ // Map<String, List<String>> geneDrugInteractionMap = new HashMap<>(); List<GeneDrugInteraction> geneDrugInteractionList = new ArrayList<>(); // List<String> geneDrugInteractionList = new ArrayList<>(); //geneDrugInteractionList.add("AAA"); //geneDrugInteractionMap.put("000", geneDrugInteractionList); variantAnnotation.setGeneDrugInteraction(geneDrugInteractionList); /* * set Hgvs list type parameter */ List<String> hgvsList = new ArrayList<>(); //hgvsList.add(null); variantAnnotation.setHgvs(hgvsList); variantAnnotation.setId(null); /* * set PopulationFrequencies list type parameter */ variantAnnotation.setPopulationFrequencies(setPopulationFrequencyParams()); variantAnnotation.setReference(null); variantAnnotation.setStart(0); /* * set Xref list type parameter */ List<Xref> xrefsList = new ArrayList<>(); Xref xref = new Xref(); /*xref.setId(null); xref.setSrc(null);*/ xrefsList.add(xref); variantAnnotation.setXrefs(xrefsList); /* * return variantAnnotation bean */ return variantAnnotation; } /** * method to set Variant Stats Parameters * @param variantHardyWeinbergStats * @param variantContext * @return variantStats */ private VariantStats setVariantStatsParams(VariantHardyWeinbergStats variantHardyWeinbergStats, VariantContext variantContext) { VariantStats variantStats = new VariantStats(); variantStats.setAltAllele("aa"); variantStats.setAltAlleleCount(1); variantStats.setAltAlleleFreq(2.1f); variantStats.setCasesPercentDominant(3.1f); variantStats.setCasesPercentRecessive(5.1f); variantStats.setControlsPercentDominant(1.0f); variantStats.setControlsPercentRecessive(3.1f); variantStats.setMaf(4f); variantStats.setMafAllele("ss"); variantStats.setMendelianErrors(4); variantStats.setMgf(3f); variantStats.setMgfGenotype("AA"); variantStats.setMissingAlleles(3); variantStats.setMissingGenotypes(3); variantStats.setNumSamples(4); variantStats.setPassedFilters(true); variantStats.setQuality((float) variantContext.getPhredScaledQual()); variantStats.setRefAllele("SS"); variantStats.setRefAlleleCount(4); variantStats.setRefAlleleFreq(2f); variantStats.setHw(variantHardyWeinbergStats); variantStats.setVariantType(getEnumFromString(VariantType.class, variantContext.getType().toString())); return variantStats; } /** * method to set VariantHardyWeinberg Stats Parameters * @return variantHardyWeinbergStats */ private VariantHardyWeinbergStats setVariantHardyWeinbergStatsParams() { VariantHardyWeinbergStats variantHardyWeinbergStats = new VariantHardyWeinbergStats(); variantHardyWeinbergStats.setChi2(1f); variantHardyWeinbergStats.setEAa00(2f); variantHardyWeinbergStats.setEAa10(3f); variantHardyWeinbergStats.setEAA11(4f); variantHardyWeinbergStats.setN(1); variantHardyWeinbergStats.setNAa00(2); variantHardyWeinbergStats.setNAa10(3); variantHardyWeinbergStats.setNAA11(4); variantHardyWeinbergStats.setP(1f); variantHardyWeinbergStats.setQ(2f); return variantHardyWeinbergStats; } /** * @param variantType * @param string * @return */ private static <E extends Enum<E>> E getEnumFromString(Class<E> variantType, String string) { if (variantType != null && string != null) { try { return Enum.valueOf(variantType, string.trim().toUpperCase()); } catch (IllegalArgumentException e) { throw new IllegalArgumentException("Unknown variantType " + string); } } return null; } }