Java tutorial
/* * Copyright 2016 EMBL - European Bioinformatics Institute * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package uk.ac.ebi.eva.pipeline.io.mappers; import org.apache.commons.lang.ArrayUtils; import org.opencb.biodata.models.variant.annotation.ConsequenceType; import org.opencb.biodata.models.variant.annotation.Score; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.batch.item.file.LineMapper; import uk.ac.ebi.eva.commons.models.data.VariantAnnotation; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; /** * Map a line in VEP output file to {@link VariantAnnotation} * * Example of VEP output line * 20_60343_G/A 20:60343 A - - - intergenic_variant - - - - - - * 20_60419_A/G 20:60419 G - - - intergenic_variant - - - - - - * 20_60479_C/T 20:60479 T - - - intergenic_variant - - - - - rs149529999 GMAF=T:0.0018;AFR_MAF=T:0.01;AMR_MAF=T:0.0028 * 20_60523_-/C 20:60522-60523 C - - - intergenic_variant - - - - - rs150241001 GMAF=C:0.0115;AFR_MAF=C:0.05;AMR_MAF=C:0.0028 * * Please note that most of the code is from org.opencb.biodata.formats.annotation.io.VepFormatReader * public methods in VepFormatReader can't be reused because there is a reference to the previous line (currentVariantString) * that prevent each line to be independent * * Here each line is mapped to {@link VariantAnnotation}; in case of two annotations for the same variant, a new * {@link VariantAnnotation} object is created containing only the fields that will be appended: * - ConsequenceTypes * - Hgvs */ public class AnnotationLineMapper implements LineMapper<VariantAnnotation> { private static final Logger logger = LoggerFactory.getLogger(AnnotationLineMapper.class); /** * Map a line in VEP output file to {@link VariantAnnotation} * @param line in VEP output * @param lineNumber * @return a {@link VariantAnnotation} * @throws Exception * * Most of the code is from org.opencb.biodata.formats.annotation.io.VepFormatReader#read() with few differences: * - An empty array is initialized for Hgvs (like ConsequenceTypes); * - parseFrequencies is always true and the all line is always parsed; * - The logic to move around the file (read line) and reference to previous line (currentVariantString) are removed; */ @Override public VariantAnnotation mapLine(String line, int lineNumber) { //logger.debug("Mapping line {} to VariantAnnotation", line); ConsequenceType consequenceType = new ConsequenceType(); String[] lineFields = line.split("\t"); Map<String, String> variantMap = parseVariant(lineFields[0], lineFields[1]); // coordinates and alternative are only parsed once VariantAnnotation currentAnnotation = new VariantAnnotation(variantMap.get("chromosome"), Integer.valueOf(variantMap.get("start")), Integer.valueOf(variantMap.get("end")), variantMap.get("reference"), variantMap.get("alternative")); /** * parses extra column and populates fields as required. * Some lines do not have extra field and end with a \t: the split function above does not return that field */ if (lineFields.length == 14) { parseExtraField(consequenceType, lineFields[13], currentAnnotation); } // Remaining fields only of interest if the feature is a transcript if (lineFields[5].toLowerCase().equals("transcript")) { parseTranscriptFields(consequenceType, lineFields); // Otherwise just set SO terms } else { consequenceType.setSoTermsFromSoNames(Arrays.asList(lineFields[6].split(","))); // fill so terms } currentAnnotation.getConsequenceTypes().add(consequenceType); return currentAnnotation; } /** * From org.opencb.biodata.formats.annotation.io.VepFormatReader * #parseRemainingFields(org.opencb.biodata.models.variant.annotation.ConsequenceType, java.lang.String[]) */ private void parseTranscriptFields(ConsequenceType consequenceType, String[] lineFields) { consequenceType.setEnsemblGeneId(lineFields[3]); // fill Ensembl gene id consequenceType.setEnsemblTranscriptId(lineFields[4]); // fill Ensembl transcript id if (!lineFields[6].equals("") && !lineFields[6].equals("-")) { // VEP may leave this field empty consequenceType.setSoTermsFromSoNames(Arrays.asList(lineFields[6].split(","))); // fill so terms } if (!lineFields[7].equals("-")) { consequenceType.setcDnaPosition(parseStringInterval(lineFields[7])); // fill cdna position } if (!lineFields[8].equals("-")) { consequenceType.setCdsPosition(parseStringInterval(lineFields[8])); // fill cds position } if (!lineFields[9].equals("-")) { consequenceType.setAaPosition(parseStringInterval(lineFields[9])); // fill aa position } consequenceType.setAaChange(lineFields[10]); // fill aa change consequenceType.setCodon(lineFields[11]); // fill codon change } /** * From org.opencb.biodata.formats.annotation.io.VepFormatReader#parseStringInterval(java.lang.String) */ private Integer parseStringInterval(String stringInterval) { String[] parts = stringInterval.split("-"); if (!parts[0].equals("?")) { return Integer.valueOf(parts[0]); } else if (parts.length > 1 && !parts[1].equals("?")) { return Integer.valueOf(parts[1]); } else { return null; } } /** * From org.opencb.biodata.formats.annotation.io.VepFormatReader#parseVariant(java.lang.String, java.lang.String) */ private Map<String, String> parseVariant(String variantString, String coordinatesString) { // private Map<String,String> parseVariant(String coordinatesString, String alternativeString) { Map<String, String> parsedVariant = new HashMap<>(5); try { String[] variantLocationFields = coordinatesString.split("[:-]"); // parsedVariant.put("chromosome", variantLocationFields[0]); // parsedVariant.put("start", variantLocationFields[1]); parsedVariant.put("end", (variantLocationFields.length > 2) ? variantLocationFields[2] : variantLocationFields[1]); } catch (ArrayIndexOutOfBoundsException e) { logger.error("Unexpected format for column 2: " + coordinatesString); throw e; } try { // Some VEP examples: // 1_718787_-/T 1:718786-718787 T ... // 1_718787_T/- 1:718787 - ... // 1_718788_T/A 1:718788 A ... String[] variantFields = variantString.split("[\\/]"); // String[] variantFields = variantString.split("[\\_\\/]"); String[] leftVariantFields = variantFields[0].split("_"); // Chr id containing _ if (leftVariantFields.length > 3) { parsedVariant.put("chromosome", String.join("_", (String[]) ArrayUtils.subarray(leftVariantFields, 0, leftVariantFields.length - 2))); } else { parsedVariant.put("chromosome", leftVariantFields[0]); } parsedVariant.put("start", leftVariantFields[leftVariantFields.length - 2]); parsedVariant.put("reference", leftVariantFields[leftVariantFields.length - 1]); parsedVariant.put("alternative", variantFields[1]); } catch (ArrayIndexOutOfBoundsException e) { logger.error("Unexpected variant format for column 1: " + variantString); throw e; } return parsedVariant; } /** * From org.opencb.biodata.formats.annotation.io.VepFormatReader * #parseExtraField(org.opencb.biodata.models.variant.annotation.ConsequenceType, java.lang.String, java.lang.Boolean) * * The parseFrequencies option has been removed */ private void parseExtraField(ConsequenceType consequenceType, String extraField, VariantAnnotation currentAnnotation) { for (String field : extraField.split(";")) { String[] keyValue = field.split("="); switch (keyValue[0].toLowerCase()) { case "biotype": consequenceType.setBiotype(keyValue[1]); break; case "hgvsc": currentAnnotation.getHgvs().add(keyValue[1]); break; case "hgvsp": currentAnnotation.getHgvs().add(keyValue[1]); break; case "polyphen": // Format is PolyPhen=possibly_damaging(0.859) consequenceType.addProteinSubstitutionScore(parseProteinSubstitutionScore("Polyphen", keyValue[1])); break; case "sift": // Format is SIFT=tolerated(0.07) consequenceType.addProteinSubstitutionScore(parseProteinSubstitutionScore("Sift", keyValue[1])); break; case "strand": consequenceType.setStrand(keyValue[1].equals("1") ? "+" : "-"); break; case "symbol": consequenceType.setGeneName(keyValue[1]); break; default: // ALLELE_NUM, FREQS, IND, ZYG break; } } } /** * From org.opencb.biodata.formats.annotation.io.VepFormatReader * #parseProteinSubstitutionScore(java.lang.String, java.lang.String) */ private Score parseProteinSubstitutionScore(String predictorName, String scoreString) { String[] scoreFields = scoreString.split("[\\(\\)]"); return new Score(Double.valueOf(scoreFields[1]), predictorName, scoreFields[0]); } }