Java tutorial
/* * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no * obligations to provide maintenance, support, updates, enhancements or * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be * liable to any party for direct, indirect, special, incidental or * consequential damages, including lost profits, arising out of the use of this * software and its documentation, even if Memorial Sloan-Kettering Cancer * Center has been advised of the possibility of such damage. */ /* * This file is part of cBioPortal. * * cBioPortal is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.mskcc.cbio.portal.scripts; import org.mskcc.cbio.portal.dao.*; import org.mskcc.cbio.portal.model.*; import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent; import org.mskcc.cbio.portal.util.*; import org.mskcc.cbio.maf.*; import org.apache.commons.lang.StringUtils; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.*; /** * Import an extended mutation file. * Columns may be in any order. * <p> * @author Ethan Cerami * <br> * @author Arthur Goldberg goldberg@cbio.mskcc.org * <br> * @author Selcuk Onur Sumer */ public class ImportExtendedMutationData { private File mutationFile; private int geneticProfileId; private MutationFilter myMutationFilter; private int entriesSkipped = 0; private int samplesSkipped = 0; private Set<String> sampleSet = new HashSet<String>(); private Set<String> geneSet = new HashSet<String>(); /** * construct an ImportExtendedMutationData. * Filter mutations according to the no argument MutationFilter(). */ public ImportExtendedMutationData(File mutationFile, int geneticProfileId) { this.mutationFile = mutationFile; this.geneticProfileId = geneticProfileId; // create default MutationFilter myMutationFilter = new MutationFilter(); } public void importData() throws IOException, DaoException { MySQLbulkLoader.bulkLoadOn(); HashSet<String> sequencedCaseSet = new HashSet<String>(); Map<MutationEvent, MutationEvent> existingEvents = new HashMap<MutationEvent, MutationEvent>(); for (MutationEvent event : DaoMutation.getAllMutationEvents()) { existingEvents.put(event, event); } Set<MutationEvent> newEvents = new HashSet<MutationEvent>(); Map<ExtendedMutation, ExtendedMutation> mutations = new HashMap<ExtendedMutation, ExtendedMutation>(); long mutationEventId = DaoMutation.getLargestMutationEventId(); FileReader reader = new FileReader(mutationFile); BufferedReader buf = new BufferedReader(reader); DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); // The MAF File Changes fairly frequently, and we cannot use column index constants. String line = buf.readLine(); while (line.startsWith("#")) { line = buf.readLine(); // skip comments/meta info } line = line.trim(); MafUtil mafUtil = new MafUtil(line); boolean fileHasOMAData = false; if (mafUtil.getMaFImpactIndex() >= 0) { // fail gracefully if a non-essential column is missing // e.g. if there is no MA_link.var column, we assume that the value is NA and insert it as such fileHasOMAData = true; ProgressMonitor.setCurrentMessage(" --> OMA Scores Column Number: " + mafUtil.getMaFImpactIndex()); } else { fileHasOMAData = false; } GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); if (!line.startsWith("#") && line.trim().length() > 0) { String[] parts = line.split("\t", -1); // the -1 keeps trailing empty strings; see JavaDoc for String MafRecord record = mafUtil.parseRecord(line); // process case id String barCode = record.getTumorSampleID(); // backwards compatible part (i.e. in the new process, the sample should already be there. TODO - replace this workaround later with an exception: Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(barCode)); if (sample == null) { ImportDataUtil.addPatients(new String[] { barCode }, geneticProfileId); // add the sample (except if it is a 'normal' sample): ImportDataUtil.addSamples(new String[] { barCode }, geneticProfileId); } // check again (repeated because of workaround above): sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(barCode)); // can be null in case of 'normal' sample: if (sample == null) { assert StableIdUtil.isNormal(barCode); //if new sample: if (sampleSet.add(barCode)) samplesSkipped++; continue; } if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId); } String validationStatus = record.getValidationStatus(); if (validationStatus == null || validationStatus.equalsIgnoreCase("Wildtype")) { ProgressMonitor.logWarning("Skipping entry with Validation_Status: Wildtype"); entriesSkipped++; continue; } String chr = DaoGeneOptimized.normalizeChr(record.getChr().toUpperCase()); if (chr == null) { ProgressMonitor.logWarning("Skipping entry with chromosome value: " + record.getChr()); entriesSkipped++; continue; } record.setChr(chr); if (record.getStartPosition() < 0) record.setStartPosition(0); if (record.getEndPosition() < 0) record.setEndPosition(0); String functionalImpactScore = ""; // using -1 is not safe, FIS can be a negative value Float fisValue = Float.MIN_VALUE; String linkXVar = ""; String linkMsa = ""; String linkPdb = ""; if (fileHasOMAData) { // functionalImpactScore = getField(parts, "MA:FImpact" ); // fisValue = getField(parts, "MA:FIS"); // linkXVar = getField(parts, "MA:link.var" ); // linkMsa = getField(parts, "MA:link.MSA" ); // linkPdb = getField(parts, "MA:link.PDB" ); functionalImpactScore = record.getMaFuncImpact(); fisValue = record.getMaFIS(); linkXVar = record.getMaLinkVar(); linkMsa = record.getMaLinkMsa(); linkPdb = record.getMaLinkPdb(); functionalImpactScore = transformOMAScore(functionalImpactScore); linkXVar = linkXVar.replace("\"", ""); } String mutationType, proteinChange, aaChange, codonChange, refseqMrnaId, uniprotName, uniprotAccession; int proteinPosStart, proteinPosEnd; // determine whether to use canonical or best effect transcript // try canonical first if (ExtendedMutationUtil.isAcceptableMutation(record.getVariantClassification())) { mutationType = record.getVariantClassification(); } // if not acceptable either, use the default value else { mutationType = ExtendedMutationUtil.getMutationType(record); } // skip RNA mutations if (mutationType != null && mutationType.equalsIgnoreCase("rna")) { ProgressMonitor.logWarning("Skipping entry with mutation type: RNA"); entriesSkipped++; continue; } proteinChange = ExtendedMutationUtil.getProteinChange(parts, record); //proteinChange = record.getProteinChange(); aaChange = record.getAminoAcidChange(); codonChange = record.getCodons(); refseqMrnaId = record.getRefSeq(); uniprotName = record.getSwissprot(); uniprotAccession = DaoUniProtIdMapping.mapFromUniprotIdToAccession(record.getSwissprot()); proteinPosStart = ExtendedMutationUtil.getProteinPosStart(record.getProteinPosition(), proteinChange); proteinPosEnd = ExtendedMutationUtil.getProteinPosEnd(record.getProteinPosition(), proteinChange); // Assume we are dealing with Entrez Gene Ids (this is the best / most stable option) String geneSymbol = record.getHugoGeneSymbol(); String entrezIdString = record.getGivenEntrezGeneId(); CanonicalGene gene = null; // try to parse entrez if it is not empty nor 0: if (!(entrezIdString.isEmpty() || entrezIdString.equals("0"))) { Long entrezGeneId; try { entrezGeneId = Long.parseLong(entrezIdString); } catch (NumberFormatException e) { entrezGeneId = null; } //non numeric values or negative values should not be allowed: if (entrezGeneId == null || entrezGeneId < 0) { ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrezIdString); entriesSkipped++; continue; } else { gene = daoGene.getGene(entrezGeneId); if (gene == null) { //skip if not in DB: ProgressMonitor.logWarning( "Entrez gene ID " + entrezGeneId + " not found. Record will be skipped."); entriesSkipped++; continue; } } } // If Entrez Gene ID Fails, try Symbol. if (gene == null && !(geneSymbol.equals("") || geneSymbol.equals("Unknown"))) { gene = daoGene.getNonAmbiguousGene(geneSymbol, chr); } // assume symbol=Unknown and entrez=0 (or missing Entrez column) to imply an // intergenic, irrespective of what the column Variant_Classification says if (geneSymbol.equals("Unknown") && (entrezIdString.equals("0") || mafUtil.getEntrezGeneIdIndex() == -1)) { // give extra warning if mutationType is something different from IGR: if (mutationType != null && !mutationType.equalsIgnoreCase("IGR")) { ProgressMonitor.logWarning("Treating mutation with gene symbol 'Unknown' " + (mafUtil.getEntrezGeneIdIndex() == -1 ? "" : "and Entrez gene ID 0") + " as intergenic ('IGR') " + "instead of '" + mutationType + "'. Entry filtered/skipped."); } // treat as IGR: myMutationFilter.decisions++; myMutationFilter.igrRejects++; // skip entry: entriesSkipped++; continue; } // skip the record if a gene was expected but not identified if (gene == null) { ProgressMonitor.logWarning("Ambiguous or missing gene: " + geneSymbol + " [" + record.getGivenEntrezGeneId() + "] or ambiguous alias. Ignoring it " + "and all mutation data associated with it!"); entriesSkipped++; continue; } else { ExtendedMutation mutation = new ExtendedMutation(); mutation.setGeneticProfileId(geneticProfileId); mutation.setSampleId(sample.getInternalId()); mutation.setGene(gene); mutation.setSequencingCenter(record.getCenter()); mutation.setSequencer(record.getSequencer()); mutation.setProteinChange(proteinChange); mutation.setAminoAcidChange(aaChange); mutation.setMutationType(mutationType); mutation.setChr(record.getChr()); mutation.setStartPosition(record.getStartPosition()); mutation.setEndPosition(record.getEndPosition()); mutation.setValidationStatus(record.getValidationStatus()); mutation.setMutationStatus(record.getMutationStatus()); mutation.setFunctionalImpactScore(functionalImpactScore); mutation.setFisValue(fisValue); mutation.setLinkXVar(linkXVar); mutation.setLinkPdb(linkPdb); mutation.setLinkMsa(linkMsa); mutation.setNcbiBuild(record.getNcbiBuild()); mutation.setStrand(record.getStrand()); mutation.setVariantType(record.getVariantType()); mutation.setAllele(record.getTumorSeqAllele1(), record.getTumorSeqAllele2(), record.getReferenceAllele()); mutation.setDbSnpRs(record.getDbSNP_RS()); mutation.setDbSnpValStatus(record.getDbSnpValStatus()); mutation.setMatchedNormSampleBarcode(record.getMatchedNormSampleBarcode()); mutation.setMatchNormSeqAllele1(record.getMatchNormSeqAllele1()); mutation.setMatchNormSeqAllele2(record.getMatchNormSeqAllele2()); mutation.setTumorValidationAllele1(record.getTumorValidationAllele1()); mutation.setTumorValidationAllele2(record.getTumorValidationAllele2()); mutation.setMatchNormValidationAllele1(record.getMatchNormValidationAllele1()); mutation.setMatchNormValidationAllele2(record.getMatchNormValidationAllele2()); mutation.setVerificationStatus(record.getVerificationStatus()); mutation.setSequencingPhase(record.getSequencingPhase()); mutation.setSequenceSource(record.getSequenceSource()); mutation.setValidationMethod(record.getValidationMethod()); mutation.setScore(record.getScore()); mutation.setBamFile(record.getBamFile()); mutation.setTumorAltCount(ExtendedMutationUtil.getTumorAltCount(record)); mutation.setTumorRefCount(ExtendedMutationUtil.getTumorRefCount(record)); mutation.setNormalAltCount(ExtendedMutationUtil.getNormalAltCount(record)); mutation.setNormalRefCount(ExtendedMutationUtil.getNormalRefCount(record)); // TODO rename the oncotator column names (remove "oncotator") mutation.setOncotatorCodonChange(codonChange); mutation.setOncotatorRefseqMrnaId(refseqMrnaId); mutation.setOncotatorUniprotName(uniprotName); mutation.setOncotatorUniprotAccession(uniprotAccession); mutation.setOncotatorProteinPosStart(proteinPosStart); mutation.setOncotatorProteinPosEnd(proteinPosEnd); // TODO we don't use this info right now... mutation.setCanonicalTranscript(true); sequencedCaseSet.add(sample.getStableId()); // Filter out Mutations if (myMutationFilter.acceptMutation(mutation)) { MutationEvent event = existingEvents.get(mutation.getEvent()); if (event != null) { mutation.setEvent(event); } else { mutation.setMutationEventId(++mutationEventId); existingEvents.put(mutation.getEvent(), mutation.getEvent()); newEvents.add(mutation.getEvent()); } ExtendedMutation exist = mutations.get(mutation); if (exist != null) { ExtendedMutation merged = mergeMutationData(exist, mutation); mutations.put(merged, merged); } else { mutations.put(mutation, mutation); } //keep track: sampleSet.add(barCode); geneSet.add(mutation.getEntrezGeneId() + ""); } else { entriesSkipped++; } } } } for (MutationEvent event : newEvents) { try { DaoMutation.addMutationEvent(event); } catch (DaoException ex) { throw ex; } } for (ExtendedMutation mutation : mutations.values()) { try { DaoMutation.addMutation(mutation, false); } catch (DaoException ex) { throw ex; } } if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } // calculate mutation count for every sample DaoMutation.calculateMutationCount(geneticProfileId); if (entriesSkipped > 0) { ProgressMonitor.setCurrentMessage( " --> total number of data entries skipped (see table below): " + entriesSkipped); } ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleSet.size()); if (samplesSkipped > 0) { ProgressMonitor .setCurrentMessage(" --> total number of samples skipped (normal samples): " + samplesSkipped); } ProgressMonitor.setCurrentMessage( " --> total number of genes for which one or more mutation events were stored: " + geneSet.size()); ProgressMonitor.setCurrentMessage("Filtering table:\n-----------------"); ProgressMonitor.setCurrentMessage(myMutationFilter.getStatistics()); } /** * merge the current mutation * @return */ private ExtendedMutation mergeMutationData(ExtendedMutation mut1, ExtendedMutation mut2) { ExtendedMutation ret = mut1; if (!mut1.getMatchedNormSampleBarcode().equalsIgnoreCase(mut2.getMatchedNormSampleBarcode())) { if (mut2.getMatchedNormSampleBarcode().matches("TCGA-..-....-10.*")) { // select blood normal if available ret = mut2; } } else if (!mut1.getValidationStatus().equalsIgnoreCase(mut2.getValidationStatus())) { if (mut2.getValidationStatus().equalsIgnoreCase("Valid") || mut2.getValidationStatus().equalsIgnoreCase("VALIDATED")) { // select validated mutations ret = mut2; } } else if (!mut1.getMutationStatus().equalsIgnoreCase(mut2.getMutationStatus())) { if (mut2.getMutationStatus().equalsIgnoreCase("Germline")) { // select germline over somatic ret = mut2; } else if (mut2.getMutationStatus().equalsIgnoreCase("SOMATIC")) { if (!mut1.getMutationStatus().equalsIgnoreCase("Germline")) { // select somatic over others ret = mut2; } } } // merge centers Set<String> centers = new TreeSet<String>(Arrays.asList(mut1.getSequencingCenter().split(";"))); if (centers.addAll(Arrays.asList(mut2.getSequencingCenter().split(";")))) { if (centers.size() > 1) { centers.remove("NA"); } ret.setSequencingCenter(StringUtils.join(centers, ";")); } return ret; } private String transformOMAScore(String omaScore) { if (omaScore == null || omaScore.length() == 0) { return omaScore; } if (omaScore.equalsIgnoreCase("H") || omaScore.equalsIgnoreCase("high")) { return "H"; } else if (omaScore.equalsIgnoreCase("M") || omaScore.equalsIgnoreCase("medium")) { return "M"; } else if (omaScore.equalsIgnoreCase("L") || omaScore.equalsIgnoreCase("low")) { return "L"; } else if (omaScore.equalsIgnoreCase("N") || omaScore.equalsIgnoreCase("neutral")) { return "N"; } else if (omaScore.equalsIgnoreCase("[sent]")) { return ExtendedMutationUtil.NOT_AVAILABLE; // TODO temp workaround for invalid sent values } else { return omaScore; } } }