Java tutorial
/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.amt; import java.util.*; import java.util.List; import java.io.IOException; import java.io.File; import org.apache.log4j.Logger; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.BaseFeatureSetMatcherImpl; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.ClusteringFeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.Window2DFeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef; import org.fhcrc.cpl.viewer.ms2.Fractionation2DUtilities; import org.fhcrc.cpl.toolbox.proteomics.ProteomicsRegressionUtilities; import org.fhcrc.cpl.toolbox.*; import org.fhcrc.cpl.toolbox.statistics.BasicStatistics; import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities; import org.fhcrc.cpl.toolbox.filehandler.TempFileManager; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.gui.chart.*; import org.fhcrc.cpl.toolbox.proteomics.MS2Modification; import org.fhcrc.cpl.toolbox.proteomics.ModifiedAminoAcid; import org.jfree.chart.JFreeChart; import javax.swing.*; /** * This class performs simple matching between an AMT database and * a feature files. It constructs a probability model for the matches * based on mass and NRT deviation. */ public class AmtDatabaseMatcher { static Logger _log = Logger.getLogger(AmtDatabaseMatcher.class); //These defaults are for robust regression //default value for the cutoff numerator in the leverage cutoff public static final double DEFAULT_LEVERAGE_NUMERATOR = 6; public static final double DEFAULT_MAX_LEVERAGE_NUMERATOR = 12; public static final double DEFAULT_MAX_STUDENTIZED_RESIDUAL = 3.0; public static final float DEFAULT_MASS_MATCH_DELTA_MASS = 5; public static final int DEFAULT_MASS_MATCH_DELTA_MASS_TYPE = FeatureSetMatcher.DELTA_MASS_TYPE_PPM; public static final float DEFAULT_2D_MATCH_DELTA_MASS = 10; public static final int DEFAULT_2D_MATCH_DELTA_MASS_TYPE = FeatureSetMatcher.DELTA_MASS_TYPE_PPM; public static final float DEFAULT_2D_MATCH_DELTA_ELUTION = 0.15f; //minimum number of matches required to perform a regression. This is probably not conservative enough public static final int MIN_MATCHED_FEATURES_FOR_REGRESSION = 8; //minimum number of feature pairs that must be passed to Yan's //quantile regression code to get it to work. public static final int DEFAULT_QUANTILE_REG_MIN_FEATURES = 85; protected int quantileRegressionMinFeatures = DEFAULT_QUANTILE_REG_MIN_FEATURES; //default degree of the polynomial to fit when mapping T to H nonlinearly public static final int DEFAULT_NONLINEAR_MAPPING_DEGREE = 5; protected int nonlinearMappingPolynomialDegree = DEFAULT_NONLINEAR_MAPPING_DEGREE; //what's a 'significant' difference in hydrophobicity, in terms of figuring //out whether a single-observation peptide should be used for matching? feat public static final float DEFAULT_SIGNIFICANT_HYDRO_DIFFERENCE = .4f; //These variables are for the mass-only match that we use in order to come up //with (ms1,amt) pairs for alignment if we have no embedded MS2 protected float massMatchDeltaMass = DEFAULT_MASS_MATCH_DELTA_MASS; protected int massMatchDeltaMassType = DEFAULT_MASS_MATCH_DELTA_MASS_TYPE; //These parameters are used in the actual ('loose') match protected float realMatchDeltaMass = DEFAULT_2D_MATCH_DELTA_MASS; protected int realMatchDeltaMassType = DEFAULT_2D_MATCH_DELTA_MASS_TYPE; protected float realMatchDeltaElution = DEFAULT_2D_MATCH_DELTA_ELUTION; //Controls the number of iterations for EM model protected int minEMIterations = AmtMatchProbabilityAssigner.DEFAULT_MIN_EM_ITERATIONS; protected int maxEMIterations = AmtMatchProbabilityAssigner.DEFAULT_MAX_EM_ITERATIONS; protected int maxRProbAssignmentMillis = AmtMatchProbabilityAssigner.DEFAULT_MAX_EM_ITERATIONS; //Should we use MS1 times for alignment? This requires mass-and-time matching between MS1 and MS2 public static final boolean DEFAULT_USE_MS1_TIMES_FOR_ALIGNMENT = true; protected boolean useMs1TimesForAlignment = DEFAULT_USE_MS1_TIMES_FOR_ALIGNMENT; //tolerances for matching MS1 features to MS2 for alignment protected float ms1Ms2MassTolerancePPM = AmtDatabaseBuilder.DEFAULT_MS1_MS2_MASS_TOLERANCE_PPM; protected float ms1Ms2TimeToleranceSeconds = AmtDatabaseBuilder.DEFAULT_MS1_MS2_TIME_TOLERANCE_SECONDS; //if this is true, we do a dummy match instead of a real match protected boolean doDecoyMatch = false; //Parameters for robust regression protected double maxRegressionLeverageNumerator = DEFAULT_MAX_LEVERAGE_NUMERATOR; protected double maxRegressionStudRes = DEFAULT_MAX_STUDENTIZED_RESIDUAL; protected float minMatchProbabilityToKeep = AmtMatchProbabilityAssigner.DEFAULT_MIN_MATCH_PROBABILITY; protected float maxMatchFDRToKeep = AmtMatchProbabilityAssigner.DEFAULT_MAX_MATCH_FDR; //Hard maximum on second-best probability protected float maxSecondBestProbability = AmtMatchProbabilityAssigner.DEFAULT_MAX_SECONDBEST_PROBABILITY; //Minimum difference between second-best probability and best probability protected float minSecondBestProbabilityDifference = AmtMatchProbabilityAssigner.DEFAULT_MIN_SECONDBEST_PROBABILITY_DIFFERENCE; //For creation of decoy database, used in determining probabilities public static final int DEFAULT_DECOY_DB_MASS_ADJUSTMENT_DA = 11; //Persisting the calculated mapping coefficients so they can be accessed elsewhere public double[] timeHydMapCoefficients = null; //control whether to display useful charts when matching to databases protected boolean buildCharts = false; //Stores the result of matching public FeatureSetMatcher.FeatureMatchingResult featureMatchingResult = null; //Persist charts related to matching, for later retrieval protected JFreeChart massMatchScatterplot = null; protected JFreeChart timeHydrophobicityMappingChart = null; protected JFreeChart massCalibrationChart = null; protected PanelWithRPerspectivePlot massTimeErrorPerspectivePlot = null; public JFreeChart massDeltaMassScatterPlot = null; //amount by which to adjust AMT feature masses. For dummy matching protected double amtFeatureMassAdjustment = 0; //Stores information about the AMT database's structure. For matching //fractionated data to fractionated data protected boolean amtDBDimensionsDefined = false; protected Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure; protected AmtMatchProbabilityAssigner probabilityAssigner = null; public void matchWithFDRCalc(AmtDatabase amtDatabaseThisMatch, Feature[] amtDBFeaturesThisMatch, FeatureSet ms1FeatureSetToMatch, FeatureSet embeddedMs2FeatureSet, float minEmbeddedMs2PeptideProphet, MS2Modification[] ms2ModificationsArray, File matchingOutputFile, boolean removeFractions, int minRunsToKeep, int maxRunsToKeep, boolean calibrateMassesUsingMatches, boolean showCharts) throws IOException { float portionDecoy = .5f; int numTotalDBEntries = amtDatabaseThisMatch.numEntries(); int numDecoy = Math.round(portionDecoy * numTotalDBEntries); AmtPeptideEntry[] allEntries = amtDatabaseThisMatch.getEntries(); Set<String> decoyPeptides1 = new HashSet<String>(); //select decoy peptides while (decoyPeptides1.size() < numDecoy) { int entryIndex = (int) Math.round(Math.random() * (numTotalDBEntries - 1)); AmtPeptideEntry entry = allEntries[entryIndex]; String peptide = entry.getPeptideSequence(); if (!decoyPeptides1.contains(peptide)) { decoyPeptides1.add(peptide); } } Set<String> decoyPeptides2 = new HashSet<String>(); for (AmtPeptideEntry peptideEntry : allEntries) { if (!decoyPeptides1.contains(peptideEntry.getPeptideSequence())) decoyPeptides2.add(peptideEntry.getPeptideSequence()); } Map<Feature, Float> firstHalfMatchesWithFDRs = matchWithPortionDecoy(amtDatabaseThisMatch, amtDBFeaturesThisMatch, ms1FeatureSetToMatch, embeddedMs2FeatureSet, minEmbeddedMs2PeptideProphet, ms2ModificationsArray, matchingOutputFile, removeFractions, minRunsToKeep, maxRunsToKeep, calibrateMassesUsingMatches, showCharts, decoyPeptides1, 1f); } /** * Change the masses of a random portion (portionDecoy) of the AMT * database entries to make them decoy entries, perform the match, * and report on the specificity of the match * @param amtDatabaseThisMatch * @param amtDBFeaturesThisMatch * @param ms1FeatureSetToMatch * @param embeddedMs2FeatureSet * @param minEmbeddedMs2PeptideProphet * @param ms2ModificationsArray * @param matchingOutputFile * @param removeFractions * @param minRunsToKeep * @param maxRunsToKeep * @param calibrateMassesUsingMatches * @param showCharts * @param decoyPeptides */ public Map<Feature, Float> matchWithPortionDecoy(AmtDatabase amtDatabaseThisMatch, Feature[] amtDBFeaturesThisMatch, FeatureSet ms1FeatureSetToMatch, FeatureSet embeddedMs2FeatureSet, float minEmbeddedMs2PeptideProphet, MS2Modification[] ms2ModificationsArray, File matchingOutputFile, boolean removeFractions, int minRunsToKeep, int maxRunsToKeep, boolean calibrateMassesUsingMatches, boolean showCharts, Set<String> decoyPeptides, float targetDecoyRatio) throws IOException { List<Feature> targetDecoyDBFeatures = new ArrayList<Feature>(); for (Feature feature : amtDBFeaturesThisMatch) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (decoyPeptides.contains(peptide)) { Feature decoyFeature = new Feature(feature); decoyFeature.setMass(feature.getMass() + DEFAULT_DECOY_DB_MASS_ADJUSTMENT_DA); targetDecoyDBFeatures.add(decoyFeature); } else targetDecoyDBFeatures.add(feature); } List<Feature> matchedMS1Features = matchAgainstMs1(amtDatabaseThisMatch, targetDecoyDBFeatures.toArray(new Feature[targetDecoyDBFeatures.size()]), ms1FeatureSetToMatch, embeddedMs2FeatureSet, minEmbeddedMs2PeptideProphet, ms2ModificationsArray, matchingOutputFile, removeFractions, minRunsToKeep, maxRunsToKeep, calibrateMassesUsingMatches, showCharts); Collections.sort(matchedMS1Features, new PeptideProphetComparatorDesc()); int numTargetMatches = 0; int numDecoyMatches = 0; float[] fdrs = new float[matchedMS1Features.size()]; float[] probabilities = new float[matchedMS1Features.size()]; float[] sums1MinusProbs = new float[matchedMS1Features.size()]; float[] sumsProbs = new float[matchedMS1Features.size()]; float[] theoreticalFDRs = new float[matchedMS1Features.size()]; Map<Feature, Float> result = new HashMap<Feature, Float>(); float sum1MinusProbs = 0f; float sumProbs = 0f; for (int i = 0; i < matchedMS1Features.size(); i++) { Feature matchedMS1Feature = matchedMS1Features.get(i); boolean decoyMatch = decoyPeptides.contains(MS2ExtraInfoDef.getFirstPeptide(matchedMS1Feature)); if (decoyMatch) numDecoyMatches++; else numTargetMatches++; probabilities[i] = (float) MS2ExtraInfoDef.getPeptideProphet(matchedMS1Feature); float fdr = 1f; if (numTargetMatches > 0) fdr = targetDecoyRatio * numDecoyMatches / numTargetMatches; fdrs[i] = fdr; sum1MinusProbs += 1 - probabilities[i]; sums1MinusProbs[i] = sum1MinusProbs; sumProbs += probabilities[i]; sumsProbs[i] = sumProbs; theoreticalFDRs[i] = (sum1MinusProbs / (i + 1)); if (!decoyMatch) result.put(matchedMS1Feature, fdr); } if (showCharts) { PanelWithLineChart pwlc = new PanelWithLineChart(probabilities, fdrs, "Prob vs FDR"); pwlc.setAxisLabels("Match Probability", "FDR (#decoy/#target)"); pwlc.displayInTab(); PanelWithLineChart pwlc2 = new PanelWithLineChart(theoreticalFDRs, fdrs, "Theoretical FDR vs FDR"); pwlc2.setAxisLabels("Theoretical FDR: sum(1-p)/sum(1)", "Actual FDR (#decoy/#target)"); pwlc2.displayInTab(); } return result; } public static class PeptideProphetComparatorDesc implements Comparator<Feature> { public int compare(Feature o1, Feature o2) { double o1Value = MS2ExtraInfoDef.getPeptideProphet(o1); double o2Value = MS2ExtraInfoDef.getPeptideProphet(o2); if (o1Value == o2Value) return 0; return o1Value < o2Value ? 1 : -1; } } /** * For each peptide, per modification state, remove all but one observation. That observation * gets the median retention time of all of them * @param featureSet * @return */ public static void representPeptidesWithMedianTimePerPeptidePerMod(FeatureSet featureSet) { //map peptides to features HashMap<String, List<Feature>> peptideFeatureListMap = new HashMap<String, List<Feature>>(); Feature[] features = featureSet.getFeatures(); int initialNumFeatures = features.length; for (Feature feature : features) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptide == null) continue; List<Feature> thisPeptideFeatureList = peptideFeatureListMap.get(peptide); if (thisPeptideFeatureList == null) { thisPeptideFeatureList = new ArrayList<Feature>(); peptideFeatureListMap.put(peptide, thisPeptideFeatureList); } thisPeptideFeatureList.add(feature); } List<Feature> resultFeatureList = new ArrayList<Feature>(); //for debugging int sumNumModStatesPerPeptide = 0; for (String peptide : peptideFeatureListMap.keySet()) { List<Feature> featuresThisPeptide = peptideFeatureListMap.get(peptide); //only exact mod state matches fold together Map<String, List<Feature>> modStateFeatureListMap = new HashMap<String, List<Feature>>(); for (Feature featureThisPeptide : featuresThisPeptide) { MS2ExtraInfoDef.getModifiedAminoAcids(featureThisPeptide); Map<Integer, List<ModifiedAminoAcid>> positionModListMap = (Map<Integer, List<ModifiedAminoAcid>>) featureThisPeptide .getProperty("modifiedaminoacids"); //special handling for no modifications String modStateString = ""; if (positionModListMap != null && positionModListMap.size() > 0) modStateString = MS2ExtraInfoDef.getSingletonInstance().convertToString("modifiedaminoacids", positionModListMap); List<Feature> featuresThisModState = modStateFeatureListMap.get(modStateString); if (featuresThisModState == null) { featuresThisModState = new ArrayList<Feature>(); modStateFeatureListMap.put(modStateString, featuresThisModState); } featuresThisModState.add(featureThisPeptide); } if (_log.isDebugEnabled()) sumNumModStatesPerPeptide += modStateFeatureListMap.size(); //add the first for each mod state for (String modStateString : modStateFeatureListMap.keySet()) { List<Feature> featuresThisModState = modStateFeatureListMap.get(modStateString); List<Float> retentionTimesThisModState = new ArrayList<Float>(); for (Feature feature : featuresThisModState) retentionTimesThisModState.add(feature.getTime()); float medianTime = (float) BasicStatistics.median(retentionTimesThisModState); int lastScan = 0; Feature firstFeature = null; for (Feature feature : featuresThisModState) { if (feature.getScan() > lastScan) lastScan = feature.getScan(); if (firstFeature == null || feature.getScan() < firstFeature.getScan()) firstFeature = feature; } firstFeature.setScanLast(lastScan); firstFeature.setTime(medianTime); resultFeatureList.add(firstFeature); } } //sort all the earliest features for each peptide & set them as the feature array //for this featureset Collections.sort(resultFeatureList, new Feature.MzScanAscComparator()); _log.debug("representPeptidesWithMedianTimePerPeptidePerMod, initial features: " + initialNumFeatures + ", resulting features: " + resultFeatureList.size()); featureSet.setFeatures(resultFeatureList.toArray(new Feature[0])); } /** * Match an AMT database against a single MS1 feature file. This is the master matching method * * @return a list of matched MS1 features */ public List<Feature> matchAgainstMs1(AmtDatabase amtDatabaseThisMatch, Feature[] amtDBFeaturesThisMatch, FeatureSet ms1FeatureSetToMatch, FeatureSet embeddedMs2FeatureSet, float minEmbeddedMs2PeptideProphet, MS2Modification[] ms2ModificationsArray, File matchingOutputFile, boolean removeFractions, int minRunsToKeep, int maxRunsToKeep, boolean calibrateMassesUsingMatches, boolean showCharts) throws IOException { boolean recordDBRunsInFeatureFile = false; ApplicationContext.setMessage("Matching against file " + ms1FeatureSetToMatch.getSourceFile().getName()); Feature[] guideFeaturesForAlignment = null; if (embeddedMs2FeatureSet != null) { FeatureSet.FeatureSelector sel = new FeatureSet.FeatureSelector(); sel.setMinPProphet(minEmbeddedMs2PeptideProphet); _log.debug("Embedded MS2: before filter, " + embeddedMs2FeatureSet.getFeatures().length + " features"); embeddedMs2FeatureSet = embeddedMs2FeatureSet.filter(sel); _log.debug("Embedded MS2: after filter, " + embeddedMs2FeatureSet.getFeatures().length + " features"); if (embeddedMs2FeatureSet.getFeatures().length < MIN_MATCHED_FEATURES_FOR_REGRESSION) throw new IllegalArgumentException( "ERROR: after filter, too few MS2 features (" + embeddedMs2FeatureSet.getFeatures().length + ") for alignment. You can try again with a " + "less restrictive cutoff."); boolean nonzeroMs2TimesExist = false; for (Feature feature : embeddedMs2FeatureSet.getFeatures()) { if (feature.getTime() > 0) { nonzeroMs2TimesExist = true; break; } } if (!nonzeroMs2TimesExist) { throw new IllegalArgumentException("ERROR! MS2 feature retention times are all zero! " + "Please populate feature " + "times, using the 'populatems2times' command, or provide associated mzXML file(s) to " + "use for populating scan times."); } if (useMs1TimesForAlignment) { representPeptidesWithMedianTimePerPeptidePerMod(embeddedMs2FeatureSet); Window2DFeatureSetMatcher featureSetMatcher = new Window2DFeatureSetMatcher(); featureSetMatcher.setMassDiffType(FeatureSetMatcher.DELTA_MASS_TYPE_PPM); featureSetMatcher.setMaxMassDiff(ms1Ms2MassTolerancePPM); featureSetMatcher.setMinMassDiff(-ms1Ms2MassTolerancePPM); featureSetMatcher.setMaxElutionDiff(ms1Ms2TimeToleranceSeconds); featureSetMatcher.setMinElutionDiff(-ms1Ms2TimeToleranceSeconds); featureSetMatcher.setElutionMode(BaseFeatureSetMatcherImpl.ELUTION_MODE_TIME); FeatureSetMatcher.FeatureMatchingResult ms1MS2MatchingResult = featureSetMatcher .matchFeatures(ms1FeatureSetToMatch, embeddedMs2FeatureSet); List<Feature> singlyMatchedMS1FeatureCopies = new ArrayList<Feature>(); for (Feature feature : ms1MS2MatchingResult.getMasterSetFeatures()) { List<Feature> ms2MatchedFeatures = ms1MS2MatchingResult.getSlaveSetFeatures(feature); Set<String> peptides = new HashSet<String>(); for (Feature ms2MatchedFeature : ms2MatchedFeatures) peptides.add(MS2ExtraInfoDef.getFirstPeptide(ms2MatchedFeature)); if (peptides.size() == 1) { Feature featureCopy = new Feature(feature); MS2ExtraInfoDef.setSinglePeptide(featureCopy, peptides.iterator().next()); singlyMatchedMS1FeatureCopies.add(featureCopy); } } guideFeaturesForAlignment = singlyMatchedMS1FeatureCopies .toArray(new Feature[singlyMatchedMS1FeatureCopies.size()]); ApplicationContext.infoMessage("Using MS1 features for alignment, matching to MS2. " + guideFeaturesForAlignment.length + " out of " + ms1FeatureSetToMatch.getFeatures().length + " MS1 features used for alignment"); } else { _log.debug("Using MS2 features for alignment"); MS2ExtraInfoDef.removeAllButFirstFeatureForEachPeptide(embeddedMs2FeatureSet); guideFeaturesForAlignment = embeddedMs2FeatureSet.getFeatures(); } _log.debug("Guide features for alignment: " + guideFeaturesForAlignment.length); } FeatureSet amtDatabaseFeatureSet = new FeatureSet(amtDBFeaturesThisMatch); calculateFeatureHydrophobicities(ms1FeatureSetToMatch.getFeatures(), guideFeaturesForAlignment, amtDatabaseFeatureSet, nonlinearMappingPolynomialDegree); if (showCharts) { JFreeChart nonlinearChart = getTimeHydrophobicityMappingChart(); PanelWithChart pwc = new PanelWithChart(nonlinearChart); pwc.setName("T->H Map"); pwc.displayInTab(); } //mess with the masses of all AMT database entries if (doDecoyMatch) { FeatureSet dummyDBFeatureSet = new FeatureSet(); List<Feature> dummyDBFeatures = new ArrayList<Feature>(); for (Feature oldDBFeature : amtDatabaseFeatureSet.getFeatures()) { Feature newFeature = new Feature(oldDBFeature); newFeature.setMass(newFeature.getMass() + DEFAULT_DECOY_DB_MASS_ADJUSTMENT_DA); dummyDBFeatures.add(newFeature); } dummyDBFeatureSet.setFeatures(dummyDBFeatures.toArray(new Feature[0])); amtDatabaseFeatureSet = dummyDBFeatureSet; } ApplicationContext .infoMessage("Loose matching with tolerances " + realMatchDeltaMass + ", " + realMatchDeltaElution); FeatureSetMatcher.FeatureMatchingResult looseMatchingResult = callWindowMatcher(ms1FeatureSetToMatch, amtDatabaseFeatureSet, -realMatchDeltaMass, realMatchDeltaMass, -realMatchDeltaElution, realMatchDeltaElution); //annotateFractionConcordanceForMatches(amtDatabaseThisMatch, embeddedMs2Features, looseMatchingResult); if (removeFractions) { amtDatabaseThisMatch = reduceDatabaseByRunSimilarity(amtDatabaseThisMatch, guideFeaturesForAlignment, looseMatchingResult, minRunsToKeep, maxRunsToKeep, showCharts); AmtDatabaseFeatureSetGenerator featureGenerator = new AmtDatabaseFeatureSetGenerator( amtDatabaseThisMatch); amtDBFeaturesThisMatch = featureGenerator.createFeaturesForModifications(ms2ModificationsArray); amtDatabaseFeatureSet = new FeatureSet(amtDBFeaturesThisMatch); calculateFeatureHydrophobicities(ms1FeatureSetToMatch.getFeatures(), guideFeaturesForAlignment, amtDatabaseFeatureSet, nonlinearMappingPolynomialDegree); if (showCharts) { JFreeChart nonlinearChart = getTimeHydrophobicityMappingChart(); PanelWithChart pwc = new PanelWithChart(nonlinearChart); pwc.setName("T->H Map after fraction removal"); pwc.displayInTab(); } looseMatchingResult = callWindowMatcher(ms1FeatureSetToMatch, amtDatabaseFeatureSet, -realMatchDeltaMass, realMatchDeltaMass, -realMatchDeltaElution, realMatchDeltaElution); } if (calibrateMassesUsingMatches) { ApplicationContext.setMessage( "Loose match before calibrate: " + looseMatchingResult.size() + " matches. Calibrating..."); calibrateMS1FeaturesWithMatches(ms1FeatureSetToMatch.getFeatures(), looseMatchingResult, showCharts); looseMatchingResult = callWindowMatcher(ms1FeatureSetToMatch, amtDatabaseFeatureSet, -realMatchDeltaMass, realMatchDeltaMass, -realMatchDeltaElution, realMatchDeltaElution); } if (showCharts) { List<Float> massErrors = new ArrayList<Float>(); List<Float> hErrors = new ArrayList<Float>(); for (Feature masterSetFeature : looseMatchingResult.getMasterSetFeatures()) { List<Feature> matchesThisFeature = looseMatchingResult.get(masterSetFeature); for (Feature matchedFeature : matchesThisFeature) { double deltaMass = (masterSetFeature.getMass() - matchedFeature.getMass()) * (1000000 / masterSetFeature.getMass()); double deltaH = (AmtExtraInfoDef.getObservedHydrophobicity(masterSetFeature) - (AmtExtraInfoDef.getObservedHydrophobicity(matchedFeature))); massErrors.add((float) deltaMass); hErrors.add((float) deltaH); } } PanelWithScatterPlot pspError = new PanelWithScatterPlot(hErrors, massErrors, "Loose match error data"); pspError.setAxisLabels("deltaNRT (NRT units)", "deltaMass (ppm)"); pspError.setPointSize(2); pspError.displayInTab(); } Feature[] amtDecoyFeatures = new Feature[amtDBFeaturesThisMatch.length]; for (int i = 0; i < amtDecoyFeatures.length; i++) { amtDecoyFeatures[i] = new Feature(amtDBFeaturesThisMatch[i]); amtDecoyFeatures[i].setMass( amtDecoyFeatures[i].getMass() + AmtDatabaseMatcher.DEFAULT_DECOY_DB_MASS_ADJUSTMENT_DA); } FeatureSet amtDecoyFeatureSet = new FeatureSet(amtDecoyFeatures); FeatureSetMatcher.FeatureMatchingResult decoyMatchingResult = callWindowMatcher(ms1FeatureSetToMatch, amtDecoyFeatureSet, -realMatchDeltaMass, realMatchDeltaMass, -realMatchDeltaElution, realMatchDeltaElution); ApplicationContext.setMessage("Loose match: " + looseMatchingResult.size() + " matches. " + "Decoy: " + decoyMatchingResult.size() + " matches"); probabilityAssigner = new AmtMatchProbabilityAssigner(-realMatchDeltaMass, realMatchDeltaMass, -realMatchDeltaElution, realMatchDeltaElution, minMatchProbabilityToKeep, maxMatchFDRToKeep); probabilityAssigner.setMaxSecondBestProbability(maxSecondBestProbability); probabilityAssigner.setMinSecondBestProbabilityDifference(minSecondBestProbabilityDifference); probabilityAssigner.setMinEMIterations(minEMIterations); probabilityAssigner.setMaxEMIterations(maxEMIterations); probabilityAssigner.setMaxRProbAssignmentMillis(maxRProbAssignmentMillis); List<Feature> matchedMS1Features = probabilityAssigner.assignMatchesAndProbabilities(looseMatchingResult, decoyMatchingResult, showCharts); ms1FeatureSetToMatch.addExtraInformationType(AmtExtraInfoDef.getSingletonInstance()); if (matchingOutputFile != null) _log.debug("\toutput file " + matchingOutputFile.getName()); _log.debug("temp dir = " + TempFileManager.getTmpDir().getAbsolutePath()); Set<String> thisRunMatchedPeptides = new HashSet<String>(); for (Feature feature : ms1FeatureSetToMatch.getFeatures()) { List<String> peptidesThisFeature = MS2ExtraInfoDef.getPeptideList(feature); if (peptidesThisFeature == null) continue; for (String peptide : peptidesThisFeature) { thisRunMatchedPeptides.add(peptide); } } ApplicationContext.infoMessage(" (matched " + thisRunMatchedPeptides.size() + " distinct peptides)"); ApplicationContext.infoMessage( (100 * (double) thisRunMatchedPeptides.size() / (double) amtDatabaseThisMatch.numEntries()) + " % of database"); // allMatchedPeptides.addAll(thisRunMatchedPeptides); File amtDBFile = amtDatabaseThisMatch.getAmtDBSourceFile(); String amtDBFileName = ""; if (amtDBFile != null) amtDBFileName = amtDBFile.getName(); AmtExtraInfoDef.setFeatureSetMatchedDatabaseName(ms1FeatureSetToMatch, amtDBFileName); ms1FeatureSetToMatch.addExtraInformationType(MS2ExtraInfoDef.getSingletonInstance()); if (ms2ModificationsArray != null) { MS2ExtraInfoDef.setFeatureSetModifications(ms1FeatureSetToMatch, ms2ModificationsArray); } if (recordDBRunsInFeatureFile) { AmtRunEntry[] runs = amtDatabaseThisMatch.getRuns(); List<String> runsMatchedList = new ArrayList<String>(runs.length); for (AmtRunEntry run : runs) runsMatchedList.add(run.getPepXmlFilename()); AmtExtraInfoDef.setFeatureSetRunsMatched(ms1FeatureSetToMatch, runsMatchedList); } if (matchingOutputFile != null) { try { ms1FeatureSetToMatch.save(matchingOutputFile); } catch (Exception e) { ApplicationContext.errorMessage("Error saving output featureset", e); } } return matchedMS1Features; } /** * Perform a mass-only match between MS1 features and AMT database. This is * for determining alignment parameters when no embedded MS2 are available * @param amtDatabaseFeatureSet * @param ms1Features * @param massMatchDeltaMass * @param massMatchDeltaMassType * @return */ public Pair<Feature, Feature>[] matchOnMassOnly(FeatureSet amtDatabaseFeatureSet, Feature[] ms1Features, float massMatchDeltaMass, int massMatchDeltaMassType) { //deltaElution has to be set to a hugely high number. And it must not be set to //900000 (five zeroes), because that is a sentinel value. Booo. ClusteringFeatureSetMatcher massOnlyFeatureSetMatcher = new ClusteringFeatureSetMatcher(massMatchDeltaMass, massMatchDeltaMassType, 9000000); massOnlyFeatureSetMatcher.setElutionMode(BaseFeatureSetMatcherImpl.ELUTION_MODE_TIME); massOnlyFeatureSetMatcher.setMassBucketIncrement(1); massOnlyFeatureSetMatcher.setNumMassBuckets(Math.round(massMatchDeltaMass) - 1); massOnlyFeatureSetMatcher.setNumElutionBuckets(1); FeatureSetMatcher.FeatureMatchingResult massMatchingResult = massOnlyFeatureSetMatcher .matchFeatures(new FeatureSet(ms1Features), amtDatabaseFeatureSet); Set<Feature> ms1MatchedFeatures = massMatchingResult.getMasterSetFeatures(); int i = 0; Pair<Feature, Feature>[] result = (Pair<Feature, Feature>[]) new Pair[ms1MatchedFeatures.size()]; for (Feature ms1Feature : ms1MatchedFeatures) { result[i++] = new Pair<Feature, Feature>(ms1Feature, massMatchingResult.getBestMatch(ms1Feature)); } return result; } /** * Cover method * @param amtDatabaseFeatureSet * @param ms1Features * @param degree * @return */ public double[] calculateTHMapCoefficientsWithMassMatching(FeatureSet amtDatabaseFeatureSet, Feature[] ms1Features, int degree) { return calculateTHMapCoefficientsWithMassMatching(amtDatabaseFeatureSet, ms1Features, degree, false); } /** * Use mass-only matching to calculate a time-hydrophobicity map. * * First we perform a simple mass-only matching. Plotted on a time-hydrophobicity * chart, this will generally produce a uniformly distributed, diffuse noise of * false positives, with a high-density line of true positives somewhere in the * middle. We then look for that line. * * Simple regression won't do, because of all the noise -- it will tend to get * dragged down unpredictably. We use Yan's modal regression to determine the * high-density line. This requires an R installation, with the quantreg package * installed. * * if onlyUseSamePeptideMatches is true, then we're doing same-peptide matching, * but still checking mass (because of potential effects on RT due to modifications) * * @param amtDatabaseFeatureSet * @param features * @param degree * @return */ public double[] calculateTHMapCoefficientsWithMassMatching(FeatureSet amtDatabaseFeatureSet, Feature[] features, int degree, boolean onlyUseSamePeptideMatches) { _log.debug("calculateTHMapCoefficientsWithMassMatching, db features: " + amtDatabaseFeatureSet.getFeatures().length + ", other features: " + features.length); Pair<Feature, Feature>[] massMatchedFeatures = matchOnMassOnly(amtDatabaseFeatureSet, features, massMatchDeltaMass, massMatchDeltaMassType); if (buildCharts) { ////This chart is maybe overkill // double[] deltaMassesPPM = new double[massMatchedFeatures.length]; // for (int i=0; i<deltaMassesPPM.length; i++) // { // deltaMassesPPM[i] = (massMatchedFeatures[i].second.getMass() - // massMatchedFeatures[i].first.getMass()) * // 1000000 / massMatchedFeatures[i].first.getMass(); // } // PanelWithHistogram deltaMassHist = new PanelWithHistogram(deltaMassesPPM); // deltaMassHist.setName("massMatchDeltaPPM"); // deltaMassHist.displayInTab(); } _log.debug("calculateTHMapCoefficientsWithMassMatching, mass matches = " + massMatchedFeatures.length); if (onlyUseSamePeptideMatches) { List<Pair<Feature, Feature>> restrictedMassMatches = new ArrayList<Pair<Feature, Feature>>(); Comparator<Pair<Feature, Feature>> ms2TimeFeaturePairComparatorAsc = new Comparator<Pair<Feature, Feature>>() { public int compare(Pair<Feature, Feature> o1, Pair<Feature, Feature> o2) { float scan1 = o1.first.getScan(); float scan2 = o2.first.getScan(); return (scan1 > scan2 ? 1 : scan2 > scan1 ? -1 : 0); } }; Arrays.sort(massMatchedFeatures, ms2TimeFeaturePairComparatorAsc); Set<String> alreadyMatchedPeptides = new HashSet<String>(); for (Pair<Feature, Feature> featurePair : massMatchedFeatures) { String firstPeptide = MS2ExtraInfoDef.getFirstPeptide(featurePair.first); if (firstPeptide == null) continue; if (firstPeptide.equals(MS2ExtraInfoDef.getFirstPeptide(featurePair.second))) { if (alreadyMatchedPeptides.contains(firstPeptide)) continue; alreadyMatchedPeptides.add(firstPeptide); restrictedMassMatches.add(featurePair); } } massMatchedFeatures = restrictedMassMatches.toArray(new Pair[restrictedMassMatches.size()]); _log.debug("Restricted matches by peptide"); } _log.debug("calculateTHMapWithMassMatching: mass matches: " + massMatchedFeatures.length); return calculateTHMapCoefficientsWithMatchedFeatures(massMatchedFeatures, degree); } /** * Given pairs of matched features, map RT to H * @param matchedFeatures * @param degree * @return */ public double[] calculateTHMapCoefficientsWithMatchedFeatures(Pair<Feature, Feature>[] matchedFeatures, int degree) { if (matchedFeatures.length < MIN_MATCHED_FEATURES_FOR_REGRESSION) { ApplicationContext.infoMessage("ERROR: Insufficient mass-matched features ( " + matchedFeatures.length + ") to build T->H map. If you're " + "seeing this error, it means that mass-matching features to the database in order to build the " + "map didn't go well. You can try increasing your mass tolerance with the 'massmatchdeltamass' " + "parameter. Please also make sure you have specified the appropriate modifications for your " + "MS1 features using the 'modifications' argument. Best of all would be to provide embedded " + "MS/MS search results in PepXML format, with the 'embeddedms2' argument."); return null; } //create a t->H mapping for this MS1 set based on the matching results double[] ms1Times = new double[matchedFeatures.length]; double[] amtHydrophobicities = new double[matchedFeatures.length]; Feature[] dummyFeatures = new Feature[matchedFeatures.length]; int i = 0; boolean nonzeroMs1TimesExist = false; for (Pair<Feature, Feature> matchedPair : matchedFeatures) { Feature ms1Feature = matchedPair.first; Feature matchedMs2Feature = matchedPair.second; Feature dummyFeature = (Feature) ms1Feature.clone(); ms1Times[i] = ms1Feature.getTime(); if (ms1Times[i] > 0) nonzeroMs1TimesExist = true; amtHydrophobicities[i] = AmtExtraInfoDef.getObservedHydrophobicity(matchedMs2Feature); AmtExtraInfoDef.setObservedHydrophobicity(dummyFeature, amtHydrophobicities[i]); dummyFeatures[i] = dummyFeature; i++; } if (!nonzeroMs1TimesExist) throw new IllegalArgumentException( "ERROR! MS1 feature retention times are all zero! " + "Please populate feature " + "times, using the 'populatems2times' command, or provide associated mzXML file(s) to " + "use for populating scan times."); //ScatterPlotDialog spdt = new ScatterPlotDialog(ms1Times, amtHydrophobicities, "mass-only matches"); //spdt.setVisible(true); double[] resultCoefficients; Feature[] featuresForRegression = dummyFeatures; double[] ms1TimesForRegression = null; double[] hydrophobicitiesForRegression = null; //The last argument has a big effect on the initial regression results, and thus on what //datapoints get excluded featuresForRegression = ProteomicsRegressionUtilities.selectFeaturesWithLowLeverageAndStudentizedResidual( dummyFeatures, ms1Times, amtHydrophobicities, maxRegressionLeverageNumerator, maxRegressionStudRes, false, 1, false, true); ApplicationContext.infoMessage("Using " + featuresForRegression.length + " features (out of " + matchedFeatures.length + " mass-matched) for regression"); ms1TimesForRegression = new double[featuresForRegression.length]; hydrophobicitiesForRegression = new double[featuresForRegression.length]; for (int j = 0; j < featuresForRegression.length; j++) { ms1TimesForRegression[j] = featuresForRegression[j].getTime(); hydrophobicitiesForRegression[j] = AmtExtraInfoDef.getObservedHydrophobicity(featuresForRegression[j]); } try { resultCoefficients = RegressionUtilities.modalRegression(ms1TimesForRegression, hydrophobicitiesForRegression, degree); } catch (IOException e) { e.printStackTrace(System.err); //if we failed, it could be because of timeout, or because of a missing //quantreg package, or...? //todo: move text to bundle throw new RuntimeException("ERROR: Failure calling R for modal regression. R may have timed out.\n" + "This may also be because the required \"quantreg\" package is not installed.\n" + " To install this package, run the following lines in R:\n" + "source(\"http://bioconductor.org/biocLite.R\")\n" + "biocLite(c(\"quantreg\"))"); } _log.debug("Mapping coefficients:"); for (int j = 0; j < resultCoefficients.length; j++) { _log.debug("\tDegree " + j + ": " + resultCoefficients[j]); } if (buildCharts) { int maxTime = 0; int minTime = Integer.MAX_VALUE; for (double ms1Time : ms1Times) { if (ms1Time > maxTime) maxTime = (int) ms1Time; if (ms1Time < minTime) minTime = (int) ms1Time; } PanelWithScatterPlot psp = new PanelWithScatterPlot(); for (int j = 0; j < featuresForRegression.length; j++) { Feature feature = featuresForRegression[j]; ms1TimesForRegression[j] = feature.getTime(); hydrophobicitiesForRegression[j] = AmtExtraInfoDef.getObservedHydrophobicity(feature); } int numDotsOnChart = (maxTime - minTime + 1) / 2; double[] chartXVals = new double[numDotsOnChart]; double[] chartYVals = new double[numDotsOnChart]; for (int j = 0; j < numDotsOnChart; j++) { chartXVals[j] = minTime + (2 * j); chartYVals[j] = RegressionUtilities.mapValueUsingCoefficients(resultCoefficients, chartXVals[j]); } psp.addData(ms1TimesForRegression, hydrophobicitiesForRegression, "Matches used in regression"); psp.addData(ms1Times, amtHydrophobicities, "all mass matches"); psp.addData(chartXVals, chartYVals, "regression function"); psp.setAxisLabels("MS1 time", "AMT Hydrophobicity"); timeHydrophobicityMappingChart = psp.getChart(); } return resultCoefficients; } /** * Outermost method for calculating feature hydrophobicities. Delegator. * @param ms1Features * @param alignmentGuideFeatures * @param amtDatabaseFeatureSet * @param degree */ public void calculateFeatureHydrophobicities(Feature[] ms1Features, Feature[] alignmentGuideFeatures, FeatureSet amtDatabaseFeatureSet, int degree) { if (alignmentGuideFeatures == null) { _log.debug("Using mass-only matching to map time to hydrophobicity. Deltamass: " + massMatchDeltaMass + ", delta mass type: " + massMatchDeltaMassType); timeHydMapCoefficients = calculateTHMapCoefficientsWithMassMatching(amtDatabaseFeatureSet, ms1Features, degree, false); } else { timeHydMapCoefficients = calculateTHMapCoefficientsWithMassMatching(amtDatabaseFeatureSet, alignmentGuideFeatures, degree, true); } int maxTimePlusOne = 0; for (Feature feature : ms1Features) { maxTimePlusOne = Math.max(maxTimePlusOne, (int) (feature.getTime() + 1)); } for (Feature feature : ms1Features) { AmtExtraInfoDef.setObservedHydrophobicity(feature, RegressionUtilities.mapValueUsingCoefficients(timeHydMapCoefficients, feature.getTime())); } } /** * Calls Window2DFeatureSetMatcher to do a dead-simple 2D match between * two featuresets * @param ms1FeatureSet * @param amtFeatureSet * @param lowMassTolerance * @param highMassTolerance * @param lowHTolerance * @param highHTolerance * @return */ public FeatureSetMatcher.FeatureMatchingResult callWindowMatcher(FeatureSet ms1FeatureSet, FeatureSet amtFeatureSet, float lowMassTolerance, float highMassTolerance, float lowHTolerance, float highHTolerance) { Window2DFeatureSetMatcher window2DFeatureSetMatcher = new Window2DFeatureSetMatcher(); window2DFeatureSetMatcher.setMatchingParameters(lowMassTolerance, highMassTolerance, lowHTolerance, highHTolerance, FeatureSetMatcher.DELTA_MASS_TYPE_PPM); return window2DFeatureSetMatcher.matchFeatures(ms1FeatureSet, amtFeatureSet); } /** * Calibrate MS1 features based on an initial match to the AMT database. * Warning! This will invalidate the hash keys in matchingResult when it updates the * features. Do NOT try to use matchingResult after calling this. * @param ms1Features * @param matchingResult * @param showCharts * @return */ public double[] calibrateMS1FeaturesWithMatches(Feature[] ms1Features, FeatureSetMatcher.FeatureMatchingResult matchingResult, boolean showCharts) { _log.debug("calibrateMS1FeaturesWithMatches 1"); double[] result = calculateMassCalibrationParameters(matchingResult, showCharts); double slope = result[1]; double intercept = result[0]; if (showCharts) { int numMatches = matchingResult.size(); double[] ms1FeatureMasses = new double[numMatches]; double[] massErrorData = new double[numMatches]; int i = 0; for (Feature ms1Feature : matchingResult.getMasterSetFeatures()) { ms1FeatureMasses[i] = ms1Feature.getMass() - (float) (ms1Feature.getMass() * slope + intercept); List<Feature> matchedFeatures = matchingResult.get(ms1Feature); Feature matchedAmtFeature = matchedFeatures.get(0); massErrorData[i] = ms1FeatureMasses[i] - matchedAmtFeature.getMass(); i++; } //scatterplot of mass vs. deltaMass PanelWithScatterPlot psp = new PanelWithScatterPlot(ms1FeatureMasses, massErrorData, "MS1 feature mass vs. (signed) match mass error"); psp.setName("After calibration"); psp.setAxisLabels("MS1 Feature Mass", "Absolute (Da) error (MS1 - AMT)"); psp.displayInTab(); } //Warning! This will invalidate the hash keys in matchingResult for (Feature ms1Feature : ms1Features) { ms1Feature.setMass(ms1Feature.getMass() - (float) (ms1Feature.getMass() * slope + intercept)); ms1Feature.updateMz(); } _log.debug("calibrateMS1FeaturesWithMatches, recalibrated"); return result; } /** * Calculate linear mass calibration parameters by doing robust linear regression of * mass error in the matches vs. MS1 feature mass. * @param matchingResult * @param showCharts * @return */ public double[] calculateMassCalibrationParameters(FeatureSetMatcher.FeatureMatchingResult matchingResult, boolean showCharts) { int numMatches = matchingResult.size(); double[] ms1FeatureMasses = new double[numMatches]; double[] massErrorData = new double[numMatches]; int i = 0; double minMass = Double.MAX_VALUE; double maxMass = Double.MIN_VALUE; for (Feature ms1Feature : matchingResult.getMasterSetFeatures()) { ms1FeatureMasses[i] = ms1Feature.getMass(); if (ms1FeatureMasses[i] < minMass) minMass = ms1FeatureMasses[i]; if (ms1FeatureMasses[i] > maxMass) maxMass = ms1FeatureMasses[i]; Feature matchedAmtFeature = matchingResult.get(ms1Feature).get(0); massErrorData[i] = ms1Feature.getMass() - matchedAmtFeature.getMass(); i++; } double[] result = RegressionUtilities.robustRegression(ms1FeatureMasses, massErrorData); _log.debug("calculateMassCalibrationParameters, slope=" + result[1] + ", intercept=" + result[0]); if (showCharts) { //scatterplot of mass vs. deltaMass PanelWithScatterPlot psp = new PanelWithScatterPlot(ms1FeatureMasses, massErrorData, "MS1 feature mass vs. (signed) match mass error"); psp.addLine(result[1], result[0], minMass, maxMass); psp.setName("Before calibration"); psp.setAxisLabels("MS1 Feature Mass", "Absolute (Da) error (MS1 - AMT)"); psp.displayInTab(); } return result; } /** * Separating this out so the interesting code flows better * @param matchingResult */ public void createMassTimeErrorPlots(FeatureSetMatcher.FeatureMatchingResult matchingResult) { int numUnambiguousMatches = 0; for (Feature ms1Feature : matchingResult.getMasterSetFeatures()) { if (matchingResult.get(ms1Feature).size() == 1) numUnambiguousMatches++; } double[] ms1FeatureMasses = new double[numUnambiguousMatches]; double[] ms1FeatureHydrophobicities = new double[numUnambiguousMatches]; double[] massErrorData = new double[numUnambiguousMatches]; double[] elutionErrorData = new double[numUnambiguousMatches]; int i = 0; for (Feature ms1Feature : matchingResult.getMasterSetFeatures()) { ms1FeatureMasses[i] = ms1Feature.getMass(); ms1FeatureHydrophobicities[i] = AmtExtraInfoDef.getObservedHydrophobicity(ms1Feature); if (matchingResult.get(ms1Feature).size() > 1) continue; Feature matchedAmtFeature = matchingResult.get(ms1Feature).get(0); //convert to ppm massErrorData[i] = (ms1Feature.getMass() - matchedAmtFeature.getMass()) * (1000000 / ms1Feature.getMass()); //System.err.println("Error: " + histogramData[i-1] + ", " +ms1Feature.getMass() + ", " + result.get(ms1Feature).get(0).getMass()); elutionErrorData[i] = (AmtExtraInfoDef.getObservedHydrophobicity(ms1Feature) - (AmtExtraInfoDef.getObservedHydrophobicity(matchedAmtFeature))); i++; } //3D mass-elution histogram JDialog perspDialog = new JDialog(); perspDialog.setSize(1000, 800); massTimeErrorPerspectivePlot = new PanelWithRPerspectivePlot(); massTimeErrorPerspectivePlot.setChartHeight(800); massTimeErrorPerspectivePlot.setChartWidth(1000); massTimeErrorPerspectivePlot.setSize(1000, 800); massTimeErrorPerspectivePlot.setTiltAngle(25); massTimeErrorPerspectivePlot.setRotationAngle(-30); massTimeErrorPerspectivePlot.setAxisRVariableNames("Hydrophobicity", "Mass_ppm", "Matches"); double xBinSize = .002; double yBinSize = 1; massTimeErrorPerspectivePlot.plotPointsSummary(elutionErrorData, massErrorData, xBinSize, yBinSize); perspDialog.add(massTimeErrorPerspectivePlot); perspDialog.setVisible(true); //scatterplot of mass vs. deltaMass ScatterPlotDialog spd = new ScatterPlotDialog(ms1FeatureMasses, massErrorData, "MS1 feature mass vs. (signed) match mass error"); spd.setAxisLabels("MS1 Feature Mass", "PPM error (MS1 - AMT)"); massDeltaMassScatterPlot = spd.getPanelWithScatterPlot().getChart(); spd.setVisible(true); //scatterplot of hydrophobicity error vs. mass error ScatterPlotDialog spdHandM = new ScatterPlotDialog(elutionErrorData, massErrorData, "MS1 feature mass error vs. H error"); spdHandM.setAxisLabels("H error", "PPM error"); spdHandM.setVisible(true); } /** * Utility method * @param features * @return */ public static Set<String> createPeptideSetFromFeatures(Feature[] features) { Set<String> peptides = new HashSet<String>(); for (Feature feature : features) { String peptide = MS2ExtraInfoDef.getFirstPeptide(feature); if (peptide != null) peptides.add(peptide); } return peptides; } /** * Remove peptide entries from the AMT database that don't occur in runs that are similar, * in peptide content, to ms2Features. * @param amtDatabase * @param ms2Features * @param matchingResult * @param showCharts * @return */ public AmtDatabase reduceDatabaseByRunSimilarity(AmtDatabase amtDatabase, Feature[] ms2Features, FeatureSetMatcher.FeatureMatchingResult matchingResult, int minRunsToKeep, int maxRunsToKeep, boolean showCharts) { ApplicationContext.infoMessage("Removing unlikely entries from database"); float[] percentMatchedPerRun = new float[amtDatabase.numRuns()]; AmtRunEntry[] runEntries = amtDatabase.getRuns(); final Map<AmtRunEntry, Float> runPeptideOverlapPercentMap = new HashMap<AmtRunEntry, Float>(); final Map<AmtRunEntry, Integer> runPeptideOverlapCountMap = new HashMap<AmtRunEntry, Integer>(); final Map<AmtRunEntry, Integer> runMatchedPeptideCountMap = new HashMap<AmtRunEntry, Integer>(); final Map<AmtRunEntry, Set<String>> runMatchedPeptideMap = new HashMap<AmtRunEntry, Set<String>>(); final Map<AmtRunEntry, Set<String>> runPeptideOverlapMap = new HashMap<AmtRunEntry, Set<String>>(); final Map<AmtRunEntry, Set<String>> runPeptideSetMap = new HashMap<AmtRunEntry, Set<String>>(); Set<String> ms2Peptides = createPeptideSetFromFeatures(ms2Features); Set<Feature> matchedAmtFeatures = matchingResult.getSlaveSetFeatures(); Set<String> matchedAmtPeptides = new HashSet<String>(); for (Feature matchedAmtFeature : matchedAmtFeatures) matchedAmtPeptides.add(MS2ExtraInfoDef.getFirstPeptide(matchedAmtFeature)); for (int i = 0; i < runEntries.length; i++) { if (i % (Math.max(runEntries.length / 10, 1)) == 0) { ApplicationContext.setMessage(Rounder.round(((double) i * 100.0 / (double) runEntries.length), 0) + "% done evaluating runs"); } AmtRunEntry runEntry = runEntries[i]; Set<String> matchedPeptidesInThisRun = new HashSet<String>(); Set<String> peptidesThisRun = new HashSet<String>(); Set<String> ms2PeptidesInCommon = new HashSet<String>(); for (AmtPeptideEntry peptideEntry : amtDatabase.getEntries()) { AmtPeptideEntry.AmtPeptideObservation obs = peptideEntry.getObservationForRun(runEntry); if (obs != null) { String peptideSequence = peptideEntry.getPeptideSequence(); peptidesThisRun.add(peptideSequence); if (ms2Peptides.contains(peptideSequence)) ms2PeptidesInCommon.add(peptideSequence); if (matchedAmtPeptides.contains(peptideSequence)) matchedPeptidesInThisRun.add(peptideSequence); } } float peptideMatchesPercent = (((float) ms2PeptidesInCommon.size()) / ((float) peptidesThisRun.size()) * 100); // System.err.println("Matches this run: " + numMassMatchedFeatures + " (" + massMatchesPercent + "%)"); percentMatchedPerRun[i] = peptideMatchesPercent; runPeptideOverlapPercentMap.put(runEntry, peptideMatchesPercent); runPeptideOverlapCountMap.put(runEntry, ms2PeptidesInCommon.size()); runMatchedPeptideCountMap.put(runEntry, matchedPeptidesInThisRun.size()); runMatchedPeptideMap.put(runEntry, matchedPeptidesInThisRun); runPeptideOverlapMap.put(runEntry, ms2PeptidesInCommon); runPeptideSetMap.put(runEntry, peptidesThisRun); } AmtRunEntry[] runEntriesSorted = new AmtRunEntry[amtDatabase.numRuns()]; for (int i = 0; i < amtDatabase.numRuns(); i++) runEntriesSorted[i] = runEntries[i]; Comparator<AmtRunEntry> runComparatorByPeptideMatchPercentDesc = new Comparator<AmtRunEntry>() { public int compare(AmtRunEntry o1, AmtRunEntry o2) { float o1Matches = runPeptideOverlapPercentMap.get(o1); float o2Matches = runPeptideOverlapPercentMap.get(o2); return o1Matches < o2Matches ? 1 : o1Matches > o2Matches ? -1 : 0; } }; Arrays.sort(runEntriesSorted, runComparatorByPeptideMatchPercentDesc); float[] runIndexes = new float[runEntriesSorted.length]; float[] cumOverlapCount = new float[runEntriesSorted.length]; float[] cumMatchCount = new float[runEntriesSorted.length]; float[] cumOverlapMatchCountRatios = new float[runEntriesSorted.length]; Set<String> cumOverlap = new HashSet<String>(); Set<String> cumMatches = new HashSet<String>(); Set<String> allPeptidesToKeep = new HashSet<String>(); boolean stillAddingPeptides = true; //first derivative of the match:count ratio progression float[] deltaCumOverlapMatchCountRatios = new float[runEntriesSorted.length]; float[] deltaDelta = new float[runEntriesSorted.length]; float[] matchAdditions = new float[runEntriesSorted.length]; for (int i = 0; i < runEntriesSorted.length; i++) { cumOverlap.addAll(runPeptideOverlapMap.get(runEntriesSorted[i])); cumMatches.addAll(runMatchedPeptideMap.get(runEntriesSorted[i])); runIndexes[i] = i; cumOverlapCount[i] = cumOverlap.size(); cumMatchCount[i] = cumMatches.size(); if (i > 20) matchAdditions[i] = cumMatchCount[i] - cumMatchCount[i - 1]; cumOverlapMatchCountRatios[i] = cumOverlapCount[i] / cumMatchCount[i]; deltaCumOverlapMatchCountRatios[i] = cumOverlapMatchCountRatios[i] - (i == 0 ? 0 : cumOverlapMatchCountRatios[i - 1]); deltaDelta[i] = deltaCumOverlapMatchCountRatios[i] - (i == 0 ? 0 : deltaCumOverlapMatchCountRatios[i - 1]); //if, that is, we're adding a smaller proportion of peptides from MS/MS overlap to peptides //from AMT matches than we were a second ago... // if (i>0 && deltaCumOverlapMatchCountRatios[i] >= -0.01) } float[] matchAdditionsDifferences = new float[runEntriesSorted.length]; int maxMatchAdditionsDifference = Integer.MIN_VALUE; int maxMatchAdditionsDifferenceIndex = 0; int windowSize = 10; for (int i = Math.max(windowSize, minRunsToKeep - 1); i < Math.min(runEntriesSorted.length - windowSize, maxRunsToKeep - 1); i++) { int matchAdditionsDifference = (int) ((cumMatchCount[i + windowSize] - cumMatchCount[i]) - (cumMatchCount[i] - cumMatchCount[i - windowSize])); //System.err.println(i + ", " + (cumMatchCount[i+windowSize] - cumMatchCount[i]) + " - " + (cumMatchCount[i] - cumMatchCount[i-windowSize]) + " = " + matchAdditionsDifference); matchAdditionsDifferences[i] = matchAdditionsDifference; if (matchAdditionsDifference > maxMatchAdditionsDifference) { maxMatchAdditionsDifference = matchAdditionsDifference; maxMatchAdditionsDifferenceIndex = i; } } //System.err.println("Max additions difference: " + maxMatchAdditionsDifference + ", index=" + maxMatchAdditionsDifferenceIndex); int lastRunIndexToKeep = maxMatchAdditionsDifferenceIndex; if (lastRunIndexToKeep < minRunsToKeep - 1) lastRunIndexToKeep = minRunsToKeep - 1; if (lastRunIndexToKeep > maxRunsToKeep - 1) lastRunIndexToKeep = maxRunsToKeep - 1; for (int i = 0; i <= lastRunIndexToKeep; i++) allPeptidesToKeep.addAll(runPeptideSetMap.get(runEntriesSorted[i])); if (showCharts) { PanelWithLineChart pwlc2 = new PanelWithLineChart(runIndexes, cumOverlapCount, "cumul overlap count"); pwlc2.addData(runIndexes, cumMatchCount, "cumul match count"); pwlc2.addData(runIndexes, matchAdditions, "# of new matches added this run"); pwlc2.addData(runIndexes, matchAdditionsDifferences, "match addition differences"); float[] cutoffLineX = new float[2]; cutoffLineX[0] = lastRunIndexToKeep; cutoffLineX[1] = lastRunIndexToKeep; float[] cutoffLineY = new float[2]; cutoffLineY[0] = 0; cutoffLineY[1] = 100; pwlc2.addData(cutoffLineX, cutoffLineY, "Cutoff"); ChartDialog cd2 = new ChartDialog(pwlc2); cd2.setTitle("cumulative"); cd2.setVisible(true); } if (lastRunIndexToKeep == -1) { ApplicationContext.infoMessage("Keeping all runs!"); return amtDatabase; } else ApplicationContext.infoMessage("Keeping peptide entries from " + (lastRunIndexToKeep + 1) + " out of " + amtDatabase.numRuns() + " runs"); AmtDatabase result = (AmtDatabase) amtDatabase.waistDeepCopy(); for (AmtPeptideEntry peptideEntry : result.getEntries()) if (!allPeptidesToKeep.contains(peptideEntry.getPeptideSequence())) result.removeEntry(peptideEntry.getPeptideSequence()); ApplicationContext.infoMessage( "Kept " + result.numEntries() + " peptide entries, out of " + amtDatabase.numEntries()); ApplicationContext.infoMessage("New database: " + result.toString()); return result; } //Getters and Setters public float getMassMatchDeltaMass() { return massMatchDeltaMass; } public void setMassMatchDeltaMass(float massMatchDeltaMass) { this.massMatchDeltaMass = massMatchDeltaMass; } public int getMassMatchDeltaMassType() { return massMatchDeltaMassType; } public void setMassMatchDeltaMassType(int massMatchDeltaMassType) { this.massMatchDeltaMassType = massMatchDeltaMassType; } public float getRealMatchDeltaMass() { return realMatchDeltaMass; } public void setRealMatchDeltaMass(float realMatchDeltaMass) { this.realMatchDeltaMass = realMatchDeltaMass; } public int getRealMatchDeltaMassType() { return realMatchDeltaMassType; } public void setRealMatchDeltaMassType(int realMatchDeltaMassType) { this.realMatchDeltaMassType = realMatchDeltaMassType; } public float getRealMatchDeltaElution() { return realMatchDeltaElution; } public void setRealMatchDeltaElution(float realMatchDeltaElution) { this.realMatchDeltaElution = realMatchDeltaElution; } public void setBuildCharts(boolean buildCharts) { this.buildCharts = buildCharts; } public JFreeChart getMassMatchScatterplot() { return massMatchScatterplot; } public JFreeChart getTimeHydrophobicityMappingChart() { return timeHydrophobicityMappingChart; } public double getMaxRegressionLeverageNumerator() { return maxRegressionLeverageNumerator; } public void setMaxRegressionLeverageNumerator(double leverageNumeratorModalRegression) { this.maxRegressionLeverageNumerator = leverageNumeratorModalRegression; } public double getMaxRegressionStudentizedResidual() { return maxRegressionStudRes; } public void setMaxRegressionStudentizedResidual(double maxRegressionStudRes) { this.maxRegressionStudRes = maxRegressionStudRes; } public double getAmtFeatureMassAdjustment() { return amtFeatureMassAdjustment; } public void setAmtFeatureMassAdjustment(double amtFeatureMassAdjustment) { this.amtFeatureMassAdjustment = amtFeatureMassAdjustment; } public int getQuantileRegressionMinFeatures() { return quantileRegressionMinFeatures; } public void setQuantileRegressionMinFeatures(int quantileRegressionMinFeatures) { this.quantileRegressionMinFeatures = quantileRegressionMinFeatures; } public JFreeChart getMassCalibrationChart() { return massCalibrationChart; } /** * Define the "dimensions" of the AMT database. * * This only has any meaning for a single multi-fraction database like an IPAS, with one- * or (really) two-dimensional fractionation. * @param amtDatabaseStructure */ public void defineAMTDBStructure( Fractionation2DUtilities.FractionatedAMTDatabaseStructure amtDatabaseStructure) { this.amtDatabaseStructure = amtDatabaseStructure; amtDBDimensionsDefined = true; } public Fractionation2DUtilities.FractionatedAMTDatabaseStructure getAMTDBStructure() { return amtDatabaseStructure; } /** * This is for paring down an AMT database by removing runs without many peptides * in common with the MS2 features provided. For getting rid of runs in a heavily * fractionated database. Unnecessary? * @param amtDB * @param ms2FeatureSetToMatch * @param maxEntries * @param minRunMassMatchPercent * @param maxRuns * @param showCharts * @param ms1Features * @param ms2ModificationsForMatching * @return The modified database, or null if it can't be done */ public AmtDatabase buildAmtDatabaseForPeptideMatches(AmtDatabase amtDB, FeatureSet ms2FeatureSetToMatch, int maxEntries, int minRunMassMatchPercent, int maxRuns, boolean showCharts, Feature[] ms1Features, MS2Modification[] ms2ModificationsForMatching) { ApplicationContext.infoMessage("Discarding runs with insufficient peptide matches to MS2 features..."); int oldNumRuns = amtDB.numRuns(); AmtDatabase result = AmtDatabaseManager.removeRunsWithoutPeptideMatches(amtDB, ms2FeatureSetToMatch.getFeatures(), minRunMassMatchPercent, maxEntries, maxRuns, amtDatabaseStructure, showCharts, ms1Features, ms2ModificationsForMatching, this); if (result.numEntries() == 0) { throw new RuntimeException("FAILURE: No runs in the AMT database match " + minRunMassMatchPercent + "% of peptides from MS2 features in file " + ms2FeatureSetToMatch.getSourceFile().getName()); } else { ApplicationContext.infoMessage("Retained " + result.numRuns() + " out of " + oldNumRuns + " runs in AMT database that matched at least " + minRunMassMatchPercent + "% of features"); ApplicationContext.infoMessage("Reduced DB: " + result.toString()); } return result; } /** * This method removes runs from an AMT database based on distance in fraction * space from the run we're matching to. This is purely for demonstration of * how well it doesn't work, should be discarded after paper is written. * @param amtDB * @param ms2FeatureSetToMatch * @param maxEntries * @param minRunMassMatchPercent * @param maxRuns * @param showCharts * @return The modified database, or null if it can't be done */ public AmtDatabase buildAmtDatabaseForGeographicRestriction(AmtDatabase amtDB, FeatureSet ms2FeatureSetToMatch, int maxEntries, int minRunMassMatchPercent, int maxRuns, boolean showCharts) { AmtRunEntry[] runs = amtDB.getRuns(); String ms2FileName = ms2FeatureSetToMatch.getSourceFile().getName(); int ms2Index = 0; for (ms2Index = 0; ms2Index < runs.length; ms2Index++) { String runFileName = runs[ms2Index].getPepXmlFilename(); if (runFileName.substring(0, runFileName.indexOf(".")) .equalsIgnoreCase(ms2FileName.substring(0, ms2FileName.indexOf(".")))) break; } Pair<Integer, int[]> expAndPos = amtDatabaseStructure.calculateExperimentAndPosition(ms2Index); int runEntryCol = expAndPos.second[0]; int runEntryRow = expAndPos.second[1]; ApplicationContext.infoMessage("Discarding runs far away from MS1 run..."); int oldNumRuns = amtDB.numRuns(); AmtDatabase result = AmtDatabaseManager.removeRunsByStructure(amtDB, runEntryRow, runEntryCol, expAndPos.first, maxEntries, maxRuns, amtDatabaseStructure, showCharts); if (result.numEntries() == 0) { throw new RuntimeException( "FAILURE: unable to satisfy maxEntries, maxRuns constraints using runs geographically close to " + ms2FeatureSetToMatch.getSourceFile().getName()); } else { ApplicationContext.infoMessage("Retained " + result.numRuns() + " out of " + oldNumRuns + " runs in AMT database that matched at least " + minRunMassMatchPercent + "% of features"); ApplicationContext.infoMessage("Reduced DB: " + result.toString()); } return result; } public float getMinMatchProbabilityToKeep() { return minMatchProbabilityToKeep; } public void setMinMatchProbabilityToKeep(float minMatchProbabilityToKeep) { this.minMatchProbabilityToKeep = minMatchProbabilityToKeep; } public AmtMatchProbabilityAssigner getProbabilityAssigner() { return probabilityAssigner; } public void setProbabilityAssigner(AmtMatchProbabilityAssigner probabilityAssigner) { this.probabilityAssigner = probabilityAssigner; } public boolean isDecoyMatch() { return doDecoyMatch; } public void setDecoyMatch(boolean decoyMatch) { this.doDecoyMatch = decoyMatch; } public int getMinEMIterations() { return minEMIterations; } public void setMinEMIterations(int minEMIterations) { this.minEMIterations = minEMIterations; } public int getMaxEMIterations() { return maxEMIterations; } public void setMaxEMIterations(int maxEMIterations) { this.maxEMIterations = maxEMIterations; } public float getMinSecondBestProbabilityDifference() { return minSecondBestProbabilityDifference; } public void setMinSecondBestProbabilityDifference(float minSecondBestProbabilityDifference) { this.minSecondBestProbabilityDifference = minSecondBestProbabilityDifference; } public float getMaxSecondBestProbability() { return maxSecondBestProbability; } public void setMaxSecondBestProbability(float maxSecondBestProbability) { this.maxSecondBestProbability = maxSecondBestProbability; } public boolean isUseMs1TimesForAlignment() { return useMs1TimesForAlignment; } public void setUseMs1TimesForAlignment(boolean useMs1TimesForAlignment) { this.useMs1TimesForAlignment = useMs1TimesForAlignment; } public float getMs1Ms2MassTolerancePPM() { return ms1Ms2MassTolerancePPM; } public void setMs1Ms2MassTolerancePPM(float ms1Ms2MassTolerancePPM) { this.ms1Ms2MassTolerancePPM = ms1Ms2MassTolerancePPM; } public float getMs1Ms2TimeToleranceSeconds() { return ms1Ms2TimeToleranceSeconds; } public void setMs1Ms2TimeToleranceSeconds(float ms1Ms2TimeToleranceSeconds) { this.ms1Ms2TimeToleranceSeconds = ms1Ms2TimeToleranceSeconds; } public int getNonlinearMappingPolynomialDegree() { return nonlinearMappingPolynomialDegree; } public void setNonlinearMappingPolynomialDegree(int nonlinearMappingPolynomialDegree) { this.nonlinearMappingPolynomialDegree = nonlinearMappingPolynomialDegree; } public float getMaxMatchFDRToKeep() { return maxMatchFDRToKeep; } public void setMaxMatchFDRToKeep(float maxMatchFDRToKeep) { this.maxMatchFDRToKeep = maxMatchFDRToKeep; } public int getMaxRProbAssignmentMillis() { return maxRProbAssignmentMillis; } public void setMaxRProbAssignmentMillis(int maxRProbAssignmentMillis) { this.maxRProbAssignmentMillis = maxRProbAssignmentMillis; } }