Java tutorial
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.lcms; import java.util.List; import java.util.Map; import java.util.HashMap; import java.util.Set; import java.util.HashSet; import java.util.Arrays; import java.util.ArrayList; import java.util.Iterator; import java.util.Collections; import java.util.Comparator; import java.io.IOException; import java.io.PrintStream; import java.io.FileOutputStream; import org.apache.commons.lang3.tuple.Pair; public class MS2 { class YZ { Double mz; Double intensity; public YZ(Double mz, Double intensity) { this.mz = mz; this.intensity = intensity; } } class XZ { Double time; Double intensity; public XZ(Double t, Double i) { this.time = t; this.intensity = i; } } class MS2Collected { Double triggerTime; Double voltage; List<YZ> ms2; public MS2Collected(Double trigTime, Double collisionEv, List<YZ> ms2) { this.triggerTime = trigTime; this.ms2 = ms2; this.voltage = collisionEv; } } // Rounding upto 2 decimal places should result in pretty // much an identical match on mz in the MS2 spectra final static Double MS2_MZ_COMPARE_TOLERANCE = 0.005; // In the MS1 case, we look for a very tight window // because we do not noise to broaden our signal final static Double MS1_MZ_TOLERANCE = 0.001; // when aggregating the MS1 signal, we do not expect // more than these number of measurements within the // mz window specified by the tolerance above static final Integer MAX_MZ_IN_WINDOW = 3; // Use the time window to only identify identical scans // hence the infinitely small window final static Double TIME_TOLERANCE = 0.1 / 1e3d; // the number of peaks to compare across the MS2s of // the standard and the strain final static Integer REPORT_TOP_N = 40; // the threshold we compare against after computing the // weighted matching of peaks (sum of [0,1] intensities) // Rationale for keeping it at 1.0: If the MS2 spectra // consists, in the degenerate case of 1 peak, and that // matches across the two spectra, we should declare success final static Double THRESHOLD_WEIGHTED_PEAK = 1.0; // log? final static boolean LOG_PEAKS_TO_STDOUT = false; private List<LCMS2MZSelection> filterByTriggerMass(Iterator<LCMS2MZSelection> ms2Scans, Double targetMass) { // Look for precisely this time point, so infinitely small window Double mzLow = targetMass - MS1_MZ_TOLERANCE; Double mzHigh = targetMass + MS1_MZ_TOLERANCE; List<LCMS2MZSelection> relevantMS2Scans = new ArrayList<>(); while (ms2Scans.hasNext()) { LCMS2MZSelection scan = ms2Scans.next(); Double targetWindow = scan.getIsolationWindowTargetMZ(); if (targetWindow >= mzLow && targetWindow <= mzHigh) { relevantMS2Scans.add(scan); } } if (relevantMS2Scans.size() < 1) { String errmsg = String.format("SEVERE ERR: found no matching MS2 scans for MS1 target mass %f", targetMass); throw new RuntimeException(errmsg); } return relevantMS2Scans; } List<YZ> spectrumToYZList(LCMSSpectrum spectrum) { List<YZ> yzList = new ArrayList<>(spectrum.getIntensities().size()); for (Pair<Double, Double> p : spectrum.getIntensities()) { yzList.add(new YZ(p.getLeft(), p.getRight())); } return yzList; } List<MS2Collected> getSpectraForMatchingScans(List<LCMS2MZSelection> relevantMS2Selections, Iterator<LCMSSpectrum> ms2Spectra) { List<MS2Collected> ms2s = new ArrayList<>(); Iterator<LCMS2MZSelection> selectionIterator = relevantMS2Selections.iterator(); if (!selectionIterator.hasNext()) { // Previous checks should have prevented this. throw new RuntimeException("No scans available for spectrum matching"); } LCMS2MZSelection thisSelection = selectionIterator.next(); // TODO: handle other time units more gracefully. if (!"minute".equals(thisSelection.getTimeUnit())) { throw new RuntimeException( String.format("Expected 'minute' for MS2 scan selection time unit, but found '%s'", thisSelection.getTimeUnit())); } Double ms2Time = thisSelection.getTimeVal() * 60.0d; // mzML times tend to be in minutes; Double collisionEnergy = thisSelection.getCollisionEnergy(); // assumed in electronvols Double tLow = ms2Time - TIME_TOLERANCE; Double tHigh = ms2Time + TIME_TOLERANCE; while (ms2Spectra.hasNext()) { boolean advanceMS2Selection = false; LCMSSpectrum spectrum = ms2Spectra.next(); Double sTime = spectrum.getTimeVal(); if (sTime >= tLow && sTime <= tHigh) { // We found a matching scan! MS2Collected ms2 = new MS2Collected(ms2Time, collisionEnergy, this.spectrumToYZList(spectrum)); ms2s.add(ms2); advanceMS2Selection = true; } else if (sTime > ms2Time) { System.err.format( "ERROR: found spectrum at time %f when searching for MS2 scan at %f, skipping MS2 scan\n", sTime, ms2Time); advanceMS2Selection = true; } // Otherwise, this spectrum's time doesn't match the time point of the next relevant MS2 scan. Skip it! if (advanceMS2Selection) { if (!selectionIterator.hasNext()) { // No more relevant scans to search for. break; } thisSelection = selectionIterator.next(); ms2Time = thisSelection.getTimeVal() * 60.0d; // Assume time units are consistent across all mzML entries. tLow = ms2Time - TIME_TOLERANCE; tHigh = ms2Time + TIME_TOLERANCE; } } if (selectionIterator.hasNext()) { System.err.format( "ERROR: ran out of spectra to match against MS2 scans with some scans still unmatched.\n"); } return ms2s; } private Pair<Double, Double> getMaxAndNth(List<YZ> mzInt, int N) { if (N > mzInt.size()) N = mzInt.size(); List<YZ> mzIonsByInt = new ArrayList<>(mzInt); Collections.sort(mzIonsByInt, new Comparator<YZ>() { public int compare(YZ a, YZ b) { return b.intensity.compareTo(a.intensity); } }); // lets normalize to the largest intensity value we have. Double largest = mzIonsByInt.get(0).intensity; Double NthLargest = mzIonsByInt.get(N - 1).intensity; // print out the top N peaks if (LOG_PEAKS_TO_STDOUT) { for (int i = 0; i < N; i++) { YZ yz = mzIonsByInt.get(i); System.out.format("mz: %8.4f\t intensity: %6.2f%%\n", yz.mz, 100 * yz.intensity / largest); } System.out.println("\n"); } return Pair.of(largest, NthLargest); } private double extractMZ(double mzWanted, List<Pair<Double, Double>> intensities) { double intensityFound = 0; int numWithinPrecision = 0; double mzLowRange = mzWanted - MS1_MZ_TOLERANCE; double mzHighRange = mzWanted + MS1_MZ_TOLERANCE; // we expect there to be pretty much only one intensity value in the precision // range we are looking at. But if a lot of masses show up then complain for (Pair<Double, Double> mz_int : intensities) { double mz = mz_int.getLeft(); double intensity = mz_int.getRight(); if (mz >= mzLowRange && mz <= mzHighRange) { intensityFound += intensity; numWithinPrecision++; } } if (numWithinPrecision > MAX_MZ_IN_WINDOW) { System.out.format("Only expected %d, but found %d in the mz range [%f, %f]\n", MAX_MZ_IN_WINDOW, numWithinPrecision, mzLowRange, mzHighRange); } return intensityFound; } private List<XZ> getMS1(double mz, Iterator<LCMSSpectrum> ms1Spectra) { List<XZ> ms1 = new ArrayList<>(); while (ms1Spectra.hasNext()) { LCMSSpectrum timepoint = ms1Spectra.next(); // get all (mz, intensity) at this timepoint List<Pair<Double, Double>> intensities = timepoint.getIntensities(); // this time point is valid to look at if its max intensity is around // the mass we care about. So lets first get the max peak location double intensityForMz = extractMZ(mz, intensities); // the above is Pair(mz_extracted, intensity), where mz_extracted = mz // we now add the timepoint val and the intensity to the output ms1.add(new XZ(timepoint.getTimeVal(), intensityForMz)); } return ms1; } private Double ms1IntensityMax(List<XZ> ms1) { Double maxTime = 0.0; Double maxIntensity = 0.0; for (XZ obs : ms1) { if (obs.intensity > maxIntensity) { maxIntensity = obs.intensity; maxTime = obs.time; } } return maxTime; } private MS2Collected findSpectraClosestToMS1Apex(List<MS2Collected> ms2s, Double ms1Apex) { MS2Collected closest = null; Double minDist = null; for (MS2Collected ms2 : ms2s) { Double howFar = Math.abs(ms2.triggerTime - ms1Apex); if (closest == null || minDist > howFar) { closest = ms2; minDist = howFar; } } return closest; } private static boolean areNCFiles(String[] fnames) { for (String n : fnames) { System.out.println(".nc file = " + n); if (!n.endsWith(".nc")) return false; } return true; } private MS2Collected pickMS2ClosestToMS1Peak(Double mz, String ms2mzML, String ms2nc, String ms1) throws Exception { // see where in the ms1 this mz peaks List<XZ> ms1Spectra = getMS1(mz, new LCMSNetCDFParser().getIterator(ms1)); Double apexTime = ms1IntensityMax(ms1Spectra); System.out.format("Max in MS1 at time %8.4f\n", apexTime); // the first .nc is the ion trigger on the mz extracted List<LCMS2MZSelection> matchingScans = filterByTriggerMass(new LCMS2mzMLParser().getIterator(ms2mzML), mz); Iterator<LCMSSpectrum> spectrumIterator = new LCMSNetCDFParser().getIterator(ms2nc); List<MS2Collected> ms2Spectra = getSpectraForMatchingScans(matchingScans, spectrumIterator); MS2Collected ms2Main = findSpectraClosestToMS1Apex(ms2Spectra, apexTime); return ms2Main; } private MS2Collected normalizeAndThreshold(MS2Collected ms2) { Pair<Double, Double> largestAndNth = getMaxAndNth(ms2.ms2, REPORT_TOP_N); Double largest = largestAndNth.getLeft(); Double nth = largestAndNth.getRight(); List<YZ> ms2AboveThreshold = new ArrayList<>(); // print out the spectra to outDATA for (YZ yz : ms2.ms2) { // threshold to remove everything that is not in the top peaks if (yz.intensity < nth) continue; ms2AboveThreshold.add(new YZ(yz.mz, 100.0 * yz.intensity / largest)); } return new MS2Collected(ms2.triggerTime, ms2.voltage, ms2AboveThreshold); } private YZ getMatchingPeak(YZ toLook, List<YZ> matchAgainst) { Double mz = toLook.mz; YZ match = null; YZ minDistMatch = null; for (YZ peak : matchAgainst) { Double dist = Math.abs(peak.mz - mz); // we look for a peak that is within MS2_MZ_COMPARE_TOLERANCE of mz if (dist < MS2_MZ_COMPARE_TOLERANCE) { // this is a match, make sure it is the only match if (match != null) { System.out.format( "SEVERE: MS2_MZ_COMPARE_TOLERANCE might be too wide. MS2 peak %.4f has >1 matches.\n" + "\tMatch 1: %.4f\t Match 2: %.4f\n", mz, match.mz, peak.mz); } match = peak; } // bookkeeping for how close it got, in case no matches within precision if (minDistMatch == null || Math.abs(minDistMatch.mz - mz) > dist) { minDistMatch = peak; } } if (match != null) { System.out.format("Peak %8.4f (%6.2f%%) - MATCHES - PEAK: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", mz, toLook.intensity, match.mz, match.intensity, Math.abs(match.mz - mz)); } else { System.out.format("Peak %8.4f (%6.2f%%) - NO MTCH - CLOSEST: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", mz, toLook.intensity, minDistMatch.mz, minDistMatch.intensity, Math.abs(minDistMatch.mz - mz)); } return match; } private Double weightedMatch(MS2Collected A, MS2Collected B) { Double weightedSum = 0.0; // we should go through the peaks in descending order of intensity // so that get reported to the output in that order List<YZ> orderedBms2 = new ArrayList<>(B.ms2); Collections.sort(orderedBms2, new Comparator<YZ>() { public int compare(YZ a, YZ b) { return b.intensity.compareTo(a.intensity); } }); // once a peak is matched, we should remove it from the available // set to be matched further List<YZ> toMatch = new ArrayList<>(A.ms2); for (YZ peak : orderedBms2) { YZ matchInA = getMatchingPeak(peak, toMatch); if (matchInA != null) { // this YZ peak in B has a match `matchInA` in A's MS2 peaks // if the aggregate peak across both spectra is high, we give it a // high score; by weighing it with the intensity percentage Double intensityPc = (peak.intensity + matchInA.intensity) / 2.0; // scale it back to [0,1] from [0,100]% Double intensity = intensityPc / 100; weightedSum += intensity; toMatch.remove(matchInA); } } return weightedSum; } private boolean doMatch(MS2Collected A, MS2Collected B) { Double weightedPeakMatch = weightedMatch(A, B); System.out.format("Weighted peak match: %.2f >= %.2f\n", weightedPeakMatch, THRESHOLD_WEIGHTED_PEAK); boolean isMatch = weightedPeakMatch >= THRESHOLD_WEIGHTED_PEAK; return isMatch; } private boolean doesMS2Match(Double mz, String Ams2mzML, String Ams2nc, String Ams1, String Bms2mzML, String Bms2nc, String Bms1, String outPrefix, String fmt) throws IOException { MS2Collected Ams2Peaks = null, Bms2Peaks = null; try { Ams2Peaks = normalizeAndThreshold(pickMS2ClosestToMS1Peak(mz, Ams2mzML, Ams2nc, Ams1)); System.out.println("Standard: MS2 fragmentation trigged on " + mz); } catch (Exception e) { System.out.println("Standard: " + e.getMessage()); } try { Bms2Peaks = normalizeAndThreshold(pickMS2ClosestToMS1Peak(mz, Bms2mzML, Bms2nc, Bms1)); System.out.println("Sample: MS2 fragmentation trigged on " + mz); } catch (Exception e) { System.out.println("Sample: " + e.getMessage()); } boolean isMatch = false; if (Ams2Peaks != null && Bms2Peaks != null) { isMatch = doMatch(Ams2Peaks, Bms2Peaks); // plotting can throw an io exception plot(Ams2Peaks, Bms2Peaks, mz, outPrefix, fmt); } return isMatch; } private void plot(MS2Collected Ams2Peaks, MS2Collected Bms2Peaks, Double mz, String outPrefix, String fmt) throws IOException { MS2Collected[] ms2Spectra = new MS2Collected[] { Ams2Peaks, Bms2Peaks }; String outPDF = outPrefix + "." + fmt; String outDATA = outPrefix + ".data"; // Write data output to outfile PrintStream out = new PrintStream(new FileOutputStream(outDATA)); List<String> plotID = new ArrayList<>(ms2Spectra.length); for (MS2Collected yzSlice : ms2Spectra) { plotID.add(String.format("time: %.4f, volts: %.4f", yzSlice.triggerTime, yzSlice.voltage)); // print out the spectra to outDATA for (YZ yz : yzSlice.ms2) { out.format("%.4f\t%.4f\n", yz.mz, yz.intensity); out.flush(); } // delimit this dataset from the rest out.print("\n\n"); } // close the .data out.close(); // render outDATA to outPDF using gnuplot // 105.0 here means 105% for the y-range of a [0%:100%] plot. We want to leave some buffer space at // at the top, and hence we go a little outside of the 100% max range. new Gnuplotter().plot2DImpulsesWithLabels(outDATA, outPDF, plotID.toArray(new String[plotID.size()]), mz + 50.0, "mz", 105.0, "intensity (%)", fmt); } public static void main(String[] args) throws Exception { if (args.length < 8 || !areNCFiles(new String[] { args[3], args[4] }) || !areNCFiles(new String[] { args[6], args[7] })) { throw new RuntimeException("Needs: \n" + "(1) mz for main product, e.g., 431.1341983 (ononin) \n" + "(2) prefix for .data and rendered .pdf \n" + "(3) STD: mzML file from MS2 run (to extract trigger masses) \n" + "(4) STD: NetCDF .nc file 02.nc from MSMS run \n" + "(5) STD: NetCDF .nc file 01.nc from MS1 run \n" + "(6) YST: mzML file from MS2 run (to extract trigger masses) \n" + "(7) YST: NetCDF .nc file 02.nc from MSMS run \n" + "(8) YST: NetCDF .nc file 01.nc from MS1 run"); } String fmt = "pdf"; Double mz = Double.parseDouble(args[0]); String outPrefix = args[1]; String Ams2mzml = args[2]; String Ams2nc = args[3]; String Ams1 = args[4]; String Bms2mzml = args[5]; String Bms2nc = args[6]; String Bms1 = args[7]; MS2 c = new MS2(); boolean areMatch = c.doesMS2Match(mz, Ams2mzml, Ams2nc, Ams1, Bms2mzml, Bms2nc, Bms1, outPrefix, fmt); System.out.format("MS2 spectra match: %s\n", (areMatch ? "YES" : "NO")); } }