com.act.lcms.MS2.java Source code

Java tutorial

Introduction

Here is the source code for com.act.lcms.MS2.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms;

import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Collections;
import java.util.Comparator;
import java.io.IOException;
import java.io.PrintStream;
import java.io.FileOutputStream;
import org.apache.commons.lang3.tuple.Pair;

public class MS2 {

    class YZ {
        Double mz;
        Double intensity;

        public YZ(Double mz, Double intensity) {
            this.mz = mz;
            this.intensity = intensity;
        }
    }

    class XZ {
        Double time;
        Double intensity;

        public XZ(Double t, Double i) {
            this.time = t;
            this.intensity = i;
        }
    }

    class MS2Collected {
        Double triggerTime;
        Double voltage;
        List<YZ> ms2;

        public MS2Collected(Double trigTime, Double collisionEv, List<YZ> ms2) {
            this.triggerTime = trigTime;
            this.ms2 = ms2;
            this.voltage = collisionEv;
        }
    }

    // Rounding upto 2 decimal places should result in pretty 
    // much an identical match on mz in the MS2 spectra
    final static Double MS2_MZ_COMPARE_TOLERANCE = 0.005;

    // In the MS1 case, we look for a very tight window 
    // because we do not noise to broaden our signal
    final static Double MS1_MZ_TOLERANCE = 0.001;

    // when aggregating the MS1 signal, we do not expect
    // more than these number of measurements within the
    // mz window specified by the tolerance above
    static final Integer MAX_MZ_IN_WINDOW = 3;

    // Use the time window to only identify identical scans
    // hence the infinitely small window
    final static Double TIME_TOLERANCE = 0.1 / 1e3d;

    // the number of peaks to compare across the MS2s of 
    // the standard and the strain
    final static Integer REPORT_TOP_N = 40;

    // the threshold we compare against after computing the
    // weighted matching of peaks (sum of [0,1] intensities)
    // Rationale for keeping it at 1.0: If the MS2 spectra
    // consists, in the degenerate case of 1 peak, and that
    // matches across the two spectra, we should declare success
    final static Double THRESHOLD_WEIGHTED_PEAK = 1.0;

    // log?
    final static boolean LOG_PEAKS_TO_STDOUT = false;

    private List<LCMS2MZSelection> filterByTriggerMass(Iterator<LCMS2MZSelection> ms2Scans, Double targetMass) {
        // Look for precisely this time point, so infinitely small window
        Double mzLow = targetMass - MS1_MZ_TOLERANCE;
        Double mzHigh = targetMass + MS1_MZ_TOLERANCE;

        List<LCMS2MZSelection> relevantMS2Scans = new ArrayList<>();
        while (ms2Scans.hasNext()) {
            LCMS2MZSelection scan = ms2Scans.next();
            Double targetWindow = scan.getIsolationWindowTargetMZ();
            if (targetWindow >= mzLow && targetWindow <= mzHigh) {
                relevantMS2Scans.add(scan);
            }
        }

        if (relevantMS2Scans.size() < 1) {
            String errmsg = String.format("SEVERE ERR: found no matching MS2 scans for MS1 target mass %f",
                    targetMass);
            throw new RuntimeException(errmsg);
        }

        return relevantMS2Scans;
    }

    List<YZ> spectrumToYZList(LCMSSpectrum spectrum) {
        List<YZ> yzList = new ArrayList<>(spectrum.getIntensities().size());
        for (Pair<Double, Double> p : spectrum.getIntensities()) {
            yzList.add(new YZ(p.getLeft(), p.getRight()));
        }
        return yzList;
    }

    List<MS2Collected> getSpectraForMatchingScans(List<LCMS2MZSelection> relevantMS2Selections,
            Iterator<LCMSSpectrum> ms2Spectra) {
        List<MS2Collected> ms2s = new ArrayList<>();

        Iterator<LCMS2MZSelection> selectionIterator = relevantMS2Selections.iterator();
        if (!selectionIterator.hasNext()) {
            // Previous checks should have prevented this.
            throw new RuntimeException("No scans available for spectrum matching");
        }
        LCMS2MZSelection thisSelection = selectionIterator.next();
        // TODO: handle other time units more gracefully.
        if (!"minute".equals(thisSelection.getTimeUnit())) {
            throw new RuntimeException(
                    String.format("Expected 'minute' for MS2 scan selection time unit, but found '%s'",
                            thisSelection.getTimeUnit()));
        }
        Double ms2Time = thisSelection.getTimeVal() * 60.0d; // mzML times tend to be in minutes;
        Double collisionEnergy = thisSelection.getCollisionEnergy(); // assumed in electronvols
        Double tLow = ms2Time - TIME_TOLERANCE;
        Double tHigh = ms2Time + TIME_TOLERANCE;

        while (ms2Spectra.hasNext()) {
            boolean advanceMS2Selection = false;

            LCMSSpectrum spectrum = ms2Spectra.next();
            Double sTime = spectrum.getTimeVal();
            if (sTime >= tLow && sTime <= tHigh) {
                // We found a matching scan!
                MS2Collected ms2 = new MS2Collected(ms2Time, collisionEnergy, this.spectrumToYZList(spectrum));
                ms2s.add(ms2);
                advanceMS2Selection = true;
            } else if (sTime > ms2Time) {
                System.err.format(
                        "ERROR: found spectrum at time %f when searching for MS2 scan at %f, skipping MS2 scan\n",
                        sTime, ms2Time);
                advanceMS2Selection = true;
            } // Otherwise, this spectrum's time doesn't match the time point of the next relevant MS2 scan.  Skip it!

            if (advanceMS2Selection) {
                if (!selectionIterator.hasNext()) {
                    // No more relevant scans to search for.
                    break;
                }
                thisSelection = selectionIterator.next();
                ms2Time = thisSelection.getTimeVal() * 60.0d; // Assume time units are consistent across all mzML entries.
                tLow = ms2Time - TIME_TOLERANCE;
                tHigh = ms2Time + TIME_TOLERANCE;

            }
        }

        if (selectionIterator.hasNext()) {
            System.err.format(
                    "ERROR: ran out of spectra to match against MS2 scans with some scans still unmatched.\n");
        }

        return ms2s;
    }

    private Pair<Double, Double> getMaxAndNth(List<YZ> mzInt, int N) {
        if (N > mzInt.size())
            N = mzInt.size();

        List<YZ> mzIonsByInt = new ArrayList<>(mzInt);
        Collections.sort(mzIonsByInt, new Comparator<YZ>() {
            public int compare(YZ a, YZ b) {
                return b.intensity.compareTo(a.intensity);
            }
        });

        // lets normalize to the largest intensity value we have.
        Double largest = mzIonsByInt.get(0).intensity;
        Double NthLargest = mzIonsByInt.get(N - 1).intensity;

        // print out the top N peaks
        if (LOG_PEAKS_TO_STDOUT) {
            for (int i = 0; i < N; i++) {
                YZ yz = mzIonsByInt.get(i);
                System.out.format("mz: %8.4f\t intensity: %6.2f%%\n", yz.mz, 100 * yz.intensity / largest);
            }
            System.out.println("\n");
        }

        return Pair.of(largest, NthLargest);
    }

    private double extractMZ(double mzWanted, List<Pair<Double, Double>> intensities) {
        double intensityFound = 0;
        int numWithinPrecision = 0;
        double mzLowRange = mzWanted - MS1_MZ_TOLERANCE;
        double mzHighRange = mzWanted + MS1_MZ_TOLERANCE;
        // we expect there to be pretty much only one intensity value in the precision
        // range we are looking at. But if a lot of masses show up then complain
        for (Pair<Double, Double> mz_int : intensities) {
            double mz = mz_int.getLeft();
            double intensity = mz_int.getRight();

            if (mz >= mzLowRange && mz <= mzHighRange) {
                intensityFound += intensity;
                numWithinPrecision++;
            }
        }

        if (numWithinPrecision > MAX_MZ_IN_WINDOW) {
            System.out.format("Only expected %d, but found %d in the mz range [%f, %f]\n", MAX_MZ_IN_WINDOW,
                    numWithinPrecision, mzLowRange, mzHighRange);
        }

        return intensityFound;
    }

    private List<XZ> getMS1(double mz, Iterator<LCMSSpectrum> ms1Spectra) {
        List<XZ> ms1 = new ArrayList<>();

        while (ms1Spectra.hasNext()) {
            LCMSSpectrum timepoint = ms1Spectra.next();

            // get all (mz, intensity) at this timepoint
            List<Pair<Double, Double>> intensities = timepoint.getIntensities();

            // this time point is valid to look at if its max intensity is around
            // the mass we care about. So lets first get the max peak location
            double intensityForMz = extractMZ(mz, intensities);

            // the above is Pair(mz_extracted, intensity), where mz_extracted = mz
            // we now add the timepoint val and the intensity to the output
            ms1.add(new XZ(timepoint.getTimeVal(), intensityForMz));
        }

        return ms1;
    }

    private Double ms1IntensityMax(List<XZ> ms1) {
        Double maxTime = 0.0;
        Double maxIntensity = 0.0;

        for (XZ obs : ms1) {
            if (obs.intensity > maxIntensity) {
                maxIntensity = obs.intensity;
                maxTime = obs.time;
            }
        }

        return maxTime;
    }

    private MS2Collected findSpectraClosestToMS1Apex(List<MS2Collected> ms2s, Double ms1Apex) {
        MS2Collected closest = null;
        Double minDist = null;
        for (MS2Collected ms2 : ms2s) {
            Double howFar = Math.abs(ms2.triggerTime - ms1Apex);
            if (closest == null || minDist > howFar) {
                closest = ms2;
                minDist = howFar;
            }
        }
        return closest;
    }

    private static boolean areNCFiles(String[] fnames) {
        for (String n : fnames) {
            System.out.println(".nc file = " + n);
            if (!n.endsWith(".nc"))
                return false;
        }
        return true;
    }

    private MS2Collected pickMS2ClosestToMS1Peak(Double mz, String ms2mzML, String ms2nc, String ms1)
            throws Exception {

        // see where in the ms1 this mz peaks
        List<XZ> ms1Spectra = getMS1(mz, new LCMSNetCDFParser().getIterator(ms1));
        Double apexTime = ms1IntensityMax(ms1Spectra);
        System.out.format("Max in MS1 at time %8.4f\n", apexTime);

        // the first .nc is the ion trigger on the mz extracted
        List<LCMS2MZSelection> matchingScans = filterByTriggerMass(new LCMS2mzMLParser().getIterator(ms2mzML), mz);

        Iterator<LCMSSpectrum> spectrumIterator = new LCMSNetCDFParser().getIterator(ms2nc);
        List<MS2Collected> ms2Spectra = getSpectraForMatchingScans(matchingScans, spectrumIterator);
        MS2Collected ms2Main = findSpectraClosestToMS1Apex(ms2Spectra, apexTime);

        return ms2Main;
    }

    private MS2Collected normalizeAndThreshold(MS2Collected ms2) {
        Pair<Double, Double> largestAndNth = getMaxAndNth(ms2.ms2, REPORT_TOP_N);
        Double largest = largestAndNth.getLeft();
        Double nth = largestAndNth.getRight();

        List<YZ> ms2AboveThreshold = new ArrayList<>();
        // print out the spectra to outDATA
        for (YZ yz : ms2.ms2) {

            // threshold to remove everything that is not in the top peaks
            if (yz.intensity < nth)
                continue;

            ms2AboveThreshold.add(new YZ(yz.mz, 100.0 * yz.intensity / largest));
        }

        return new MS2Collected(ms2.triggerTime, ms2.voltage, ms2AboveThreshold);
    }

    private YZ getMatchingPeak(YZ toLook, List<YZ> matchAgainst) {
        Double mz = toLook.mz;
        YZ match = null;
        YZ minDistMatch = null;
        for (YZ peak : matchAgainst) {
            Double dist = Math.abs(peak.mz - mz);
            // we look for a peak that is within MS2_MZ_COMPARE_TOLERANCE of mz
            if (dist < MS2_MZ_COMPARE_TOLERANCE) {

                // this is a match, make sure it is the only match
                if (match != null) {
                    System.out.format(
                            "SEVERE: MS2_MZ_COMPARE_TOLERANCE might be too wide. MS2 peak %.4f has >1 matches.\n"
                                    + "\tMatch 1: %.4f\t Match 2: %.4f\n",
                            mz, match.mz, peak.mz);
                }

                match = peak;
            }

            // bookkeeping for how close it got, in case no matches within precision
            if (minDistMatch == null || Math.abs(minDistMatch.mz - mz) > dist) {
                minDistMatch = peak;
            }
        }
        if (match != null) {
            System.out.format("Peak %8.4f (%6.2f%%) - MATCHES -    PEAK: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", mz,
                    toLook.intensity, match.mz, match.intensity, Math.abs(match.mz - mz));
        } else {
            System.out.format("Peak %8.4f (%6.2f%%) - NO MTCH - CLOSEST: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", mz,
                    toLook.intensity, minDistMatch.mz, minDistMatch.intensity, Math.abs(minDistMatch.mz - mz));
        }
        return match;
    }

    private Double weightedMatch(MS2Collected A, MS2Collected B) {
        Double weightedSum = 0.0;

        // we should go through the peaks in descending order of intensity
        // so that get reported to the output in that order
        List<YZ> orderedBms2 = new ArrayList<>(B.ms2);
        Collections.sort(orderedBms2, new Comparator<YZ>() {
            public int compare(YZ a, YZ b) {
                return b.intensity.compareTo(a.intensity);
            }
        });

        // once a peak is matched, we should remove it from the available
        // set to be matched further
        List<YZ> toMatch = new ArrayList<>(A.ms2);

        for (YZ peak : orderedBms2) {
            YZ matchInA = getMatchingPeak(peak, toMatch);
            if (matchInA != null) {
                // this YZ peak in B has a match `matchInA` in A's MS2 peaks
                // if the aggregate peak across both spectra is high, we give it a
                // high score; by weighing it with the intensity percentage
                Double intensityPc = (peak.intensity + matchInA.intensity) / 2.0;
                // scale it back to [0,1] from [0,100]%
                Double intensity = intensityPc / 100;

                weightedSum += intensity;
                toMatch.remove(matchInA);
            }
        }

        return weightedSum;

    }

    private boolean doMatch(MS2Collected A, MS2Collected B) {
        Double weightedPeakMatch = weightedMatch(A, B);

        System.out.format("Weighted peak match: %.2f >= %.2f\n", weightedPeakMatch, THRESHOLD_WEIGHTED_PEAK);
        boolean isMatch = weightedPeakMatch >= THRESHOLD_WEIGHTED_PEAK;

        return isMatch;
    }

    private boolean doesMS2Match(Double mz, String Ams2mzML, String Ams2nc, String Ams1, String Bms2mzML,
            String Bms2nc, String Bms1, String outPrefix, String fmt) throws IOException {
        MS2Collected Ams2Peaks = null, Bms2Peaks = null;

        try {
            Ams2Peaks = normalizeAndThreshold(pickMS2ClosestToMS1Peak(mz, Ams2mzML, Ams2nc, Ams1));
            System.out.println("Standard: MS2 fragmentation trigged on " + mz);
        } catch (Exception e) {
            System.out.println("Standard: " + e.getMessage());
        }

        try {
            Bms2Peaks = normalizeAndThreshold(pickMS2ClosestToMS1Peak(mz, Bms2mzML, Bms2nc, Bms1));
            System.out.println("Sample: MS2 fragmentation trigged on " + mz);
        } catch (Exception e) {
            System.out.println("Sample: " + e.getMessage());
        }

        boolean isMatch = false;

        if (Ams2Peaks != null && Bms2Peaks != null) {
            isMatch = doMatch(Ams2Peaks, Bms2Peaks);

            // plotting can throw an io exception
            plot(Ams2Peaks, Bms2Peaks, mz, outPrefix, fmt);
        }

        return isMatch;
    }

    private void plot(MS2Collected Ams2Peaks, MS2Collected Bms2Peaks, Double mz, String outPrefix, String fmt)
            throws IOException {

        MS2Collected[] ms2Spectra = new MS2Collected[] { Ams2Peaks, Bms2Peaks };

        String outPDF = outPrefix + "." + fmt;
        String outDATA = outPrefix + ".data";

        // Write data output to outfile
        PrintStream out = new PrintStream(new FileOutputStream(outDATA));

        List<String> plotID = new ArrayList<>(ms2Spectra.length);
        for (MS2Collected yzSlice : ms2Spectra) {
            plotID.add(String.format("time: %.4f, volts: %.4f", yzSlice.triggerTime, yzSlice.voltage));
            // print out the spectra to outDATA
            for (YZ yz : yzSlice.ms2) {
                out.format("%.4f\t%.4f\n", yz.mz, yz.intensity);
                out.flush();
            }
            // delimit this dataset from the rest
            out.print("\n\n");
        }

        // close the .data
        out.close();

        // render outDATA to outPDF using gnuplot
        // 105.0 here means 105% for the y-range of a [0%:100%] plot. We want to leave some buffer space at
        // at the top, and hence we go a little outside of the 100% max range.
        new Gnuplotter().plot2DImpulsesWithLabels(outDATA, outPDF, plotID.toArray(new String[plotID.size()]),
                mz + 50.0, "mz", 105.0, "intensity (%)", fmt);
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 8 || !areNCFiles(new String[] { args[3], args[4] })
                || !areNCFiles(new String[] { args[6], args[7] })) {
            throw new RuntimeException("Needs: \n" + "(1) mz for main product, e.g., 431.1341983 (ononin) \n"
                    + "(2) prefix for .data and rendered .pdf \n"
                    + "(3) STD: mzML file from MS2 run (to extract trigger masses) \n"
                    + "(4) STD: NetCDF .nc file 02.nc from MSMS run \n"
                    + "(5) STD: NetCDF .nc file 01.nc from MS1 run \n"
                    + "(6) YST: mzML file from MS2 run (to extract trigger masses) \n"
                    + "(7) YST: NetCDF .nc file 02.nc from MSMS run \n"
                    + "(8) YST: NetCDF .nc file 01.nc from MS1 run");
        }

        String fmt = "pdf";
        Double mz = Double.parseDouble(args[0]);
        String outPrefix = args[1];
        String Ams2mzml = args[2];
        String Ams2nc = args[3];
        String Ams1 = args[4];
        String Bms2mzml = args[5];
        String Bms2nc = args[6];
        String Bms1 = args[7];

        MS2 c = new MS2();
        boolean areMatch = c.doesMS2Match(mz, Ams2mzml, Ams2nc, Ams1, Bms2mzml, Bms2nc, Bms1, outPrefix, fmt);
        System.out.format("MS2 spectra match: %s\n", (areMatch ? "YES" : "NO"));
    }
}