com.act.lcms.MS2Simple.java Source code

Java tutorial

Introduction

Here is the source code for com.act.lcms.MS2Simple.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms;

import com.act.lcms.db.io.LoadPlateCompositionIntoDB;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class MS2Simple {

    public static final String OPTION_OUTPUT_PREFIX = "o";
    public static final String OPTION_TARGET_TRIGGER_MZ = "t";
    public static final String OPTION_MZML_FILE = "x";
    public static final String OPTION_NETCDF_FILE = "n";
    public static final String OPTION_MS2_PEAK_SEARCH_MASSES = "m";
    public static final String OPTION_PICK_TOP_N_MATCHES = "p";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "Plots MS2 scans whose trigger (isolation target) mass/charge matches some specified value. ",
            "Accepts an optional list of m/z values that will be used to filter MS2 scans by peak intensity. ", },
            "");
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
    static {
        HELP_FORMATTER.setWidth(100);
    }

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_OUTPUT_PREFIX).argName("output prefix")
                    .desc("A prefix for the output data/pdf files").hasArg().required().longOpt("output-prefix"));
            add(Option.builder(OPTION_TARGET_TRIGGER_MZ).argName("trigger mass").desc(
                    "The trigger (isolation) m/z value to use when extracting MS2 scans (e.g. 431.1341983 [ononin])")
                    .hasArg().required().longOpt("trigger-mass"));
            add(Option.builder(OPTION_MZML_FILE).argName("mzML file")
                    .desc("The mzML file to scan for trigger masses and collision voltages").hasArg().required()
                    .longOpt("mzml-file"));
            add(Option.builder(OPTION_NETCDF_FILE).argName("NetCDF file")
                    .desc("The MS2 NetCDF (*02.nc) file containing scan data to read/plot").hasArg().required()
                    .longOpt("netcdf-file"));
            add(Option.builder(OPTION_MS2_PEAK_SEARCH_MASSES).argName("Peak search masses").desc(
                    "(Optional) An ordered, comma separated list of m/z values to use when selecting MS2 scans to plot; "
                            + "plots all by default")
                    .hasArgs().valueSeparator(',').longOpt("ms2-search-mzs"));
            add(Option.builder(OPTION_PICK_TOP_N_MATCHES) // No short option for top-n.
                    .argName("Top N matches")
                    .desc("(Optional) Only plot the N best matches, ordered by numbered of matching peaks and time; "
                            + "only applied if MS2 search m/z values are specified")
                    .hasArg().longOpt("top-n"));

            // Everybody needs a little help from their friends.
            add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help"));
        }
    };

    class YZ {
        Double mz;
        Double intensity;

        public YZ(Double mz, Double intensity) {
            this.mz = mz;
            this.intensity = intensity;
        }
    }

    class MS2Collected {
        Double triggerMass;
        Double triggerTime;
        Double voltage;
        List<YZ> ms2;

        public MS2Collected(Double triggerMass, Double trigTime, Double collisionEv, List<YZ> ms2) {
            this.triggerMass = triggerMass;
            this.triggerTime = trigTime;
            this.ms2 = ms2;
            this.voltage = collisionEv;
        }
    }

    // Rounding upto 2 decimal places should result in pretty 
    // much an identical match on mz in the MS2 spectra
    final static Double MS2_MZ_COMPARE_TOLERANCE = 0.005;

    /* In the MS1 case, we look for a window that we think is within the
     * tolerance of the instrument.  This should also be set on the
     * instrument when doing MS2 runs.  This value attempts to balance
     * meaningful signal detection with noise elimination. */
    final static Double MS1_MZ_TOLERANCE = 0.01;

    // Use the time window to only identify identical scans
    // hence the infinitely small window
    final static Double TIME_TOLERANCE = 0.1 / 1e3d;

    /* Only count peaks that have a non-zero intensity value in case the scan contains a matching (m/z, intensity) pair
     * with a near-zero intensity. */
    final static Double THRESHOLD_SEARCH_MZ_MS2_PEAK_INTENSITY = 0.01;

    private List<LCMS2MZSelection> filterByTriggerMass(Iterator<LCMS2MZSelection> ms2Scans, Double targetMass) {
        Double mzLow = targetMass - MS1_MZ_TOLERANCE;
        Double mzHigh = targetMass + MS1_MZ_TOLERANCE;

        List<LCMS2MZSelection> relevantMS2Scans = new ArrayList<>();
        while (ms2Scans.hasNext()) {
            LCMS2MZSelection scan = ms2Scans.next();
            Double targetWindow = scan.getIsolationWindowTargetMZ();
            if (targetWindow >= mzLow && targetWindow <= mzHigh) {
                relevantMS2Scans.add(scan);
            }
        }

        if (relevantMS2Scans.size() < 1) {
            String errmsg = String.format("SEVERE ERR: found no matching MS2 scans for MS1 target mass %f",
                    targetMass);
            throw new RuntimeException(errmsg);
        }

        return relevantMS2Scans;
    }

    List<YZ> spectrumToYZList(LCMSSpectrum spectrum) {
        List<YZ> yzList = new ArrayList<>(spectrum.getIntensities().size());
        for (Pair<Double, Double> p : spectrum.getIntensities()) {
            yzList.add(new YZ(p.getLeft(), p.getRight()));
        }
        return yzList;
    }

    /* This is a helper function to `findPeaksTriggeredByMZ`. It translates the selected trigger times
     * from the mzML files into scans extracted from the NetCDF files. Trigger times from mzML come
     * in as `minute`s that we convert to seconds, and then look for a scan in the NetCDF file that is
     * infinitely close (TIME_TOLERANCE) to that trigger time. */
    List<MS2Collected> getSpectraForMatchingScans(List<LCMS2MZSelection> relevantMS2Selections,
            Iterator<LCMSSpectrum> ms2Spectra) {
        List<MS2Collected> ms2s = new ArrayList<>();

        Iterator<LCMS2MZSelection> selectionIterator = relevantMS2Selections.iterator();
        if (!selectionIterator.hasNext()) {
            // Previous checks should have prevented this.
            throw new RuntimeException("No scans available for spectrum matching");
        }
        LCMS2MZSelection thisSelection = selectionIterator.next();
        // TODO: handle other time units more gracefully.
        if (!"minute".equals(thisSelection.getTimeUnit())) {
            throw new RuntimeException(
                    String.format("Expected 'minute' for MS2 scan selection time unit, but found '%s'",
                            thisSelection.getTimeUnit()));
        }
        Double ms2Time = thisSelection.getTimeVal() * 60.0d; // mzML times tend to be in minutes;
        Double collisionEnergy = thisSelection.getCollisionEnergy(); // assumed in electronvols
        Double tLow = ms2Time - TIME_TOLERANCE;
        Double tHigh = ms2Time + TIME_TOLERANCE;

        while (ms2Spectra.hasNext()) {
            boolean advanceMS2Selection = false;

            LCMSSpectrum spectrum = ms2Spectra.next();
            Double sTime = spectrum.getTimeVal();
            if (sTime >= tLow && sTime <= tHigh) {
                // We found a matching scan!
                MS2Collected ms2 = new MS2Collected(thisSelection.getIsolationWindowTargetMZ(), ms2Time,
                        collisionEnergy, this.spectrumToYZList(spectrum));
                ms2s.add(ms2);
                advanceMS2Selection = true;
            } else if (sTime > ms2Time) {
                System.err.format(
                        "ERROR: found spectrum at time %f when searching for MS2 scan at %f, skipping MS2 scan\n",
                        sTime, ms2Time);
                advanceMS2Selection = true;
            } // Otherwise, this spectrum's time doesn't match the time point of the next relevant MS2 scan.  Skip it!

            if (advanceMS2Selection) {
                if (!selectionIterator.hasNext()) {
                    // No more relevant scans to search for.
                    break;
                }
                thisSelection = selectionIterator.next();
                ms2Time = thisSelection.getTimeVal() * 60.0d; // Assume time units are consistent across all mzML entries.
                tLow = ms2Time - TIME_TOLERANCE;
                tHigh = ms2Time + TIME_TOLERANCE;

            }
        }

        if (selectionIterator.hasNext()) {
            System.err.format(
                    "ERROR: ran out of spectra to match against MS2 scans with some scans still unmatched.\n");
        }

        return ms2s;
    }

    /* We are looking for scans triggered by a single `mz` value of interest. The entire MS2 run may
     * contain many trigger events (on different masses possibly--the instrument can be parametrized
     * to trigger on an arbitrary number of trigger masses). Therefore we need to filter out the ones
     * that triggered on our analysis mass here. This function does that. */
    private List<MS2Collected> findPeaksTriggeredByMZ(Double mz, String ms2mzML, String ms2nc) throws Exception {

        // the first .nc is the ion trigger on the mz extracted
        List<LCMS2MZSelection> matchingScans = filterByTriggerMass(new LCMS2mzMLParser().getIterator(ms2mzML), mz);

        Iterator<LCMSSpectrum> spectrumIterator = new LCMSNetCDFParser().getIterator(ms2nc);
        return getSpectraForMatchingScans(matchingScans, spectrumIterator);
    }

    private YZ getMatchingPeak(Double searchMz, List<YZ> matchAgainst) {
        YZ match = null;
        YZ minDistMatch = null;
        for (YZ peak : matchAgainst) {
            Double dist = Math.abs(peak.mz - searchMz);
            // we look for a peak that is within MS2_MZ_COMPARE_TOLERANCE of mz
            if (dist <= MS2_MZ_COMPARE_TOLERANCE) {

                // this is a match, make sure it is the only match
                if (match != null) {
                    System.out.format(
                            "SEVERE: MS2_MZ_COMPARE_TOLERANCE might be too wide. MS2 peak %.4f has >1 matches.\n"
                                    + "\tMatch 1: %.4f\t Match 2: %.4f\n",
                            searchMz, match.mz, peak.mz);
                }

                match = peak;
            }

            // bookkeeping for how close it got, in case no matches within precision
            if (minDistMatch == null || Math.abs(minDistMatch.mz - searchMz) > dist) {
                minDistMatch = peak;
            }
        }
        if (match != null) {
            System.out.format("Peak %8.4f - MATCHES -    PEAK: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", searchMz,
                    match.mz, match.intensity, Math.abs(match.mz - searchMz));
        } else {
            System.out.format("Peak %8.4f - NO MTCH - CLOSEST: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", searchMz,
                    minDistMatch.mz, minDistMatch.intensity, Math.abs(minDistMatch.mz - searchMz));
        }
        return match;
    }

    /* This is the gatekeeper function deciding whether the given scan matches the fragmentation pattern
     * we expect. The expected fragmentation pattern is specified as a list (ideally going from the most
     * prominent peak downwards) of mz values. Most likely that set of peaks is going to come from
     * reference MS2 patterns (e.g., METLIN or runs of the standards). */
    private Integer searchForMatchingPeaks(List<Double> searchMzs, MS2Collected scan) {
        // once a peak is matched, we should remove it from the available
        Map<Double, Double> matchingPeakIntensities = new HashMap<>();
        for (Double mz : searchMzs) {

            YZ matchInA = getMatchingPeak(mz, scan.ms2);
            if (matchInA != null) {
                // Compare the scan intensity with a tiny threshold to ignore ultra-low-intensity noise.
                if (matchInA.intensity > THRESHOLD_SEARCH_MZ_MS2_PEAK_INTENSITY) {
                    // Got a match!
                    matchingPeakIntensities.put(mz, matchInA.intensity);
                } // otherwise don't count it as a match.
            }
        }

        // TODO: 
        // 1. do better result reporting
        // 2. consider matches based on an order-weighted score or something.
        //    See https://github.com/20n/act/pull/155#issuecomment-183165867
        //    for data and discussion on improvements to matching.

        // Return the number of peaks that were found to match the search m/z values.
        return matchingPeakIntensities.size();
    }

    private List<Pair<MS2Collected, Integer>> filterByMS2PeakMatch(List<Double> ms2SearchMZs,
            List<MS2Collected> scans, Integer pickTopN) {
        List<Pair<MS2Collected, Integer>> results = new ArrayList<>();
        for (MS2Collected scan : scans) {
            Integer matchCount = searchForMatchingPeaks(ms2SearchMZs, scan);
            if (matchCount > 0) {
                results.add(Pair.of(scan, matchCount));
            }
        }

        Collections.sort(results, new Comparator<Pair<MS2Collected, Integer>>() {
            @Override
            public int compare(Pair<MS2Collected, Integer> o1, Pair<MS2Collected, Integer> o2) {
                if (!o1.getRight().equals(o2.getRight())) {
                    // Sort in descending order of match count first.
                    return o2.getRight().compareTo(o1.getRight());
                }

                // Fall back to sorting on trigger time (in ascending order) to enforce output stability/reproducability.
                return o1.getLeft().triggerTime.compareTo(o2.getLeft().triggerTime);
            }
        });

        if (pickTopN != null && pickTopN > 0 && pickTopN < results.size()) {
            return results.subList(0, pickTopN);
        }
        return results;
    }

    /* This wrapper function performs the business steps of this class:
     * (1) finds scans triggered by mass we care about
     * (2) filters them to only those scans which have the MS2 peaks we expect (optional, only done if MS2 peaks provided)
     * (3) plots the ms2 scans that survive */
    private void findAndPlotMatchingMS2Scans(Double mz, List<Double> ms2SearchMZs, Integer pickTopN, String ms2mzML,
            String ms2nc, String outPrefix, String fmt) throws IOException {
        List<MS2Collected> ms2Peaks = null;

        try {
            ms2Peaks = findPeaksTriggeredByMZ(mz, ms2mzML, ms2nc);
        } catch (Exception e) {
            System.err.format("Caught exception when finding triggered MS2 scans: %s\n", e.getMessage());
        }

        List<Pair<MS2Collected, Integer>> peakCountPairs = null;
        if (ms2SearchMZs.size() > 0) {
            peakCountPairs = filterByMS2PeakMatch(ms2SearchMZs, ms2Peaks, pickTopN);
        } else if (ms2Peaks != null) {
            // Maps List of MS2 scans at trigger mass -> List of Pair(MS2 scans, null) when no ms2 Peaks are specified.
            peakCountPairs = ms2Peaks.stream().map(ms2 -> Pair.of(ms2, (Integer) null))
                    .collect(Collectors.toList());
        }

        plot(peakCountPairs, mz, ms2SearchMZs, outPrefix, fmt);
    }

    private void plot(List<Pair<MS2Collected, Integer>> ms2Spectra, Double mz, List<Double> ms2SearchMZs,
            String outPrefix, String fmt) throws IOException {
        String outPDF = outPrefix + "." + fmt;
        String outDATA = outPrefix + ".data";

        // Write data output to outfile
        PrintStream out = new PrintStream(new FileOutputStream(outDATA));

        List<String> plotID = new ArrayList<>(ms2Spectra.size());
        for (Pair<MS2Collected, Integer> pair : ms2Spectra) {
            MS2Collected yzSlice = pair.getLeft();
            String caption;
            if (ms2SearchMZs != null && ms2SearchMZs.size() > 0) {
                caption = String.format("target: %.4f, time: %.4f, volts: %.4f, %d / %d matches",
                        yzSlice.triggerMass, yzSlice.triggerTime, yzSlice.voltage,
                        pair.getRight() == null ? 0 : pair.getRight(), ms2SearchMZs.size());
            } else {
                caption = String.format("target: %.4f, time: %.4f, volts: %.4f", yzSlice.triggerMass,
                        yzSlice.triggerTime, yzSlice.voltage);
            }
            plotID.add(caption);
            // Compute the total intensity on the fly so we can plot on a percentage scale.
            double maxIntensity = 0.0;
            for (YZ yz : yzSlice.ms2) {
                if (yz.intensity > maxIntensity) {
                    maxIntensity = yz.intensity;
                }
            }
            // print out the spectra to outDATA
            for (YZ yz : yzSlice.ms2) {
                out.format("%.4f\t%.4f\n", yz.mz, 100.0 * yz.intensity / maxIntensity);
                out.flush();
            }
            // delimit this dataset from the rest
            out.print("\n\n");
        }

        // close the .data
        out.close();

        // render outDATA to outPDF using gnuplot
        // 105.0 here means 105% for the y-range of a [0%:100%] plot. We want to leave some buffer space at
        // at the top, and hence we go a little outside of the 100% max range.
        /* We intend to plot the fragmentation pattern, and so do not expect to see fragments that are bigger than the
         * original selected molecule.  We truncate the x-range to the specified m/z but since that will make values close
         * to the max hard to see we add a buffer and truncate the plot in the x-range to m/z + 50.0. */
        new Gnuplotter().plot2DImpulsesWithLabels(outDATA, outPDF, plotID.toArray(new String[plotID.size()]),
                mz + 50.0, "mz", 105.0, "intensity (%)", fmt);
    }

    public static void main(String[] args) throws Exception {
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                    true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                    true);
            return;
        }

        String fmt = "pdf";
        Double mz;
        try {
            mz = Double.parseDouble(cl.getOptionValue(OPTION_TARGET_TRIGGER_MZ));
        } catch (NumberFormatException e) {
            System.err.format("Trigger mass must be a floating point number: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                    true);
            System.exit(1);
            return; // Silences compiler warnings for `mz`.
        }

        String outPrefix = cl.getOptionValue(OPTION_OUTPUT_PREFIX);
        String ms2mzml = cl.getOptionValue(OPTION_MZML_FILE);
        String ms2nc = cl.getOptionValue(OPTION_NETCDF_FILE);

        String[] ms2SearchMassStrings = cl.getOptionValues(OPTION_MS2_PEAK_SEARCH_MASSES);
        List<Double> ms2SearchMasses;
        if (ms2SearchMassStrings != null && ms2SearchMassStrings.length > 0) {
            ms2SearchMasses = new ArrayList<>(ms2SearchMassStrings.length);
            for (String m : ms2SearchMassStrings) {
                Double d;
                try {
                    d = Double.parseDouble(m);
                } catch (NumberFormatException e) {
                    System.err.format(
                            "MS2 search mass must be a comma separated list of floating point numbers: %s\n",
                            e.getMessage());
                    HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE,
                            opts, null, true);
                    System.exit(1);
                    return; // Silences compiler warnings for `d`.
                }
                ms2SearchMasses.add(d);
            }
        } else {
            ms2SearchMasses = new ArrayList<>(0); // Use an empty array rather than null for easier logic later.
        }

        Integer pickTopNMatches = null;
        if (cl.hasOption(OPTION_PICK_TOP_N_MATCHES)) {
            pickTopNMatches = Integer.valueOf(cl.getOptionValue(OPTION_PICK_TOP_N_MATCHES));
        }

        MS2Simple c = new MS2Simple();
        c.findAndPlotMatchingMS2Scans(mz, ms2SearchMasses, pickTopNMatches, ms2mzml, ms2nc, outPrefix, fmt);
    }
}