Java tutorial
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.lcms; import com.act.lcms.db.io.LoadPlateCompositionIntoDB; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; public class MS2Simple { public static final String OPTION_OUTPUT_PREFIX = "o"; public static final String OPTION_TARGET_TRIGGER_MZ = "t"; public static final String OPTION_MZML_FILE = "x"; public static final String OPTION_NETCDF_FILE = "n"; public static final String OPTION_MS2_PEAK_SEARCH_MASSES = "m"; public static final String OPTION_PICK_TOP_N_MATCHES = "p"; public static final String HELP_MESSAGE = StringUtils.join(new String[] { "Plots MS2 scans whose trigger (isolation target) mass/charge matches some specified value. ", "Accepts an optional list of m/z values that will be used to filter MS2 scans by peak intensity. ", }, ""); public static final HelpFormatter HELP_FORMATTER = new HelpFormatter(); static { HELP_FORMATTER.setWidth(100); } public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() { { add(Option.builder(OPTION_OUTPUT_PREFIX).argName("output prefix") .desc("A prefix for the output data/pdf files").hasArg().required().longOpt("output-prefix")); add(Option.builder(OPTION_TARGET_TRIGGER_MZ).argName("trigger mass").desc( "The trigger (isolation) m/z value to use when extracting MS2 scans (e.g. 431.1341983 [ononin])") .hasArg().required().longOpt("trigger-mass")); add(Option.builder(OPTION_MZML_FILE).argName("mzML file") .desc("The mzML file to scan for trigger masses and collision voltages").hasArg().required() .longOpt("mzml-file")); add(Option.builder(OPTION_NETCDF_FILE).argName("NetCDF file") .desc("The MS2 NetCDF (*02.nc) file containing scan data to read/plot").hasArg().required() .longOpt("netcdf-file")); add(Option.builder(OPTION_MS2_PEAK_SEARCH_MASSES).argName("Peak search masses").desc( "(Optional) An ordered, comma separated list of m/z values to use when selecting MS2 scans to plot; " + "plots all by default") .hasArgs().valueSeparator(',').longOpt("ms2-search-mzs")); add(Option.builder(OPTION_PICK_TOP_N_MATCHES) // No short option for top-n. .argName("Top N matches") .desc("(Optional) Only plot the N best matches, ordered by numbered of matching peaks and time; " + "only applied if MS2 search m/z values are specified") .hasArg().longOpt("top-n")); // Everybody needs a little help from their friends. add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help")); } }; class YZ { Double mz; Double intensity; public YZ(Double mz, Double intensity) { this.mz = mz; this.intensity = intensity; } } class MS2Collected { Double triggerMass; Double triggerTime; Double voltage; List<YZ> ms2; public MS2Collected(Double triggerMass, Double trigTime, Double collisionEv, List<YZ> ms2) { this.triggerMass = triggerMass; this.triggerTime = trigTime; this.ms2 = ms2; this.voltage = collisionEv; } } // Rounding upto 2 decimal places should result in pretty // much an identical match on mz in the MS2 spectra final static Double MS2_MZ_COMPARE_TOLERANCE = 0.005; /* In the MS1 case, we look for a window that we think is within the * tolerance of the instrument. This should also be set on the * instrument when doing MS2 runs. This value attempts to balance * meaningful signal detection with noise elimination. */ final static Double MS1_MZ_TOLERANCE = 0.01; // Use the time window to only identify identical scans // hence the infinitely small window final static Double TIME_TOLERANCE = 0.1 / 1e3d; /* Only count peaks that have a non-zero intensity value in case the scan contains a matching (m/z, intensity) pair * with a near-zero intensity. */ final static Double THRESHOLD_SEARCH_MZ_MS2_PEAK_INTENSITY = 0.01; private List<LCMS2MZSelection> filterByTriggerMass(Iterator<LCMS2MZSelection> ms2Scans, Double targetMass) { Double mzLow = targetMass - MS1_MZ_TOLERANCE; Double mzHigh = targetMass + MS1_MZ_TOLERANCE; List<LCMS2MZSelection> relevantMS2Scans = new ArrayList<>(); while (ms2Scans.hasNext()) { LCMS2MZSelection scan = ms2Scans.next(); Double targetWindow = scan.getIsolationWindowTargetMZ(); if (targetWindow >= mzLow && targetWindow <= mzHigh) { relevantMS2Scans.add(scan); } } if (relevantMS2Scans.size() < 1) { String errmsg = String.format("SEVERE ERR: found no matching MS2 scans for MS1 target mass %f", targetMass); throw new RuntimeException(errmsg); } return relevantMS2Scans; } List<YZ> spectrumToYZList(LCMSSpectrum spectrum) { List<YZ> yzList = new ArrayList<>(spectrum.getIntensities().size()); for (Pair<Double, Double> p : spectrum.getIntensities()) { yzList.add(new YZ(p.getLeft(), p.getRight())); } return yzList; } /* This is a helper function to `findPeaksTriggeredByMZ`. It translates the selected trigger times * from the mzML files into scans extracted from the NetCDF files. Trigger times from mzML come * in as `minute`s that we convert to seconds, and then look for a scan in the NetCDF file that is * infinitely close (TIME_TOLERANCE) to that trigger time. */ List<MS2Collected> getSpectraForMatchingScans(List<LCMS2MZSelection> relevantMS2Selections, Iterator<LCMSSpectrum> ms2Spectra) { List<MS2Collected> ms2s = new ArrayList<>(); Iterator<LCMS2MZSelection> selectionIterator = relevantMS2Selections.iterator(); if (!selectionIterator.hasNext()) { // Previous checks should have prevented this. throw new RuntimeException("No scans available for spectrum matching"); } LCMS2MZSelection thisSelection = selectionIterator.next(); // TODO: handle other time units more gracefully. if (!"minute".equals(thisSelection.getTimeUnit())) { throw new RuntimeException( String.format("Expected 'minute' for MS2 scan selection time unit, but found '%s'", thisSelection.getTimeUnit())); } Double ms2Time = thisSelection.getTimeVal() * 60.0d; // mzML times tend to be in minutes; Double collisionEnergy = thisSelection.getCollisionEnergy(); // assumed in electronvols Double tLow = ms2Time - TIME_TOLERANCE; Double tHigh = ms2Time + TIME_TOLERANCE; while (ms2Spectra.hasNext()) { boolean advanceMS2Selection = false; LCMSSpectrum spectrum = ms2Spectra.next(); Double sTime = spectrum.getTimeVal(); if (sTime >= tLow && sTime <= tHigh) { // We found a matching scan! MS2Collected ms2 = new MS2Collected(thisSelection.getIsolationWindowTargetMZ(), ms2Time, collisionEnergy, this.spectrumToYZList(spectrum)); ms2s.add(ms2); advanceMS2Selection = true; } else if (sTime > ms2Time) { System.err.format( "ERROR: found spectrum at time %f when searching for MS2 scan at %f, skipping MS2 scan\n", sTime, ms2Time); advanceMS2Selection = true; } // Otherwise, this spectrum's time doesn't match the time point of the next relevant MS2 scan. Skip it! if (advanceMS2Selection) { if (!selectionIterator.hasNext()) { // No more relevant scans to search for. break; } thisSelection = selectionIterator.next(); ms2Time = thisSelection.getTimeVal() * 60.0d; // Assume time units are consistent across all mzML entries. tLow = ms2Time - TIME_TOLERANCE; tHigh = ms2Time + TIME_TOLERANCE; } } if (selectionIterator.hasNext()) { System.err.format( "ERROR: ran out of spectra to match against MS2 scans with some scans still unmatched.\n"); } return ms2s; } /* We are looking for scans triggered by a single `mz` value of interest. The entire MS2 run may * contain many trigger events (on different masses possibly--the instrument can be parametrized * to trigger on an arbitrary number of trigger masses). Therefore we need to filter out the ones * that triggered on our analysis mass here. This function does that. */ private List<MS2Collected> findPeaksTriggeredByMZ(Double mz, String ms2mzML, String ms2nc) throws Exception { // the first .nc is the ion trigger on the mz extracted List<LCMS2MZSelection> matchingScans = filterByTriggerMass(new LCMS2mzMLParser().getIterator(ms2mzML), mz); Iterator<LCMSSpectrum> spectrumIterator = new LCMSNetCDFParser().getIterator(ms2nc); return getSpectraForMatchingScans(matchingScans, spectrumIterator); } private YZ getMatchingPeak(Double searchMz, List<YZ> matchAgainst) { YZ match = null; YZ minDistMatch = null; for (YZ peak : matchAgainst) { Double dist = Math.abs(peak.mz - searchMz); // we look for a peak that is within MS2_MZ_COMPARE_TOLERANCE of mz if (dist <= MS2_MZ_COMPARE_TOLERANCE) { // this is a match, make sure it is the only match if (match != null) { System.out.format( "SEVERE: MS2_MZ_COMPARE_TOLERANCE might be too wide. MS2 peak %.4f has >1 matches.\n" + "\tMatch 1: %.4f\t Match 2: %.4f\n", searchMz, match.mz, peak.mz); } match = peak; } // bookkeeping for how close it got, in case no matches within precision if (minDistMatch == null || Math.abs(minDistMatch.mz - searchMz) > dist) { minDistMatch = peak; } } if (match != null) { System.out.format("Peak %8.4f - MATCHES - PEAK: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", searchMz, match.mz, match.intensity, Math.abs(match.mz - searchMz)); } else { System.out.format("Peak %8.4f - NO MTCH - CLOSEST: %8.4f (%6.2f%%) at DISTANCE: %.5f\n", searchMz, minDistMatch.mz, minDistMatch.intensity, Math.abs(minDistMatch.mz - searchMz)); } return match; } /* This is the gatekeeper function deciding whether the given scan matches the fragmentation pattern * we expect. The expected fragmentation pattern is specified as a list (ideally going from the most * prominent peak downwards) of mz values. Most likely that set of peaks is going to come from * reference MS2 patterns (e.g., METLIN or runs of the standards). */ private Integer searchForMatchingPeaks(List<Double> searchMzs, MS2Collected scan) { // once a peak is matched, we should remove it from the available Map<Double, Double> matchingPeakIntensities = new HashMap<>(); for (Double mz : searchMzs) { YZ matchInA = getMatchingPeak(mz, scan.ms2); if (matchInA != null) { // Compare the scan intensity with a tiny threshold to ignore ultra-low-intensity noise. if (matchInA.intensity > THRESHOLD_SEARCH_MZ_MS2_PEAK_INTENSITY) { // Got a match! matchingPeakIntensities.put(mz, matchInA.intensity); } // otherwise don't count it as a match. } } // TODO: // 1. do better result reporting // 2. consider matches based on an order-weighted score or something. // See https://github.com/20n/act/pull/155#issuecomment-183165867 // for data and discussion on improvements to matching. // Return the number of peaks that were found to match the search m/z values. return matchingPeakIntensities.size(); } private List<Pair<MS2Collected, Integer>> filterByMS2PeakMatch(List<Double> ms2SearchMZs, List<MS2Collected> scans, Integer pickTopN) { List<Pair<MS2Collected, Integer>> results = new ArrayList<>(); for (MS2Collected scan : scans) { Integer matchCount = searchForMatchingPeaks(ms2SearchMZs, scan); if (matchCount > 0) { results.add(Pair.of(scan, matchCount)); } } Collections.sort(results, new Comparator<Pair<MS2Collected, Integer>>() { @Override public int compare(Pair<MS2Collected, Integer> o1, Pair<MS2Collected, Integer> o2) { if (!o1.getRight().equals(o2.getRight())) { // Sort in descending order of match count first. return o2.getRight().compareTo(o1.getRight()); } // Fall back to sorting on trigger time (in ascending order) to enforce output stability/reproducability. return o1.getLeft().triggerTime.compareTo(o2.getLeft().triggerTime); } }); if (pickTopN != null && pickTopN > 0 && pickTopN < results.size()) { return results.subList(0, pickTopN); } return results; } /* This wrapper function performs the business steps of this class: * (1) finds scans triggered by mass we care about * (2) filters them to only those scans which have the MS2 peaks we expect (optional, only done if MS2 peaks provided) * (3) plots the ms2 scans that survive */ private void findAndPlotMatchingMS2Scans(Double mz, List<Double> ms2SearchMZs, Integer pickTopN, String ms2mzML, String ms2nc, String outPrefix, String fmt) throws IOException { List<MS2Collected> ms2Peaks = null; try { ms2Peaks = findPeaksTriggeredByMZ(mz, ms2mzML, ms2nc); } catch (Exception e) { System.err.format("Caught exception when finding triggered MS2 scans: %s\n", e.getMessage()); } List<Pair<MS2Collected, Integer>> peakCountPairs = null; if (ms2SearchMZs.size() > 0) { peakCountPairs = filterByMS2PeakMatch(ms2SearchMZs, ms2Peaks, pickTopN); } else if (ms2Peaks != null) { // Maps List of MS2 scans at trigger mass -> List of Pair(MS2 scans, null) when no ms2 Peaks are specified. peakCountPairs = ms2Peaks.stream().map(ms2 -> Pair.of(ms2, (Integer) null)) .collect(Collectors.toList()); } plot(peakCountPairs, mz, ms2SearchMZs, outPrefix, fmt); } private void plot(List<Pair<MS2Collected, Integer>> ms2Spectra, Double mz, List<Double> ms2SearchMZs, String outPrefix, String fmt) throws IOException { String outPDF = outPrefix + "." + fmt; String outDATA = outPrefix + ".data"; // Write data output to outfile PrintStream out = new PrintStream(new FileOutputStream(outDATA)); List<String> plotID = new ArrayList<>(ms2Spectra.size()); for (Pair<MS2Collected, Integer> pair : ms2Spectra) { MS2Collected yzSlice = pair.getLeft(); String caption; if (ms2SearchMZs != null && ms2SearchMZs.size() > 0) { caption = String.format("target: %.4f, time: %.4f, volts: %.4f, %d / %d matches", yzSlice.triggerMass, yzSlice.triggerTime, yzSlice.voltage, pair.getRight() == null ? 0 : pair.getRight(), ms2SearchMZs.size()); } else { caption = String.format("target: %.4f, time: %.4f, volts: %.4f", yzSlice.triggerMass, yzSlice.triggerTime, yzSlice.voltage); } plotID.add(caption); // Compute the total intensity on the fly so we can plot on a percentage scale. double maxIntensity = 0.0; for (YZ yz : yzSlice.ms2) { if (yz.intensity > maxIntensity) { maxIntensity = yz.intensity; } } // print out the spectra to outDATA for (YZ yz : yzSlice.ms2) { out.format("%.4f\t%.4f\n", yz.mz, 100.0 * yz.intensity / maxIntensity); out.flush(); } // delimit this dataset from the rest out.print("\n\n"); } // close the .data out.close(); // render outDATA to outPDF using gnuplot // 105.0 here means 105% for the y-range of a [0%:100%] plot. We want to leave some buffer space at // at the top, and hence we go a little outside of the 100% max range. /* We intend to plot the fragmentation pattern, and so do not expect to see fragments that are bigger than the * original selected molecule. We truncate the x-range to the specified m/z but since that will make values close * to the max hard to see we add a buffer and truncate the plot in the x-range to m/z + 50.0. */ new Gnuplotter().plot2DImpulsesWithLabels(outDATA, outPDF, plotID.toArray(new String[plotID.size()]), mz + 50.0, "mz", 105.0, "intensity (%)", fmt); } public static void main(String[] args) throws Exception { Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build()); } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { System.err.format("Argument parsing failed: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } String fmt = "pdf"; Double mz; try { mz = Double.parseDouble(cl.getOptionValue(OPTION_TARGET_TRIGGER_MZ)); } catch (NumberFormatException e) { System.err.format("Trigger mass must be a floating point number: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); return; // Silences compiler warnings for `mz`. } String outPrefix = cl.getOptionValue(OPTION_OUTPUT_PREFIX); String ms2mzml = cl.getOptionValue(OPTION_MZML_FILE); String ms2nc = cl.getOptionValue(OPTION_NETCDF_FILE); String[] ms2SearchMassStrings = cl.getOptionValues(OPTION_MS2_PEAK_SEARCH_MASSES); List<Double> ms2SearchMasses; if (ms2SearchMassStrings != null && ms2SearchMassStrings.length > 0) { ms2SearchMasses = new ArrayList<>(ms2SearchMassStrings.length); for (String m : ms2SearchMassStrings) { Double d; try { d = Double.parseDouble(m); } catch (NumberFormatException e) { System.err.format( "MS2 search mass must be a comma separated list of floating point numbers: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); return; // Silences compiler warnings for `d`. } ms2SearchMasses.add(d); } } else { ms2SearchMasses = new ArrayList<>(0); // Use an empty array rather than null for easier logic later. } Integer pickTopNMatches = null; if (cl.hasOption(OPTION_PICK_TOP_N_MATCHES)) { pickTopNMatches = Integer.valueOf(cl.getOptionValue(OPTION_PICK_TOP_N_MATCHES)); } MS2Simple c = new MS2Simple(); c.findAndPlotMatchingMS2Scans(mz, ms2SearchMasses, pickTopNMatches, ms2mzml, ms2nc, outPrefix, fmt); } }