Java tutorial
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.lcms.db.analysis; import com.act.lcms.Gnuplotter; import com.act.lcms.LCMSNetCDFParser; import com.act.lcms.LCMSSpectrum; import com.act.lcms.MS1; import com.act.lcms.XZ; import com.act.lcms.db.io.DB; import com.act.lcms.db.model.ChemicalOfInterest; import com.act.lcms.db.model.LCMSWell; import com.act.lcms.db.model.MS1ScanForWellAndMassCharge; import com.act.lcms.db.model.Plate; import com.act.lcms.db.model.PlateWell; import com.act.lcms.db.model.ScanFile; import com.act.lcms.db.model.StandardIonResult; import com.act.lcms.db.model.StandardWell; import com.act.lcms.plotter.WriteAndPlotMS1Results; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.joda.time.LocalDateTime; import javax.xml.parsers.ParserConfigurationException; import javax.xml.stream.XMLStreamException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.sql.SQLException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; public class AnalysisHelper { // This constant is the best score when a metlin ion is provided manually private static final Double MANUAL_OVERRIDE_BEST_SCORE = 0.0d; private static final Integer REPRESENTATIVE_INDEX = 0; private static final Set<String> EMPTY_SET = Collections.unmodifiableSet(new HashSet<>(0)); private static final Logger LOGGER = LogManager.getFormatterLogger(AnalysisHelper.class); private static <A, B> Pair<List<A>, List<B>> split(List<Pair<A, B>> lpairs) { List<A> a = new ArrayList<>(); List<B> b = new ArrayList<>(); for (Pair<A, B> p : lpairs) { a.add(p.getLeft()); b.add(p.getRight()); } return Pair.of(a, b); } /** * Process a list of wells (LCMS or Standard), producing a list of scan objects that encapsulate the plate, * scan file, and masses for that well. * @param db The DB from which to extract plate data. * @param lcmsDir The directory where the LCMS scans live. * @param searchMZs A list of target M/Zs to search for in the scans (see API for {@link MS1}. * @param kind The role of this well in this analysis (standard, positive sample, negative control). * @param plateCache A hash of Plates already accessed from the DB. * @param samples A list of wells to process. * @param useSNRForPeakIdentification If true, signal-to-noise ratio will be used for peak identification. If not, * peaks will be identified by intensity. * @param <T> The PlateWell type whose scans to process. * @return A list of ScanData objects that wraps the objects required to produce a graph for each specified well. * @throws Exception */ public static <T extends PlateWell<T>> Pair<List<ScanData<T>>, Double> processScans(DB db, File lcmsDir, List<Pair<String, Double>> searchMZs, ScanData.KIND kind, HashMap<Integer, Plate> plateCache, List<T> samples, boolean useFineGrainedMZTolerance, Set<String> includeIons, Set<String> excludeIons, boolean useSNRForPeakIdentification) throws Exception { Double maxIntensity = 0.0d; List<ScanData<T>> allScans = new ArrayList<>(samples.size()); for (T well : samples) { // The foreign key constraint on wells ensure that plate will be non-null. Plate plate = plateCache.get(well.getPlateId()); if (plate == null) { plate = Plate.getPlateById(db, well.getPlateId()); plateCache.put(plate.getId(), plate); } LOGGER.info("Processing LCMS well %s %s", plate.getBarcode(), well.getCoordinatesString()); List<ScanFile> scanFiles = ScanFile.getScanFileByPlateIDRowAndColumn(db, well.getPlateId(), well.getPlateRow(), well.getPlateColumn()); if (scanFiles == null || scanFiles.size() == 0) { LOGGER.error("WARNING: No scan files available for %s %s", plate.getBarcode(), well.getCoordinatesString()); continue; } for (ScanFile sf : scanFiles) { if (sf.getFileType() != ScanFile.SCAN_FILE_TYPE.NC) { // TODO: Migrate sysem.err to LOGGER framework LOGGER.error("Skipping scan file with non-NetCDF format: %s", sf.getFilename()); continue; } File localScanFile = new File(lcmsDir, sf.getFilename()); if (!localScanFile.exists() && localScanFile.isFile()) { LOGGER.error("WARNING: could not find regular file at expected path: %s", localScanFile.getAbsolutePath()); continue; } MS1 mm = new MS1(useFineGrainedMZTolerance, useSNRForPeakIdentification); for (Pair<String, Double> searchMZ : searchMZs) { MS1.IonMode mode = MS1.IonMode.valueOf(sf.getMode().toString().toUpperCase()); Map<String, Double> allMasses = mm.getIonMasses(searchMZ.getRight(), mode); Map<String, Double> metlinMasses = Utils.filterMasses(allMasses, includeIons, excludeIons); MS1ScanForWellAndMassCharge ms1ScanResults; List<ChemicalOfInterest> chemicalsOfInterest = ChemicalOfInterest.getInstance() .getChemicalOfInterestByName(db, searchMZ.getLeft()); // Check if in the input chemical is valid if (chemicalsOfInterest == null || chemicalsOfInterest.size() == 0) { MS1 ms1 = new MS1(); ms1ScanResults = ms1.getMS1(metlinMasses, localScanFile.getAbsolutePath()); } else { MS1ScanForWellAndMassCharge ms1ScanResultsCache = new MS1ScanForWellAndMassCharge(); ms1ScanResults = ms1ScanResultsCache.getByPlateIdPlateRowPlateColUseSnrScanFileChemical(db, plate, well, true, sf, searchMZ.getLeft(), metlinMasses, localScanFile); } maxIntensity = Math.max(ms1ScanResults.getMaxYAxis(), maxIntensity); LOGGER.info("Max intensity for target %s (%f) in %s is %f", searchMZ.getLeft(), searchMZ.getRight(), sf.getFilename(), ms1ScanResults.getMaxYAxis()); // TODO: purge the MS1 spectra from ms1ScanResults if this ends up hogging too much memory. allScans.add(new ScanData<T>(kind, plate, well, sf, searchMZ.getLeft(), metlinMasses, ms1ScanResults)); } } } return Pair.of(allScans, maxIntensity); } /** * This function gets the intensity-time values for each mass charge in a scan file and packages that up into a mapping * * between the mass charge pair and ScanData. * @param db The db to query scan files from * @param lcmsDir The lcms dir where the lcms files are found * @param searchMZs The pair of chemical and mass charge pairs * @param kind The kind of plate the lcms was run over * @param plateCache The plate cache * @param scanFile The scan file being examined * @param well The well being analyzed * @param useFineGrainedMZTolerance boolean for MZ tolerance * @param useSNRForPeakIdentification If true, signal-to-noise ratio will be used for peak identification. If not, * peaks will be identified by intensity. * @param <T> The platewell abstraction * @return A mapping of mass charge to scandata * @throws Exception */ public static <T extends PlateWell<T>> Map<Pair<String, Double>, ScanData<T>> getIntensityTimeValuesForEachMassChargeInScanFile( DB db, File lcmsDir, Set<Pair<String, Double>> searchMZs, ScanData.KIND kind, HashMap<Integer, Plate> plateCache, ScanFile scanFile, T well, boolean useFineGrainedMZTolerance, boolean useSNRForPeakIdentification) throws ParserConfigurationException, IOException, XMLStreamException, SQLException { // The foreign key constraint on wells ensure that plate will be non-null. Plate plate = plateCache.get(well.getPlateId()); if (plate == null) { plate = Plate.getPlateById(db, well.getPlateId()); plateCache.put(plate.getId(), plate); } if (scanFile.getFileType() != ScanFile.SCAN_FILE_TYPE.NC) { LOGGER.error("Skipping scan file with non-NetCDF format: %s", scanFile.getFilename()); return null; } File localScanFile = new File(lcmsDir, scanFile.getFilename()); if (!localScanFile.exists() && localScanFile.isFile()) { LOGGER.error("WARNING: could not find regular file at expected path: %s", localScanFile.getAbsolutePath()); return null; } Map<Pair<String, Double>, ScanData<T>> result = new HashMap<>(); MS1 mm = new MS1(useFineGrainedMZTolerance, useSNRForPeakIdentification); Map<Pair<String, Double>, MS1ScanForWellAndMassCharge> massChargeToMS1Results = getMultipleMS1s(mm, searchMZs, localScanFile.getAbsolutePath()); for (Map.Entry<Pair<String, Double>, MS1ScanForWellAndMassCharge> entry : massChargeToMS1Results .entrySet()) { String chemicalName = entry.getKey().getLeft(); Double massCharge = entry.getKey().getRight(); MS1ScanForWellAndMassCharge ms1ScanForWellAndMassCharge = entry.getValue(); Map<String, Double> singletonMass = Collections.singletonMap(chemicalName, massCharge); result.put(entry.getKey(), new ScanData<T>(kind, plate, well, scanFile, chemicalName, singletonMass, ms1ScanForWellAndMassCharge)); } return result; } private static Map<Pair<String, Double>, MS1ScanForWellAndMassCharge> getMultipleMS1s(MS1 ms1, Set<Pair<String, Double>> metlinMasses, String ms1File) throws ParserConfigurationException, IOException, XMLStreamException { // In order for this to sit well with the data model we'll need to ensure the keys are all unique. Set<String> uniqueKeys = new HashSet<>(); metlinMasses.stream().map(Pair::getLeft).forEach(x -> { if (uniqueKeys.contains(x)) { throw new RuntimeException( String.format("Assumption violation: found duplicate metlin mass keys: %s", x)); } uniqueKeys.add(x); }); Iterator<LCMSSpectrum> ms1Iterator = new LCMSNetCDFParser().getIterator(ms1File); Map<Double, List<XZ>> scanLists = new HashMap<>(metlinMasses.size()); // Initialize reading buffers for all of the target masses. metlinMasses.forEach(x -> { if (!scanLists.containsKey(x.getRight())) { scanLists.put(x.getRight(), new ArrayList<>()); } }); // De-dupe by mass in case we have exact duplicates, sort for well-ordered extractions. List<Double> sortedMasses = new ArrayList<>(scanLists.keySet()); /* Note: this operation is O(n * m) where n is the number of (mass, intensity) readings from the scan * and m is the number of mass targets specified. We might be able to get this down to O(m log n), but * we'll save that for once we get this working at all. */ while (ms1Iterator.hasNext()) { LCMSSpectrum timepoint = ms1Iterator.next(); // get all (mz, intensity) at this timepoint List<Pair<Double, Double>> intensities = timepoint.getIntensities(); // for this timepoint, extract each of the ion masses from the METLIN set for (Double ionMz : sortedMasses) { // this time point is valid to look at if its max intensity is around // the mass we care about. So lets first get the max peak location double intensityForMz = ms1.extractMZ(ionMz, intensities); // the above is Pair(mz_extracted, intensity), where mz_extracted = mz // we now add the timepoint val and the intensity to the output XZ intensityAtThisTime = new XZ(timepoint.getTimeVal(), intensityForMz); scanLists.get(ionMz).add(intensityAtThisTime); } } Map<Pair<String, Double>, MS1ScanForWellAndMassCharge> finalResults = new HashMap<>(metlinMasses.size()); /* Note: we might be able to squeeze more performance out of this by computing the * stats once per trace and then storing them. But the time to compute will probably * be dwarfed by the time to extract the data (assuming deduplication was done ahead * of time), so we'll leave it as is for now. */ for (Pair<String, Double> pair : metlinMasses) { String label = pair.getLeft(); Double mz = pair.getRight(); MS1ScanForWellAndMassCharge result = new MS1ScanForWellAndMassCharge(); result.setMetlinIons(Collections.singletonList(label)); result.getIonsToSpectra().put(label, scanLists.get(mz)); ms1.computeAndStorePeakProfile(result, label); // DO NOT use isGoodPeak here. We want positive and negative results alike. // There's only one ion in this scan, so just use its max. Double maxIntensity = result.getMaxIntensityForIon(label); result.setMaxYAxis(maxIntensity); // How/why is this not IonsToMax? Just set it as such for this. result.setIndividualMaxIntensities(Collections.singletonMap(label, maxIntensity)); finalResults.put(pair, result); } return finalResults; } /** * Write the time/intensity data for a given scan to an output stream. * * Note that the signature of ScanData is intentionally weakened to allow us to conditionally handle LCMSWell or * StandardWell objects contained in scanData. * * @param fos The output stream to which to write the time/intensity data. * @param lcmsDir The directory where the LCMS scan data can be found. * @param maxIntensity The maximum intensity for all scans in the ultimate graph to be produced. * @param scanData The scan data whose values will be written. * @param ionsToWrite A set of ions to write; all available ions are written if this is null. * @return A list of graph labels for each LCMS file in the scan. * @throws Exception */ public static List<String> writeScanData(FileOutputStream fos, File lcmsDir, Double maxIntensity, ScanData scanData, boolean makeHeatmaps, boolean applyThreshold, Set<String> ionsToWrite) throws Exception { if (ScanData.KIND.BLANK == scanData.getKind()) { return Collections.singletonList(Gnuplotter.DRAW_SEPARATOR); } Plate plate = scanData.getPlate(); ScanFile sf = scanData.getScanFile(); Map<String, Double> metlinMasses = scanData.getMetlinMasses(); File localScanFile = new File(lcmsDir, sf.getFilename()); MS1ScanForWellAndMassCharge ms1ScanResults = scanData.getMs1ScanResults(); WriteAndPlotMS1Results plottingUtil = new WriteAndPlotMS1Results(); List<Pair<String, String>> ionsAndLabels = plottingUtil.writeMS1Values(ms1ScanResults.getIonsToSpectra(), maxIntensity, metlinMasses, fos, makeHeatmaps, applyThreshold, ionsToWrite); List<String> ionLabels = split(ionsAndLabels).getRight(); LOGGER.info("Scan for target %s has ion labels: %s", scanData.getTargetChemicalName(), StringUtils.join(ionLabels, ", ")); List<String> graphLabels = new ArrayList<>(ionLabels.size()); if (scanData.getWell() instanceof LCMSWell) { for (String label : ionLabels) { LCMSWell well = (LCMSWell) scanData.getWell(); String l = String.format("%s (%s fed %s) @ %s %s %s, %s %s", well.getComposition(), well.getMsid(), well.getChemical() == null || well.getChemical().isEmpty() ? "nothing" : well.getChemical(), plate.getBarcode(), well.getCoordinatesString(), sf.getMode().toString().toLowerCase(), scanData.getTargetChemicalName(), label); LOGGER.info("Adding graph w/ label %s", l); graphLabels.add(l); } } else if (scanData.getWell() instanceof StandardWell) { for (String label : ionLabels) { StandardWell well = (StandardWell) scanData.getWell(); String l = String.format("Standard %s @ %s %s %s, %s %s", well.getChemical() == null || well.getChemical().isEmpty() ? "nothing" : well.getChemical(), plate.getBarcode(), well.getCoordinatesString(), sf.getMode().toString().toLowerCase(), scanData.getTargetChemicalName(), label); LOGGER.info("Adding graph w/ label %s", l); graphLabels.add(l); } } else { throw new RuntimeException(String.format("Graph request for well type %s", scanData.getWell().getClass().getCanonicalName())); } LOGGER.info("Done processing file at %s", localScanFile.getAbsolutePath()); return graphLabels; } /** * This function picks the best scan file based on two critereon: a) The scan file has to be a positive scan file * b) The scan file has to be of the latest lcms run for the well. * @param db The db to query scan files from * @param well The well being used for the analysis * @param <T> The platewell type abstraction * @return The best ScanFile * @throws Exception */ public static <T extends PlateWell<T>> ScanFile pickBestScanFileForWell(DB db, T well) throws Exception { List<ScanFile> scanFiles = ScanFile.getScanFileByPlateIDRowAndColumn(db, well.getPlateId(), well.getPlateRow(), well.getPlateColumn()); // TODO: We only analyze positive scan files for now since we are not confident with the negative scan file results. // Since we perform multiple scans on the same well, we need to categorize the data based on date. ScanFile latestScanFiles = null; LocalDateTime latestDateTime = null; for (ScanFile scanFile : scanFiles) { if (!scanFile.isNegativeScanFile()) { LocalDateTime scanDate = scanFile.getDateFromScanFileTitle(); // Pick the newest scan files if (latestDateTime == null || scanDate.isAfter(latestDateTime)) { latestScanFiles = scanFile; latestDateTime = scanDate; } } } return latestScanFiles; } public static String constructChemicalAndScanTypeName(String name, ScanData.KIND kind) { return kind.equals(ScanData.KIND.POS_SAMPLE) ? name + "_Positive" : name + "_Negative"; } /** * This function constructs a ChemicalToMapOfMetlinIonsToIntensityTimeValues object from the scan data per mass charge * name and value. * @param massChargePairToScanDataResult A mapping for mass charge to scan data * @param kind The kind of well * @param <T> * @return A ChemicalToMapOfMetlinIonsToIntensityTimeValues object. */ public static <T extends PlateWell<T>> ChemicalToMapOfMetlinIonsToIntensityTimeValues constructChemicalToMapOfMetlinIonsToIntensityTimeValuesFromMassChargeData( Map<Pair<String, Double>, ScanData<T>> massChargePairToScanDataResult, ScanData.KIND kind) { ChemicalToMapOfMetlinIonsToIntensityTimeValues peakData = new ChemicalToMapOfMetlinIonsToIntensityTimeValues(); for (Map.Entry<Pair<String, Double>, ScanData<T>> entry : massChargePairToScanDataResult.entrySet()) { String chemicalName = entry.getKey().getLeft(); ScanData<T> scan = entry.getValue(); // get all the scan results for each metlin mass combination for a given compound. Map<String, List<XZ>> ms1s = scan.getMs1ScanResults().getIonsToSpectra(); String plotName = constructChemicalAndScanTypeName(chemicalName, kind); // Read intensity and time data for each metlin mass. We only expect one mass charge pair per ms1ScanResults // since we are extracting traces from the scan files via getMultipleMS1s. for (Map.Entry<String, List<XZ>> ms1ForIon : ms1s.entrySet()) { String ion = ms1ForIon.getKey(); List<XZ> ms1 = ms1ForIon.getValue(); peakData.addIonIntensityTimeValueToChemical(plotName, ion, ms1); } } return peakData; } /** * This function filters out negative scan data, then categorizes the remaining based on dates, followed by finding * a set of scan data with the lowest noise. Based on this filtered set of data, it constructs a * ChemicalToMapOfMetlinIonsToIntensityTimeValues object that is a mapping of chemical to metlin ion to intensity/time * values for each ion. * @param db * @param lcmsDir - The directory where the LCMS scan data can be found. * @param searchMZs - A list of target M/Zs to search for in the scans (see API for {@link MS1}. * @param kind - The role of this well in this analysis (standard, positive sample, negative control) * @param plateCache - A hash of Plates already accessed from the DB. * @param samples - A list of wells to process. * @param useSNRForPeakIdentification - If true, signal-to-noise ratio will be used for peak identification. If not, * peaks will be identified by intensity. * @param targetChemical - A string associated with the chemical name. * @return - A mapping of chemical to metlin ion to intensity/time values. * @throws Exception */ public static ChemicalToMapOfMetlinIonsToIntensityTimeValues readStandardWellScanData(DB db, File lcmsDir, List<Pair<String, Double>> searchMZs, ScanData.KIND kind, HashMap<Integer, Plate> plateCache, List<StandardWell> samples, boolean useFineGrainedMZTolerance, Set<String> includeIons, Set<String> excludeIons, boolean useSNRForPeakIdentification, String targetChemical) throws Exception { List<ScanData<StandardWell>> allScans = processScans(db, lcmsDir, searchMZs, kind, plateCache, samples, useFineGrainedMZTolerance, includeIons, excludeIons, useSNRForPeakIdentification).getLeft(); // If there are no scans found, the client should handle this situation. So we return null. if (allScans.size() == 0) { LOGGER.error("WARNING: No scans were found."); return null; } // TODO: We only analyze positive scan files for now since we are not confident with the negative scan file results. // Since we can perform multiple scans on the same well, we need to categorize the data based on date. Map<LocalDateTime, List<ScanData<StandardWell>>> filteredScansCategorizedByDate = new HashMap<>(); Map<LocalDateTime, List<ScanData<StandardWell>>> postFilteredScansCategorizedByDate = new HashMap<>(); for (ScanData<StandardWell> scan : allScans) { if (!scan.scanFile.isNegativeScanFile()) { LocalDateTime scanDate = scan.scanFile.getDateFromScanFileTitle(); List<ScanData<StandardWell>> scanDataForDate = filteredScansCategorizedByDate.get(scanDate); if (scanDataForDate == null) { scanDataForDate = new ArrayList<>(); } scanDataForDate.add(scan); filteredScansCategorizedByDate.put(scanDate, scanDataForDate); } } // Filter out date categories that do not contain the target chemical for (Map.Entry<LocalDateTime, List<ScanData<StandardWell>>> entry : filteredScansCategorizedByDate .entrySet()) { Boolean containsTargetChemical = false; for (ScanData<StandardWell> scanData : entry.getValue()) { if (scanData.getWell().getChemical().equals(targetChemical)) { containsTargetChemical = true; } } if (containsTargetChemical) { postFilteredScansCategorizedByDate.put(entry.getKey(), entry.getValue()); } } // Choose the date where the target chemical's scan file has the lowest noise across all ions. // TODO: Is there a better way of choosing between scanfiles categorized between dates? LocalDateTime bestDate = null; Double lowestNoise = Double.MAX_VALUE; for (Map.Entry<LocalDateTime, List<ScanData<StandardWell>>> entry : postFilteredScansCategorizedByDate .entrySet()) { for (ScanData<StandardWell> scanData : entry.getValue()) { if (scanData.getWell().getChemical().equals(targetChemical)) { if (WaveformAnalysis .maxNoiseOfSpectra(scanData.getMs1ScanResults().getIonsToSpectra()) < lowestNoise) { lowestNoise = WaveformAnalysis .maxNoiseOfSpectra(scanData.getMs1ScanResults().getIonsToSpectra()); bestDate = entry.getKey(); } } } } // At this point, we guarantee that each standard well chemical is run only once on a given day. List<ScanData<StandardWell>> representativeListOfScanFiles = postFilteredScansCategorizedByDate .get(bestDate); // We use this below container to hold the scandata of a particular chemical with the highest hash code among // all scandata of the given chemical. We do so that if we find two standard wells of the same chemical run on the // same day, we consistently pick the same well over multiple runs. Map<String, ScanData<StandardWell>> chemicalToHighestScanDataHashCode = new HashMap<>(); for (ScanData<StandardWell> scan : representativeListOfScanFiles) { ScanData<StandardWell> result = chemicalToHighestScanDataHashCode.get(scan.getWell().getChemical()); if (result == null) { result = scan; } else { if (scan.hashCode() > result.hashCode()) { result = scan; } } chemicalToHighestScanDataHashCode.put(scan.getWell().getChemical(), result); } ChemicalToMapOfMetlinIonsToIntensityTimeValues peakData = new ChemicalToMapOfMetlinIonsToIntensityTimeValues(); for (Map.Entry<String, ScanData<StandardWell>> chemicalToScanDataWithHighestHashCode : chemicalToHighestScanDataHashCode .entrySet()) { ScanData<StandardWell> scan = chemicalToScanDataWithHighestHashCode.getValue(); // get all the scan results for each metlin mass combination for a given compound. MS1ScanForWellAndMassCharge ms1ScanResults = scan.getMs1ScanResults(); Map<String, List<XZ>> ms1s = ms1ScanResults.getIonsToSpectra(); // read intensity and time data for each metlin mass for (Map.Entry<String, List<XZ>> ms1ForIon : ms1s.entrySet()) { String ion = ms1ForIon.getKey(); List<XZ> ms1 = ms1ForIon.getValue(); peakData.addIonIntensityTimeValueToChemical(scan.getWell().getChemical(), ion, ms1); } } return peakData; } /** * This function does a naive scoring algorithm where it just picks the first element of the sorted hashed map as * the best metlin ion. * @param sortedIonList - This is sorted map of ion to best intensity,time values. * @return The lowest score ion, which is the best prediction. */ public static String getBestMetlinIonFromPossibleMappings(LinkedHashMap<String, XZ> sortedIonList) { String result = ""; for (Map.Entry<String, XZ> metlinIonToData : sortedIonList.entrySet()) { // Get the first value from the input since it is already sorted. result = metlinIonToData.getKey(); break; } return result; } public static List<String> writeScanData(FileOutputStream fos, File lcmsDir, Double maxIntensity, ScanData scanData, boolean makeHeatmaps, boolean applyThreshold) throws Exception { return writeScanData(fos, lcmsDir, maxIntensity, scanData, makeHeatmaps, applyThreshold, null); } /** * This function scores the various metlin ions from different standard ion results, sorts them and picks the * best ion. This is done by adding up the indexed positions of the ion in each sorted entry of the list of * standard ion results. Since the entries in the standard ion results are sorted, the lower magnitude summation ions * are better than the larger magnitude summations. Then, we add another feature, in this case, the normalized SNR/maxSNR * but multiplying the positional score with the normalized SNR. The exact calculation is as follows: * score = positional_score * (1 - SNR(i)/maxSNR). We have to do the (1 - rel_snr) since we choose the lowest score, * so if the rel_snr is huge (ie a good signal), the overall magnitude of score will reduce, which makes that a better * ranking for the ion. We then do a post filtering on these scores based on if we have only positive/negative scans * from the scan files which exist in the context of the caller. * @param standardIonResults The list of standard ion results * @param curatedMetlinIons A map from standard ion result to the best curated ion that was manual inputted. * @param areOtherPositiveModeScansAvailable This boolean is used to post filter and pick a positive metlin ion if and * only if positive ion mode scans are available. * @param areOtherNegativeModeScansAvailable This boolean is used to post filter and pick a negative metlin ion if and * only if negative ion mode scans are available. * @return The best metlin ion or null if none can be found */ public static String scoreAndReturnBestMetlinIonFromStandardIonResults( List<StandardIonResult> standardIonResults, Map<StandardIonResult, String> curatedMetlinIons, boolean areOtherPositiveModeScansAvailable, boolean areOtherNegativeModeScansAvailable) { if (standardIonResults == null) { return null; } // We find the maximum SNR values for each standard ion result so that we can normalize individual SNR scores // during scoring. HashMap<StandardIonResult, Double> resultToMaxSNR = new HashMap<>(); for (StandardIonResult result : standardIonResults) { Double maxSNR = 0.0d; for (Map.Entry<String, XZ> resultoDoublePair : result.getAnalysisResults().entrySet()) { if (resultoDoublePair.getValue().getIntensity() > maxSNR) { maxSNR = resultoDoublePair.getValue().getIntensity(); } } resultToMaxSNR.put(result, maxSNR); } Map<String, Double> metlinScore = new HashMap<>(); Set<String> ions = standardIonResults.get(0).getAnalysisResults().keySet(); // For each ion, iterate through all the ion results to find the position of that ion in each result set (since the // ions are sorted) and then multiply that by a normalized value of the SNR. for (String ion : ions) { for (StandardIonResult result : standardIonResults) { Integer counter = 0; for (String localIon : result.getAnalysisResults().keySet()) { counter++; if (localIon.equals(ion)) { Double ionScore = metlinScore.get(ion); if (ionScore == null) { // Normalize the sample's SNR by dividing it by the maxSNR. Then we multiple a variant of it to the counter // score so that if the total magnitude of the score is lower, the ion is ranked higher. ionScore = (1.0 * counter) * (1 - (result.getAnalysisResults().get(ion).getIntensity() / resultToMaxSNR.get(result))); } else { ionScore += (1.0 * counter) * (1 - (result.getAnalysisResults().get(ion).getIntensity() / resultToMaxSNR.get(result))); } metlinScore.put(ion, ionScore); break; } } } } for (Map.Entry<StandardIonResult, String> resultToIon : curatedMetlinIons.entrySet()) { // Override all the scores of the manually curated standard ion result and set them to the highest rank. // Ideally, the user has been consistent for the best metlin ion across similar standard ion results, so // tie breakers will not happen. If a tie happen, it is broken arbitrarily. metlinScore.put(resultToIon.getValue(), MANUAL_OVERRIDE_BEST_SCORE); } TreeMap<Double, List<String>> sortedScores = new TreeMap<>(); for (String ion : metlinScore.keySet()) { if (MS1.getIonModeOfIon(ion) != null) { if ((MS1.getIonModeOfIon(ion).equals(MS1.IonMode.POS) && areOtherPositiveModeScansAvailable) || (MS1.getIonModeOfIon(ion).equals(MS1.IonMode.NEG) && areOtherNegativeModeScansAvailable)) { List<String> ionBucket = sortedScores.get(metlinScore.get(ion)); if (ionBucket == null) { ionBucket = new ArrayList<>(); } ionBucket.add(ion); sortedScores.put(metlinScore.get(ion), ionBucket); } } } if (sortedScores.size() == 0) { LOGGER.error( "Could not find any ions corresponding to the positive and negative scan mode conditionals"); return null; } else { List<String> topMetlinIons = sortedScores.get(sortedScores.keySet().iterator().next()); // In cases of a tie breaker, simply choose the first ion. return topMetlinIons.get(0); } } /** * This function takes a well as input, finds all the scan files associated with that well, then picks a representative * scan file, in this case, the first scan file which has the NC file format. It then extracts the ms1 scan results * corresponding to that scan file and packages it up into a ScanData container. * @param db - The db from which the data is extracteds * @param lcmsDir - The dir were scan files are present * @param well - The well based on which the scan file is founds * @param chemicalForMZValue - This is chemical from which the mz values that are needed from the ms1 analysis is extracted. * @param targetChemical - This is the target chemical for the analysis, ie find all chemicalForMZValue's mz variates * within targetChemical's ion profile. * @return ScanData - The resultant scan data. * @throws Exception */ public static ScanData<StandardWell> getScanDataForWell(DB db, File lcmsDir, StandardWell well, String chemicalForMZValue, String targetChemical) throws Exception { Plate plate = Plate.getPlateById(db, well.getPlateId()); List<ScanFile> scanFiles = ScanFile.getScanFileByPlateIDRowAndColumn(db, well.getPlateId(), well.getPlateRow(), well.getPlateColumn()); ScanFile representativeScanFile = null; for (ScanFile scanFile : scanFiles) { if (scanFile.getFileType() == ScanFile.SCAN_FILE_TYPE.NC) { representativeScanFile = scanFile; break; } } if (representativeScanFile == null) { throw new RuntimeException("None of the scan files are of the NC format"); } File localScanFile = new File(lcmsDir, representativeScanFile.getFilename()); if (!localScanFile.exists() && localScanFile.isFile()) { LOGGER.warn("Could not find regular file at expected path: %s", localScanFile.getAbsolutePath()); return null; } Pair<String, Double> mzValue = Utils.extractMassFromString(db, chemicalForMZValue); MS1 mm = new MS1(); // TODO: Unify these enums. MS1.IonMode mode = MS1.IonMode.valueOf(representativeScanFile.getMode().toString().toUpperCase()); Map<String, Double> allMasses = mm.getIonMasses(mzValue.getRight(), mode); Map<String, Double> metlinMasses = Utils.filterMasses(allMasses, EMPTY_SET, EMPTY_SET); MS1ScanForWellAndMassCharge ms1ScanResultsCache = new MS1ScanForWellAndMassCharge(); MS1ScanForWellAndMassCharge ms1ScanResultsForPositiveControl = ms1ScanResultsCache .getByPlateIdPlateRowPlateColUseSnrScanFileChemical(db, plate, well, true, representativeScanFile, targetChemical, metlinMasses, localScanFile); ScanData<StandardWell> encapsulatedDataForPositiveControl = new ScanData<StandardWell>( ScanData.KIND.STANDARD, plate, well, representativeScanFile, targetChemical, metlinMasses, ms1ScanResultsForPositiveControl); return encapsulatedDataForPositiveControl; } }