Java tutorial
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.lcms.db.analysis; import com.act.lcms.XZ; import com.act.lcms.db.model.LCMSWell; import com.act.lcms.db.model.StandardWell; import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.FileWriter; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; public class WaveformAnalysis { private static final int START_INDEX = 0; public static final int COMPRESSION_CONSTANT = 5; private static final Double DEFAULT_LOWEST_RMS_VALUE = 1.0; private static final Logger LOGGER = LogManager.getFormatterLogger(WaveformAnalysis.class); // We chose this value as a heuristic on how much time drift we are willing to accept for our analysis in seconds. public static final Double RESTRICTED_RETENTION_TIME_WINDOW_IN_SECONDS = 5.0; // This constant is applied to comparing the negative controls against the positive. private static final Double POSITION_TIME_WINDOW_IN_SECONDS = 1.0; //Delimiter used in CSV file private static final String COMMA_DELIMITER = ","; private static final String NEW_LINE_SEPARATOR = "\n"; /** * This function sums up over a series of intensity/time values. * @param list A list of intensity/time points. * @return A point of summed intensities with the time set to the start of the list. */ private static XZ sumIntensityAndTimeList(List<XZ> list) { // We use the first value's time as a period over which the intensities are summed over. This is a conscious // choice to standardize the summation analysis. The trade offs are that the final output will // not have an accurate time period and will always underestimate the actual time, but since // the time period over which it is the summed is small (< 1 second), the underestimation is within comfortable // bounds. Double time = list.get(START_INDEX).getTime(); Double intensitySum = 0.0; for (XZ point : list) { intensitySum += point.getIntensity(); } return new XZ(time, intensitySum); } /** * This function calculates the root mean squared of a collection of intensity/time graphs. It does this by finding * the root mean squared across every time period of the list of intensity/time graphs. * @param graphs A list of intensity/time graphs * @return A list of rms values. */ private static List<XZ> rmsOfIntensityTimeGraphs(List<List<XZ>> graphs) { // Since the input graphs could be of different lengths, we need to find the smallest list as the representative // size to do the analysis or else we will get null pointer exceptions. Doing this is OK from an analysis perspective // since size differences only manifest at the end of the LCMS readings (ie. timings are the same at the start but // get truncated at the end) and there are <10 point difference between the graphs based on inspection. // TODO: Alert the user if there are huge size differences between the graph size. int representativeSize = graphs.get(START_INDEX).size(); for (List<XZ> graph : graphs) { representativeSize = Math.min(representativeSize, graph.size()); } int representativeGraph = START_INDEX; for (int index = 0; index < graphs.size(); index++) { if (graphs.get(index).size() < representativeGraph) { representativeSize = graphs.get(index).size(); representativeGraph = index; } } List<XZ> rmsList = new ArrayList<>(representativeSize); for (int i = 0; i < representativeSize; i++) { // The representationTime is set to the time of the graph with the shortest length at the index i. Double representativeTime = graphs.get(representativeGraph).get(i).getTime(); Double intensitySquaredSum = 0.0; // RMS is sqrt(sum(X^2)/len) for (int j = 0; j < graphs.size(); j++) { List<XZ> chart = graphs.get(j); intensitySquaredSum += Math.pow(chart.get(i).getIntensity(), 2); } Double rms = Math.pow(intensitySquaredSum / graphs.size(), 0.5); // We make sure the RMS is atleast one as our floor for the number of ions that hit the detector, so that we // do not get an amplification of the SNR based on extremely low noise conditions which exist in some of the // runs. if (rms < DEFAULT_LOWEST_RMS_VALUE) { rms = DEFAULT_LOWEST_RMS_VALUE; } rmsList.add(i, new XZ(representativeTime, rms)); } return rmsList; } /** * The noise of a given spectrum using a biased std derivation * @param spectrum * @return The relative noise of a spectrum. */ public static Double noiseOfSpectrum(List<XZ> spectrum) { Double movingAverage = 0.0; Double averageMeanSquared = 0.0; Double sizeOfSpectrum = spectrum.size() * 1.0; for (XZ point : spectrum) { movingAverage += point.getIntensity() / sizeOfSpectrum; } for (XZ point : spectrum) { averageMeanSquared += Math.pow(point.getIntensity() - movingAverage, 2) / sizeOfSpectrum; } return Math.pow(averageMeanSquared, 0.5); } /** * This function returns the maximum noise among a map of ion to list of spectra * @param spectra A map of ion to spectrum * @return The maximum noise of the map */ public static Double maxNoiseOfSpectra(Map<String, List<XZ>> spectra) { Double maxNoise = Double.MIN_VALUE; for (Map.Entry<String, List<XZ>> ionToSpectrum : spectra.entrySet()) { maxNoise = Math.max(maxNoise, noiseOfSpectrum(ionToSpectrum.getValue())); } return maxNoise; } /** * This function compresses a given list of time series data based on a period compression value. * @param intensityAndTime A list of intensity/time data * @param compressionMagnitude This value is the magnitude by which the data is compressed in the time dimension. * @return A pair of a list of intensity/time data that is compressed and a mapping from time to max peak intensity in * that window. */ public static Pair<List<XZ>, Map<Double, Double>> compressIntensityAndTimeGraphsAndFindMaxIntensityInEveryTimeWindow( List<XZ> intensityAndTime, int compressionMagnitude) { ArrayList<XZ> compressedResult = new ArrayList<>(); Map<Double, Double> timeToIntensity = new HashMap<>(); if (intensityAndTime == null) { System.out.println("intensity time is null"); System.exit(1); } for (int i = 0; i < intensityAndTime.size() / compressionMagnitude; i++) { int startIndex = i * compressionMagnitude; int endIndex = startIndex + compressionMagnitude; List<XZ> subListSum = intensityAndTime.subList(startIndex, endIndex > intensityAndTime.size() ? intensityAndTime.size() : endIndex); Double maxIntensity = 0.0; for (XZ xz : subListSum) { maxIntensity = Math.max(maxIntensity, xz.getIntensity()); } // Make sure that the size of the sublist has atleast one element in it. if (subListSum.size() > 0) { compressedResult.add(sumIntensityAndTimeList(subListSum)); timeToIntensity.put(subListSum.get(START_INDEX).getTime(), maxIntensity); } } return Pair.of(compressedResult, timeToIntensity); } public static Map<String, Pair<XZ, Double>> performSNRAnalysisAndReturnMetlinIonsRankOrderedBySNRForWells( ChemicalToMapOfMetlinIonsToIntensityTimeValues ionToIntensityDataPos, List<ChemicalToMapOfMetlinIonsToIntensityTimeValues> ionToIntensityDataNegList, Set<Pair<String, Double>> searchMZs) { Map<String, Pair<XZ, Double>> result = new HashMap<>(); for (Pair<String, Double> mz : searchMZs) { String chemicalName = mz.getLeft(); // Compress the input intensity time graph to solve sparse data issues (multiple retention times where intensity // is zero). However, we make sure to preserve what the maximum intensity was in that time window in the function // called below. Pair<List<XZ>, Map<Double, Double>> positiveXZValuesAndMaxIntensity = compressIntensityAndTimeGraphsAndFindMaxIntensityInEveryTimeWindow( ionToIntensityDataPos.getMetlinIonsOfChemical( AnalysisHelper.constructChemicalAndScanTypeName(chemicalName, ScanData.KIND.POS_SAMPLE)) .get(chemicalName), COMPRESSION_CONSTANT); List<XZ> positiveIntensityTimeValues = positiveXZValuesAndMaxIntensity.getLeft(); Map<Double, Double> positiveTimeToMaxPeak = positiveXZValuesAndMaxIntensity.getRight(); // Next, we detect peaks within the compressed data. List<XZ> positiveIntensityTime = detectPeaksInIntensityTimeWaveform(positiveIntensityTimeValues, PEAK_DETECTION_THRESHOLD); // Get the compressed results for the negative control data. List<List<XZ>> negativeIntensityTimes = new ArrayList<>(); for (ChemicalToMapOfMetlinIonsToIntensityTimeValues neg : ionToIntensityDataNegList) { List<XZ> negativeIntensityTimeValues = compressIntensityAndTimeGraphsAndFindMaxIntensityInEveryTimeWindow( neg.getMetlinIonsOfChemical(AnalysisHelper.constructChemicalAndScanTypeName(chemicalName, ScanData.KIND.NEG_CONTROL)).get(chemicalName), COMPRESSION_CONSTANT).getLeft(); negativeIntensityTimes.add(negativeIntensityTimeValues); } // Get the RMS of the negative intensity times List<XZ> rmsOfNegativeValues = rmsOfIntensityTimeGraphs(negativeIntensityTimes); Double maxSNR = 0.0; Double maxTime = 0.0; Double peakIntensity = 0.0; // For each of the peaks detected in the positive control, find the spectral intensity values from the negative // controls and calculate SNR based on that. for (XZ positivePosition : positiveIntensityTime) { Double time = positivePosition.getTime(); XZ negativeControlPosition = null; for (XZ position : rmsOfNegativeValues) { if (position.getTime() > time - POSITION_TIME_WINDOW_IN_SECONDS && position.getTime() < time + POSITION_TIME_WINDOW_IN_SECONDS) { negativeControlPosition = position; break; } } Double snr; if (negativeControlPosition == null) { LOGGER.error( "There is no intensity value at this time range for the negative control, which is not expected"); snr = 0.0; } else { snr = Math.pow(positivePosition.getIntensity() / negativeControlPosition.getIntensity(), 2); } if (snr > maxSNR) { maxSNR = snr; maxTime = time; peakIntensity = positiveTimeToMaxPeak.get(positivePosition.getTime()); } } result.put(chemicalName, Pair.of(new XZ(maxTime, peakIntensity), maxSNR)); } return result; } /** * This function takes in a standard molecules's intensity vs time data and a collection of negative controls data * and plots the SNR value at each time period, assuming the time jitter effects are negligible (more info on this * is here: https://github.com/20n/act/issues/136). Based on the snr values, it rank orders the metlin ions of the * molecule. * @param ionToIntensityData A map of chemical to intensity/time data * @param standardChemical The chemical that is the standard of analysis * @return A sorted linked hash map of Metlin ion to (intensity, time) pairs from highest intensity to lowest */ public static LinkedHashMap<String, XZ> performSNRAnalysisAndReturnMetlinIonsRankOrderedBySNR( ChemicalToMapOfMetlinIonsToIntensityTimeValues ionToIntensityData, String standardChemical, Map<String, List<Double>> restrictedTimeWindows) { TreeMap<Double, List<String>> sortedIntensityToIon = new TreeMap<>(Collections.reverseOrder()); Map<String, XZ> ionToSNR = new HashMap<>(); for (String ion : ionToIntensityData.getMetlinIonsOfChemical(standardChemical).keySet()) { // We first compress the ion spectra by 5 seconds (this number was gotten from trial and error on labelled // spectra). Then, we do feature detection of peaks in the compressed data. List<XZ> standardIntensityTime = detectPeaksInIntensityTimeWaveform( compressIntensityAndTimeGraphsAndFindMaxIntensityInEveryTimeWindow( ionToIntensityData.getMetlinIonsOfChemical(standardChemical).get(ion), COMPRESSION_CONSTANT).getLeft(), PEAK_DETECTION_THRESHOLD); List<List<XZ>> negativeIntensityTimes = new ArrayList<>(); for (String chemical : ionToIntensityData.getIonList()) { if (!chemical.equals(standardChemical)) { negativeIntensityTimes.add(compressIntensityAndTimeGraphsAndFindMaxIntensityInEveryTimeWindow( ionToIntensityData.getMetlinIonsOfChemical(chemical).get(ion), COMPRESSION_CONSTANT) .getLeft()); } } List<XZ> rmsOfNegativeValues = rmsOfIntensityTimeGraphs(negativeIntensityTimes); List<Double> listOfTimeWindows = new ArrayList<>(); if (restrictedTimeWindows != null && restrictedTimeWindows.get(ion) != null) { listOfTimeWindows.addAll(restrictedTimeWindows.get(ion)); } Boolean canUpdateMaxSNRAndTime = true; Boolean useRestrictedTimeWindowAnalysis = false; // If there are restricted time windows, set the default to not update SNR until certain conditions are met. if (listOfTimeWindows.size() > 0) { useRestrictedTimeWindowAnalysis = true; canUpdateMaxSNRAndTime = false; } Double maxSNR = 0.0; Double maxTime = 0.0; // For each of the peaks detected in the positive control, find the spectral intensity values from the negative // controls and calculate SNR based on that. for (XZ positivePosition : standardIntensityTime) { Double time = positivePosition.getTime(); XZ negativeControlPosition = null; for (XZ position : rmsOfNegativeValues) { if (position.getTime() > time - POSITION_TIME_WINDOW_IN_SECONDS && position.getTime() < time + POSITION_TIME_WINDOW_IN_SECONDS) { negativeControlPosition = position; break; } } Double snr = Math.pow(positivePosition.getIntensity() / negativeControlPosition.getIntensity(), 2); // If the given time point overlaps with one of the restricted time windows, we can update the snr calculations. for (Double restrictedTimeWindow : listOfTimeWindows) { if ((time > restrictedTimeWindow - RESTRICTED_RETENTION_TIME_WINDOW_IN_SECONDS) && (time < restrictedTimeWindow + RESTRICTED_RETENTION_TIME_WINDOW_IN_SECONDS)) { canUpdateMaxSNRAndTime = true; break; } } if (canUpdateMaxSNRAndTime) { maxSNR = Math.max(maxSNR, snr); maxTime = Math.max(maxTime, time); } if (useRestrictedTimeWindowAnalysis) { canUpdateMaxSNRAndTime = false; } } ionToSNR.put(ion, new XZ(maxTime, maxSNR)); List<String> ionValues = sortedIntensityToIon.get(maxSNR); if (ionValues == null) { ionValues = new ArrayList<>(); sortedIntensityToIon.put(maxSNR, ionValues); } ionValues.add(ion); } LinkedHashMap<String, XZ> result = new LinkedHashMap<>(sortedIntensityToIon.size()); for (Map.Entry<Double, List<String>> entry : sortedIntensityToIon.entrySet()) { List<String> ions = entry.getValue(); for (String ion : ions) { result.put(ion, ionToSNR.get(ion)); } } return result; } public static void printIntensityTimeGraphInCSVFormat(List<XZ> values, String fileName) throws Exception { FileWriter chartWriter = new FileWriter(fileName); chartWriter.append("Intensity, Time"); chartWriter.append(NEW_LINE_SEPARATOR); for (XZ point : values) { chartWriter.append(point.getIntensity().toString()); chartWriter.append(COMMA_DELIMITER); chartWriter.append(point.getTime().toString()); chartWriter.append(NEW_LINE_SEPARATOR); } chartWriter.flush(); chartWriter.close(); } /** * This function checks if there are overlaps between two intensity and time charts (peak values) in the time domain. * The algorithm itself run O(n^2), but this is OK since the inputs are peak values, which on maximum are in the order * of 2 magnitudes (ie count < 100). * @param intensityAndTimeA A list of XZ values. * @param intensityAndTimeB A list of XZ values. * @param thresholdTime This parameter is used to isolate by how much time difference between the peaks is deemed * OK for a positive detection. * @return True if there is an overlap in peaks between the two charts. */ public static boolean doPeaksOverlap(List<XZ> intensityAndTimeA, List<XZ> intensityAndTimeB, Double thresholdTime) { for (XZ point : intensityAndTimeB) { Double time = point.getTime(); for (XZ referencePoint : intensityAndTimeA) { Double referenceTime = referencePoint.getTime(); if ((time > referenceTime - thresholdTime) && (time < referenceTime + thresholdTime)) { return true; } } } return false; } /** * This function is a modification of this peak detection algorithm described here: http://www.billauer.co.il/peakdet.html. * Instead of using the first derivative's change in sign to detect a peak (which has a lot more false positives), * the algorithm detects peaks by making sure that on the adjacent side of a potential peak, there are valleys. * @param intensityAndTimeValues A list of pairs of double of intensity and time. * @param threshold This threshold is used to detect peaks and valleys. * @return A sorted list of XZ values corresponding to the peaks in the input values in ascending * sorted order according to intensity. */ public static List<XZ> detectPeaksInIntensityTimeWaveform(List<XZ> intensityAndTimeValues, Double threshold) { Double minIntensity = Double.MAX_VALUE; Double maxIntensity = -Double.MAX_VALUE; Double maxTime = 0.0; Double delta = threshold; ArrayList<XZ> result = new ArrayList<>(); boolean expectingPeak = true; for (XZ val : intensityAndTimeValues) { Double intensity = val.getIntensity(); Double time = val.getTime(); if (intensity > maxIntensity) { maxIntensity = intensity; maxTime = time; } if (intensity < minIntensity) { minIntensity = intensity; } if (expectingPeak) { // Since the current intensity has dropped by a reasonable amount, the last recorded maxIntensity // was a peak. So record that data. if (intensity < maxIntensity - delta) { result.add(new XZ(maxTime, maxIntensity)); // The minIntensity is updated because all the previous minIntensity was on the left side of // the recently recorded peak, hence irrelevant. The current intensity is therefore the lowest // intensity value right after the peak. minIntensity = intensity; // Since we just added a new peak to the result set, a valley should follow. Therefore, do not // look for another peak now. expectingPeak = false; } } else { if (intensity > maxIntensity - delta) { // Reset maxIntensity and maxTime once valley has been found. maxIntensity = intensity; maxTime = time; // Since we just detected a valley, we now should expect a peak. expectingPeak = true; } } } // Sort in descending order of intensity Collections.sort(result, new Comparator<XZ>() { @Override public int compare(XZ o1, XZ o2) { return o2.getIntensity().compareTo(o1.getIntensity()); } }); return result; } // The peak detection value was selected after testing it among various intensity time values and choosing // a constant that did not let too many false positive peaks through but was selective enough to detect a // reasonable number of peaks for downstream processing. public static final Double PEAK_DETECTION_THRESHOLD = 250.0d; // We chose the 3 best peaks since after 3, since we almost never check for comparisons between the 4th best peak // in the standards chromatogram vs other results. private static final Integer NUMBER_OF_BEST_PEAKS_TO_SELECTED_FROM = 3; // This value indicates to how much error we can tolerate between peak intensity times (in seconds). private static final Integer TIME_SKEW_CORRECTION = 1; /** * This function picks the best retention time among the best peaks from the standard wells. The algorithm is * looking for the following heuristics for standard well peak detection: a) a great peak profile * b) magnitude of peak is high c) the well is not from MeOH media. It implements this by picking the global * 3 best peaks from ALL the standard wells which are not in MeOH media using a peak feature detector. It then * compares overlaps between these peaks against the local 3 best peaks of the negative controls and positive samples. * If there is an overlap, we have detected a positive signal. * @param standardWells The list of standard wells to benchmark from * @param representativeMetlinIon This is the metlin ion that is used for the analysis, usually it is the best * metlin ion picked up an algorithm among the standard well scans. * @param positiveAndNegativeWells These are positive and negative wells against which the retention times are * compared to see for overlaps. * @return A map of Scandata to XZ values for those signals where peaks match between the standard and pos/neg runs. */ public static Map<ScanData<LCMSWell>, XZ> pickBestRepresentativeRetentionTimeFromStandardWells( List<ScanData<StandardWell>> standardWells, String representativeMetlinIon, List<ScanData<LCMSWell>> positiveAndNegativeWells) { List<XZ> bestStandardPeaks = new ArrayList<>(); for (ScanData<StandardWell> well : standardWells) { if (well.getWell() != null) { // For retention times, select standard runs where the media is not MeOH since // MeOH has a lot more skew in retention time than other media. Moreover, none // of the feeding runs have their media as MeOH. if (well.getWell().getMedia() == null || !well.getWell().getMedia().equals("MeOH")) { bestStandardPeaks.addAll(detectPeaksInIntensityTimeWaveform( well.getMs1ScanResults().getIonsToSpectra().get(representativeMetlinIon), PEAK_DETECTION_THRESHOLD)); } } } // Sort in descending order of intensity Collections.sort(bestStandardPeaks, new Comparator<XZ>() { @Override public int compare(XZ o1, XZ o2) { return o2.getIntensity().compareTo(o1.getIntensity()); } }); Map<ScanData<LCMSWell>, XZ> result = new HashMap<>(); // Select from the top peaks in the standards run for (ScanData<LCMSWell> well : positiveAndNegativeWells) { List<XZ> topPeaksOfSample = detectPeaksInIntensityTimeWaveform( well.getMs1ScanResults().getIonsToSpectra().get(representativeMetlinIon), PEAK_DETECTION_THRESHOLD); for (XZ topPeak : bestStandardPeaks.subList(0, NUMBER_OF_BEST_PEAKS_TO_SELECTED_FROM - 1)) { int count = topPeaksOfSample.size() >= NUMBER_OF_BEST_PEAKS_TO_SELECTED_FROM ? NUMBER_OF_BEST_PEAKS_TO_SELECTED_FROM - 1 : topPeaksOfSample.size(); // Collisions do not matter here since we are just going to pick the highest intensity peak match, so ties // are arbitarily broker based on the order for access in the for loop below. TreeMap<Double, XZ> intensityToIntensityTimeValue = new TreeMap<>(Collections.reverseOrder()); for (int i = 0; i < count; i++) { if (topPeaksOfSample.get(i).getTime() > topPeak.getTime() - TIME_SKEW_CORRECTION && topPeaksOfSample.get(i).getTime() < topPeak.getTime() + TIME_SKEW_CORRECTION) { // There has been significant overlap in peaks between standard and sample. intensityToIntensityTimeValue.put(topPeaksOfSample.get(i).getIntensity(), topPeaksOfSample.get(i)); } } if (intensityToIntensityTimeValue.keySet().size() > 0) { // Get the best peak overlap based on the largest magnitude intensity result.put(well, intensityToIntensityTimeValue.firstEntry().getValue()); } } } return result; } }