Java tutorial
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.lcms.db.io.report; /** * This class represents the results of an Ion analyis without provenance information or supporting data for negative * results. It should primarily be used to communicate positive LCMS findings with downstream modules. * * Example: * <pre> { "results" : [ { "_id" : 1, "mass_charge" : 331.13876999999997, "valid" : false, "molecules" : [ { "inchi" : "InChI=1S/C15H22O8/c1-20-7-11-12(17)13(18)14(19)15(23-11)22-6-8-3-4-9(16)10(5-8)21-2/h3-5,11-19H,6-7H2,1-2H3/t11-,12-,13+,14-,15-/m1/s1", "ion" : "M+H", "plot" : "331.13876999999997_37-1669-1670-_CHEM_6170.pdf", "snr" : 224.9610985335781, "time" : 208.54700088500977, "intensity" : 6954.61328125 }] }] } </pre> */ import com.act.biointerpretation.l2expansion.L2Prediction; import com.act.lcms.db.analysis.HitOrMissFilterAndTransformer; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import org.apache.commons.lang3.tuple.Pair; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; public class IonAnalysisInterchangeModel { // An LCMS result. // The idea of NO_DATA is to indicate if we query on a molecule with a mass on which no analysis was done on, to // distinguish this case from an actual calculated MISS. public enum LCMS_RESULT { HIT, MISS, NO_DATA } public enum METRIC { SNR, INTENSITY, TIME } @JsonProperty("results") private List<ResultForMZ> results; private Map<String, Boolean> inchiToIsHit; public IonAnalysisInterchangeModel() { results = new ArrayList<>(); inchiToIsHit = new HashMap<>(); } public void loadResultsFromFile(File inputFile) throws IOException { this.results = OBJECT_MAPPER.readValue(inputFile, IonAnalysisInterchangeModel.class).getResults(); this.populateInchiToIsHit(); } /** * Populates a map from all the inchis analyzed in the corpus to true if they are and LCMS hit, or false if not. * An inchi is considered a hit if any considered ion of that inchi has a MZ value that is a hit. */ private void populateInchiToIsHit() { this.inchiToIsHit = new HashMap<>(); for (ResultForMZ resultForMZ : results) { Boolean isHit = resultForMZ.isValid; for (HitOrMiss molecule : resultForMZ.getMolecules()) { // If the inchi is already a hit, then we do not want to override // its hit entry with a possible miss on a different adduct ion for // the same molecule. We check multiple metlin ions for each // molecules and some may show and others not. In an ideal world // with high concentrations "most" adduct ions would show and we // could take an AND (see https://github.com/20n/act/issues/383 and // https://github.com/20n/act/issues/370#issuecomment-240289674) // but when low concentrations exist we would rather take an OR and // be conservative, avoiding false negatives. if (this.inchiToIsHit.get(molecule.getInchi()) == null || !this.inchiToIsHit.get(molecule.getInchi())) { this.inchiToIsHit.put(molecule.getInchi(), isHit); } } } } public void writeToJsonFile(File outputFile) throws IOException { try (BufferedWriter predictionWriter = new BufferedWriter(new FileWriter(outputFile))) { OBJECT_MAPPER.writeValue(predictionWriter, this); } } /** * This function is used to compute log frequency distribution of the ion model vs a metric. * @param metric The metric on which the frequency distribution is plotted * @return A map of a range to the count of molecules that get bucketed in that range */ public Map<Pair<Double, Double>, Integer> computeLogFrequencyDistributionOfMoleculeCountToMetric( METRIC metric) { Map<Pair<Double, Double>, Integer> rangeToHitCount = new HashMap<>(); // This variable represents the total number of statistics that have zero values. Integer countOfZeroStats = 0; // This statistic represents the log value of the min statistic. Double minLogValue = Double.MAX_VALUE; for (ResultForMZ resultForMZ : this.getResults()) { for (HitOrMiss molecule : resultForMZ.getMolecules()) { Double power = 0.0; switch (metric) { case TIME: power = Math.log10(molecule.getTime()); break; case INTENSITY: power = Math.log10(molecule.getIntensity()); break; case SNR: power = Math.log10(molecule.getSnr()); break; } if (power.equals(Double.NEGATIVE_INFINITY)) { // We know the statistic was 0 here. countOfZeroStats++; break; } Double floor = Math.floor(power); Double lowerBound = Math.pow(10.0, floor); Double upperBound = Math.pow(10.0, floor + 1); minLogValue = Math.min(minLogValue, lowerBound); Pair<Double, Double> key = Pair.of(lowerBound, upperBound); rangeToHitCount.compute(key, (k, v) -> (v == null) ? 1 : v + 1); } // We count the total number of zero statistics and put them in the 0 to minLog metric bucket. if (countOfZeroStats > 0) { Pair<Double, Double> key = Pair.of(0.0, minLogValue); rangeToHitCount.put(key, countOfZeroStats); } } return rangeToHitCount; } /** * Returns HIT or MISS if the inchi is in the precalculated inchi->hit map, or NO_DATA if the inchi is not. * This will let us know if there has been any change in the inchi's form since the initial calculation, instead of * just silently returning a miss. * * @param inchi The inchi of the molecule. * @return The LCMS result. */ public LCMS_RESULT isMoleculeAHit(String inchi) { if (this.inchiToIsHit.get(inchi) == null) { return LCMS_RESULT.NO_DATA; } return this.inchiToIsHit.get(inchi) ? LCMS_RESULT.HIT : LCMS_RESULT.MISS; } /** * This function results all the inchis from the model. * @return A set of inchis */ @JsonIgnore public Set<String> getAllInchis() { Set<String> result = new HashSet<>(); for (ResultForMZ resultForMZ : this.getResults()) { result.addAll(resultForMZ.getMolecules().stream().map(molecule -> molecule.getInchi()) .collect(Collectors.toList())); } return result; } /** * Calculate whether a given prediction is an LCMS hit or not. * * TODO: think through our general approach to multiple substrate reactions when necessary. * We'll need to balance the possibilities of false positives and false negatives- one idea would be to return * a score based on the number of confirmed products of the reaction. * * @param prediction The prediction from the corpus. * @return True if all products are LCMS hits. */ public LCMS_RESULT getLcmsDataForPrediction(L2Prediction prediction) { List<String> productInchis = prediction.getProductInchis(); for (String product : productInchis) { // If any of the results have no data, return NO_DATA. Such results shouldn't happen for now, so the caller will // likely throw an exception if this happens. if (this.isMoleculeAHit(product).equals(IonAnalysisInterchangeModel.LCMS_RESULT.NO_DATA)) { return LCMS_RESULT.NO_DATA; } // Otherwise, if a miss is found among the prediction's products, return it as a miss. This implements an // AND among the products of the prediction- all must be present to register as a hit. This is motivated by the // fact that our only current multiple-product reaction produces one significant product, and one constant // cofactor. We verified that in both urine and saliva, the cofactor is present in our samples, so // an OR approach here would return a HIT for every prediction of that RO. if (this.isMoleculeAHit(product).equals(IonAnalysisInterchangeModel.LCMS_RESULT.MISS)) { return LCMS_RESULT.MISS; } } // If every prediction is a HIT, return HIT. return LCMS_RESULT.HIT; } /** * This function takes in multiple LCMS mining results (in the IonAnalysisInterchangeModel format), which happens * when we have multiple positive control replicates, extracts all the molecule hits from each file and applies * a filter function across the replicate hits. * @param replicateModels The list of IonAnalysisInterchangeModels to be analyzed * @param hitOrMissFilterAndTransformer This filter function takes in single/multiple HitOrMiss objects from replicates and * performs a transformation operation on them to produce one HitOrMiss object * and a boolean to keep the transformed molecule in the resulting model. * @return A resultant model that contains a list of inchis that are valid molecule hits in the input file and * pass all the thresholds. * @throws IOException */ public static IonAnalysisInterchangeModel filterAndOperateOnMoleculesFromMultipleReplicateModels( List<IonAnalysisInterchangeModel> replicateModels, HitOrMissFilterAndTransformer hitOrMissFilterAndTransformer) throws IOException { List<ResultForMZ> resultsForMZs = new ArrayList<>(); /** * Each element in deserializedResultsForPositiveReplicates now contains a list of mass charges to a list of * molecule+ion combinations for each mass charge. We consider a molecule "valid", ie a hit, if the mass charge * it is under for every element in deserializedResultsForPositiveReplicates is above the thresholds we have set. */ // Iterate through every mass charge // TODO: Consider using a parallel stream here for (int i = 0; i < replicateModels.get(0).getResults().size(); i++) { ResultForMZ representativeMZ = replicateModels.get(0).getResults().get(i); Double representativeMassCharge = representativeMZ.getMz(); int totalNumberOfMoleculesInMassChargeResult = representativeMZ.getMolecules().size(); ResultForMZ resultForMZ = new ResultForMZ(representativeMassCharge); resultForMZ.setId(representativeMZ.getId()); // TODO: Take out the isValid field since it does not convey useful information for such post processing files. resultForMZ.setIsValid(representativeMZ.getIsValid()); // For each mass charge, iterate through each molecule under the mass charge for (int j = 0; j < totalNumberOfMoleculesInMassChargeResult; j++) { Pair<HitOrMiss, Boolean> transformedAndIsRetainedMolecule; List<HitOrMiss> moleculesFromReplicates = new ArrayList<>(); // For each molecule, make sure it passes the threshold we set across every elem in deserializedResultsForPositiveReplicates, // ie across each positive replicate + neg control experiment results for (int k = 0; k < replicateModels.size(); k++) { ResultForMZ sampleRepresentativeMz = replicateModels.get(k).getResults().get(i); // Since we are comparing across replicate files, we expect each ResultForMZ element in the each replicate's // IonAnalysisInterchangeModel to be in the same order as other replicates. We check if the mass charges are the // same across the samples to make sure the replicates aligned correctly. if (!sampleRepresentativeMz.getMz().equals(representativeMassCharge)) { throw new RuntimeException(String.format( "The replicates are not ordered correctly since %f and %f are not " + "the equal.", sampleRepresentativeMz.getMz(), representativeMassCharge)); } HitOrMiss molecule = sampleRepresentativeMz.getMolecules().get(j); moleculesFromReplicates.add(molecule); } transformedAndIsRetainedMolecule = hitOrMissFilterAndTransformer.apply(moleculesFromReplicates); // Check if the filter function wants to throw out the molecule. If not, then add the molecule to the final result. if (transformedAndIsRetainedMolecule.getRight()) { resultForMZ.addMolecule(transformedAndIsRetainedMolecule.getLeft()); } } resultsForMZs.add(resultForMZ); } IonAnalysisInterchangeModel resultModel = new IonAnalysisInterchangeModel(); resultModel.setResults(resultsForMZs); return resultModel; } /** * This function takes in a single LCMS result (in the IonAnalysisInterchangeModel format) and extracts all the * molecule hits from the file and applies a filter function on the model. * @param replicateModel The IonAnalysisInterchangeModel to be analyzed * @param hitOrMissFilterAndTransformer This filter function takes in single/multiple HitOrMiss objects from replicates and * performs a transformation operation on them to produce one HitOrMiss object * and a boolean to keep the transformed molecule in the resulting model. * @return A resultant model that contains a list of inchis that are valid molecule hits in the input file and * pass all the thresholds. * @throws IOException */ public static IonAnalysisInterchangeModel filterAndOperateOnMoleculesFromModel( IonAnalysisInterchangeModel replicateModel, HitOrMissFilterAndTransformer hitOrMissFilterAndTransformer) throws IOException { List<ResultForMZ> resultsForMZs = new ArrayList<>(); // Iterate through every mass charge // TODO: Consider using a parallel stream here for (int i = 0; i < replicateModel.getResults().size(); i++) { ResultForMZ representativeMZ = replicateModel.getResults().get(i); Double representativeMassCharge = representativeMZ.getMz(); int totalNumberOfMoleculesInMassChargeResult = representativeMZ.getMolecules().size(); ResultForMZ resultForMZ = new ResultForMZ(representativeMassCharge); resultForMZ.setId(representativeMZ.getId()); // TODO: Take out the isValid field since it does not convey useful information for such post processing files. resultForMZ.setIsValid(representativeMZ.getIsValid()); // For each mass charge, iterate through each molecule under the mass charge for (int j = 0; j < totalNumberOfMoleculesInMassChargeResult; j++) { Pair<HitOrMiss, Boolean> transformedAndIsRetainedMolecule; List<HitOrMiss> molecules = replicateModel.getResults().get(i).getMolecules(); HitOrMiss molecule = molecules.get(j); transformedAndIsRetainedMolecule = hitOrMissFilterAndTransformer.apply(molecule); // Check if the filter function wants to throw out the molecule. If not, then add the molecule to the final result. if (transformedAndIsRetainedMolecule.getRight()) { resultForMZ.addMolecule(transformedAndIsRetainedMolecule.getLeft()); } } resultsForMZs.add(resultForMZ); } IonAnalysisInterchangeModel resultModel = new IonAnalysisInterchangeModel(); resultModel.setResults(resultsForMZs); return resultModel; } /** * This function loads in a serialized IonAnalysisInterchangeModel and deserializes it. * @param filepath File path to the serialized IonAnalysisInterchangeModels * @return An IonAnalysisInterchangeModel corresponding to the file. * @throws IOException */ public static IonAnalysisInterchangeModel loadIonAnalysisInterchangeModelFromFile(String filepath) throws IOException { IonAnalysisInterchangeModel model = new IonAnalysisInterchangeModel(); model.loadResultsFromFile(new File(filepath)); return model; } /** * This function is used to get the superset inchis from various models representing the different lcms ion runs for * a given chemical on a single replicate * @param models IonAnalysisInterchangeModels for each ionic variant * @param snrThreshold The snr threshold * @param intensityThreshold The intensity threshold * @param timeThreshold The time threshold * @return The superset of all inchis in each ionic variant file. * @throws IOException */ public static IonAnalysisInterchangeModel getSupersetOfIonicVariants(List<IonAnalysisInterchangeModel> models, Double snrThreshold, Double intensityThreshold, Double timeThreshold) throws IOException { IonAnalysisInterchangeModel resultModel = new IonAnalysisInterchangeModel(); List<ResultForMZ> resultForMZList = new ArrayList<>(); for (IonAnalysisInterchangeModel analysisInterchangeModel : models) { for (ResultForMZ resultForMZ : analysisInterchangeModel.getResults()) { List<HitOrMiss> allMoleculesThatPass = new ArrayList<>(); for (HitOrMiss molecule : resultForMZ.getMolecules()) { if (molecule.getIntensity() > intensityThreshold && molecule.getSnr() > snrThreshold && molecule.getTime() > timeThreshold) { allMoleculesThatPass.add(molecule); } } if (allMoleculesThatPass.size() > 0) { ResultForMZ newResultForMZ = new ResultForMZ(resultForMZ.getMz()); newResultForMZ.addMolecules(allMoleculesThatPass); resultForMZList.add(newResultForMZ); } } } resultModel.setResults(resultForMZList); return resultModel; } /** * This function is used for getting all inchis that are hits in the corpus * @param snrThreshold The snr threshold * @param intensityThreshold The intensity threshold * @param timeThreshold The time threshold * @return A set of inchis */ public Set<String> getAllMoleculeHits(Double snrThreshold, Double intensityThreshold, Double timeThreshold) { Set<String> resultSet = new HashSet<>(); for (ResultForMZ resultForMZ : results) { for (HitOrMiss hitOrMiss : resultForMZ.getMolecules()) { if (hitOrMiss.getIntensity() > intensityThreshold && hitOrMiss.getSnr() > snrThreshold && hitOrMiss.getTime() > timeThreshold) { resultSet.add(hitOrMiss.getInchi()); } } } return resultSet; } public IonAnalysisInterchangeModel(List<ResultForMZ> results) { this.results = results; this.populateInchiToIsHit(); } public List<ResultForMZ> getResults() { return results; } protected void setResults(List<ResultForMZ> results) { this.results = results; } private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); static { OBJECT_MAPPER.enable(SerializationFeature.INDENT_OUTPUT); } public static class ResultForMZ { private static final AtomicLong ID_COUNTER = new AtomicLong(0); @JsonProperty("_id") private Long id; @JsonProperty("mass_charge") private Double mz; @JsonProperty("valid") private Boolean isValid; @JsonProperty("molecules") private List<HitOrMiss> molecules; // For deserialization. protected ResultForMZ() { } protected ResultForMZ(Long id, Double mz, List<HitOrMiss> molecules, Boolean hit) { this.id = id; this.mz = mz; this.molecules = molecules; this.isValid = hit; } public ResultForMZ(Double mz, List<HitOrMiss> molecules, Boolean hit) { this.id = ID_COUNTER.incrementAndGet(); this.mz = mz; this.molecules = molecules; this.isValid = hit; } public ResultForMZ(Double mz) { this.id = ID_COUNTER.incrementAndGet(); this.mz = mz; this.molecules = new ArrayList<>(); this.isValid = false; } public ResultForMZ(Double mz, Boolean hit) { this.id = ID_COUNTER.incrementAndGet(); this.mz = mz; this.isValid = hit; this.molecules = new ArrayList<>(); } public Long getId() { return id; } protected void setId(Long id) { this.id = id; } public Double getMz() { return mz; } protected void setMz(Double mz) { this.mz = mz; } public List<HitOrMiss> getMolecules() { return molecules; } protected void setMolecules(List<HitOrMiss> hits) { this.molecules = new ArrayList<>(hits); // Copy to ensure sole ownership. } @JsonIgnore public void addMolecule(HitOrMiss hit) { this.molecules.add(hit); } @JsonIgnore public void addMolecules(List<HitOrMiss> hits) { this.molecules.addAll(hits); } public Boolean getIsValid() { return isValid; } public void setIsValid(Boolean hit) { isValid = hit; } } public static class HitOrMiss { @JsonProperty("inchi") private String inchi; @JsonProperty("ion") private String ion; @JsonProperty("plot") private String plot; @JsonProperty("snr") private Double snr; @JsonProperty("time") private Double time; @JsonProperty("intensity") private Double intensity; // For deserialization. public HitOrMiss() { } public HitOrMiss(String inchi, String ion, Double snr, Double time, Double intensity, String plot) { this.inchi = inchi; this.ion = ion; this.snr = snr; this.time = time; this.intensity = intensity; this.plot = plot; } public String getInchi() { return inchi; } public void setInchi(String inchi) { this.inchi = inchi; } public String getIon() { return ion; } public void setIon(String ion) { this.ion = ion; } public Double getSnr() { return snr; } public void setSnr(Double snr) { this.snr = snr; } public Double getTime() { return time; } public void setTime(Double time) { this.time = time; } public Double getIntensity() { return intensity; } public void setIntensity(Double intensity) { this.intensity = intensity; } public String getPlot() { return plot; } public void setPlot(String plot) { this.plot = plot; } } }