com.act.lcms.db.io.report.IonAnalysisInterchangeModel.java Source code

Java tutorial

Introduction

Here is the source code for com.act.lcms.db.io.report.IonAnalysisInterchangeModel.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms.db.io.report;

/**
 * This class represents the results of an Ion analyis without provenance information or supporting data for negative
 * results.  It should primarily be used to communicate positive LCMS findings with downstream modules.
 *
 * Example:
 * <pre>
  {
 "results" : [ {
  "_id" : 1,
  "mass_charge" : 331.13876999999997,
  "valid" : false,
  "molecules" : [ {
   "inchi" : "InChI=1S/C15H22O8/c1-20-7-11-12(17)13(18)14(19)15(23-11)22-6-8-3-4-9(16)10(5-8)21-2/h3-5,11-19H,6-7H2,1-2H3/t11-,12-,13+,14-,15-/m1/s1",
   "ion" : "M+H",
   "plot" : "331.13876999999997_37-1669-1670-_CHEM_6170.pdf",
   "snr" : 224.9610985335781,
   "time" : 208.54700088500977,
   "intensity" : 6954.61328125
  }]
 }]
 }
 </pre>
 */

import com.act.biointerpretation.l2expansion.L2Prediction;
import com.act.lcms.db.analysis.HitOrMissFilterAndTransformer;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.apache.commons.lang3.tuple.Pair;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;

public class IonAnalysisInterchangeModel {

    // An LCMS result.
    // The idea of NO_DATA is to indicate if we query on a molecule with a mass on which no analysis was done on, to
    // distinguish this case from an actual calculated MISS.
    public enum LCMS_RESULT {
        HIT, MISS, NO_DATA
    }

    public enum METRIC {
        SNR, INTENSITY, TIME
    }

    @JsonProperty("results")
    private List<ResultForMZ> results;
    private Map<String, Boolean> inchiToIsHit;

    public IonAnalysisInterchangeModel() {
        results = new ArrayList<>();
        inchiToIsHit = new HashMap<>();
    }

    public void loadResultsFromFile(File inputFile) throws IOException {
        this.results = OBJECT_MAPPER.readValue(inputFile, IonAnalysisInterchangeModel.class).getResults();
        this.populateInchiToIsHit();
    }

    /**
     * Populates a map from all the inchis analyzed in the corpus to true if they are and LCMS hit, or false if not.
     * An inchi is considered a hit if any considered ion of that inchi has a MZ value that is a hit.
     */
    private void populateInchiToIsHit() {
        this.inchiToIsHit = new HashMap<>();

        for (ResultForMZ resultForMZ : results) {
            Boolean isHit = resultForMZ.isValid;
            for (HitOrMiss molecule : resultForMZ.getMolecules()) {
                // If the inchi is already a hit, then we do not want to override
                // its hit entry with a possible miss on a different adduct ion for
                // the same molecule. We check multiple metlin ions for each
                // molecules and some may show and others not. In an ideal world
                // with high concentrations "most" adduct ions would show and we
                // could take an AND (see https://github.com/20n/act/issues/383 and
                // https://github.com/20n/act/issues/370#issuecomment-240289674)
                // but when low concentrations exist we would rather take an OR and
                // be conservative, avoiding false negatives.
                if (this.inchiToIsHit.get(molecule.getInchi()) == null
                        || !this.inchiToIsHit.get(molecule.getInchi())) {
                    this.inchiToIsHit.put(molecule.getInchi(), isHit);
                }
            }
        }
    }

    public void writeToJsonFile(File outputFile) throws IOException {
        try (BufferedWriter predictionWriter = new BufferedWriter(new FileWriter(outputFile))) {
            OBJECT_MAPPER.writeValue(predictionWriter, this);
        }
    }

    /**
     * This function is used to compute log frequency distribution of the ion model vs a metric.
     * @param metric The metric on which the frequency distribution is plotted
     * @return A map of a range to the count of molecules that get bucketed in that range
     */
    public Map<Pair<Double, Double>, Integer> computeLogFrequencyDistributionOfMoleculeCountToMetric(
            METRIC metric) {
        Map<Pair<Double, Double>, Integer> rangeToHitCount = new HashMap<>();

        // This variable represents the total number of statistics that have zero values.
        Integer countOfZeroStats = 0;

        // This statistic represents the log value of the min statistic.
        Double minLogValue = Double.MAX_VALUE;

        for (ResultForMZ resultForMZ : this.getResults()) {
            for (HitOrMiss molecule : resultForMZ.getMolecules()) {

                Double power = 0.0;

                switch (metric) {
                case TIME:
                    power = Math.log10(molecule.getTime());
                    break;
                case INTENSITY:
                    power = Math.log10(molecule.getIntensity());
                    break;
                case SNR:
                    power = Math.log10(molecule.getSnr());
                    break;
                }

                if (power.equals(Double.NEGATIVE_INFINITY)) {
                    // We know the statistic was 0 here.
                    countOfZeroStats++;
                    break;
                }

                Double floor = Math.floor(power);
                Double lowerBound = Math.pow(10.0, floor);
                Double upperBound = Math.pow(10.0, floor + 1);

                minLogValue = Math.min(minLogValue, lowerBound);
                Pair<Double, Double> key = Pair.of(lowerBound, upperBound);
                rangeToHitCount.compute(key, (k, v) -> (v == null) ? 1 : v + 1);
            }

            // We count the total number of zero statistics and put them in the 0 to minLog metric bucket.
            if (countOfZeroStats > 0) {
                Pair<Double, Double> key = Pair.of(0.0, minLogValue);
                rangeToHitCount.put(key, countOfZeroStats);
            }
        }

        return rangeToHitCount;
    }

    /**
     * Returns HIT or MISS if the inchi is in the precalculated inchi->hit map, or NO_DATA if the inchi is not.
     * This will let us know if there has been any change in the inchi's form since the initial calculation, instead of
     * just silently returning a miss.
     *
     * @param inchi The inchi of the molecule.
     * @return The LCMS result.
     */
    public LCMS_RESULT isMoleculeAHit(String inchi) {
        if (this.inchiToIsHit.get(inchi) == null) {
            return LCMS_RESULT.NO_DATA;
        }
        return this.inchiToIsHit.get(inchi) ? LCMS_RESULT.HIT : LCMS_RESULT.MISS;
    }

    /**
     * This function results all the inchis from the model.
     * @return A set of inchis
     */
    @JsonIgnore
    public Set<String> getAllInchis() {
        Set<String> result = new HashSet<>();
        for (ResultForMZ resultForMZ : this.getResults()) {
            result.addAll(resultForMZ.getMolecules().stream().map(molecule -> molecule.getInchi())
                    .collect(Collectors.toList()));
        }
        return result;
    }

    /**
     * Calculate whether a given prediction is an LCMS hit or not.
     *
     * TODO: think through our general approach to multiple substrate reactions when necessary.
     * We'll need to balance the possibilities of false positives and false negatives- one idea would be to return
     * a score based on the number of confirmed products of the reaction.
     *
     * @param prediction The prediction from the corpus.
     * @return True if all products are LCMS hits.
     */
    public LCMS_RESULT getLcmsDataForPrediction(L2Prediction prediction) {
        List<String> productInchis = prediction.getProductInchis();
        for (String product : productInchis) {
            // If any of the results have no data, return NO_DATA. Such results shouldn't happen for now, so the caller will
            // likely throw an exception if this happens.
            if (this.isMoleculeAHit(product).equals(IonAnalysisInterchangeModel.LCMS_RESULT.NO_DATA)) {
                return LCMS_RESULT.NO_DATA;
            }
            // Otherwise, if a miss is found among the prediction's products, return it as a miss.  This implements an
            // AND among the products of the prediction- all must be present to register as a hit. This is motivated by the
            // fact that our only current multiple-product reaction produces one significant product, and one constant
            // cofactor. We verified that in both urine and saliva, the cofactor is present in our samples, so
            // an OR approach here would return a HIT for every prediction of that RO.
            if (this.isMoleculeAHit(product).equals(IonAnalysisInterchangeModel.LCMS_RESULT.MISS)) {
                return LCMS_RESULT.MISS;
            }
        }
        // If every prediction is a HIT, return HIT.
        return LCMS_RESULT.HIT;
    }

    /**
     * This function takes in multiple LCMS mining results  (in the IonAnalysisInterchangeModel format), which happens
     * when we have multiple positive control replicates, extracts all the molecule hits from each file and applies
     * a filter function across the replicate hits.
     * @param replicateModels The list of IonAnalysisInterchangeModels to be analyzed
     * @param hitOrMissFilterAndTransformer This filter function takes in single/multiple HitOrMiss objects from replicates and
     *                                   performs a transformation operation on them to produce one HitOrMiss object
     *                                   and a boolean to keep the transformed molecule in the resulting model.
     * @return A resultant model that contains a list of inchis that are valid molecule hits in the input file and
     *          pass all the thresholds.
     * @throws IOException
     */
    public static IonAnalysisInterchangeModel filterAndOperateOnMoleculesFromMultipleReplicateModels(
            List<IonAnalysisInterchangeModel> replicateModels,
            HitOrMissFilterAndTransformer hitOrMissFilterAndTransformer) throws IOException {

        List<ResultForMZ> resultsForMZs = new ArrayList<>();

        /**
         * Each element in deserializedResultsForPositiveReplicates now contains a list of mass charges to a list of
         * molecule+ion combinations for each mass charge. We consider a molecule "valid", ie a hit, if the mass charge
         * it is under for every element in deserializedResultsForPositiveReplicates is above the thresholds we have set.
         */

        // Iterate through every mass charge
        // TODO: Consider using a parallel stream here
        for (int i = 0; i < replicateModels.get(0).getResults().size(); i++) {
            ResultForMZ representativeMZ = replicateModels.get(0).getResults().get(i);
            Double representativeMassCharge = representativeMZ.getMz();
            int totalNumberOfMoleculesInMassChargeResult = representativeMZ.getMolecules().size();

            ResultForMZ resultForMZ = new ResultForMZ(representativeMassCharge);
            resultForMZ.setId(representativeMZ.getId());

            // TODO: Take out the isValid field since it does not convey useful information for such post processing files.
            resultForMZ.setIsValid(representativeMZ.getIsValid());

            // For each mass charge, iterate through each molecule under the mass charge
            for (int j = 0; j < totalNumberOfMoleculesInMassChargeResult; j++) {

                Pair<HitOrMiss, Boolean> transformedAndIsRetainedMolecule;

                List<HitOrMiss> moleculesFromReplicates = new ArrayList<>();

                // For each molecule, make sure it passes the threshold we set across every elem in deserializedResultsForPositiveReplicates,
                // ie across each positive replicate + neg control experiment results
                for (int k = 0; k < replicateModels.size(); k++) {
                    ResultForMZ sampleRepresentativeMz = replicateModels.get(k).getResults().get(i);

                    // Since we are comparing across replicate files, we expect each ResultForMZ element in the each replicate's
                    // IonAnalysisInterchangeModel to be in the same order as other replicates. We check if the mass charges are the
                    // same across the samples to make sure the replicates aligned correctly.
                    if (!sampleRepresentativeMz.getMz().equals(representativeMassCharge)) {
                        throw new RuntimeException(String.format(
                                "The replicates are not ordered correctly since %f and %f are not " + "the equal.",
                                sampleRepresentativeMz.getMz(), representativeMassCharge));
                    }

                    HitOrMiss molecule = sampleRepresentativeMz.getMolecules().get(j);
                    moleculesFromReplicates.add(molecule);
                }

                transformedAndIsRetainedMolecule = hitOrMissFilterAndTransformer.apply(moleculesFromReplicates);

                // Check if the filter function  wants to throw out the molecule. If not, then add the molecule to the final result.
                if (transformedAndIsRetainedMolecule.getRight()) {
                    resultForMZ.addMolecule(transformedAndIsRetainedMolecule.getLeft());
                }
            }

            resultsForMZs.add(resultForMZ);
        }

        IonAnalysisInterchangeModel resultModel = new IonAnalysisInterchangeModel();
        resultModel.setResults(resultsForMZs);
        return resultModel;
    }

    /**
     * This function takes in a single LCMS result (in the IonAnalysisInterchangeModel format) and extracts all the
     * molecule hits from the file and applies a filter function on the model.
     * @param replicateModel The IonAnalysisInterchangeModel to be analyzed
     * @param hitOrMissFilterAndTransformer This filter function takes in single/multiple HitOrMiss objects from replicates and
     *                                      performs a transformation operation on them to produce one HitOrMiss object
     *                                      and a boolean to keep the transformed molecule in the resulting model.
     * @return  A resultant model that contains a list of inchis that are valid molecule hits in the input file and
     *          pass all the thresholds.
     * @throws IOException
     */
    public static IonAnalysisInterchangeModel filterAndOperateOnMoleculesFromModel(
            IonAnalysisInterchangeModel replicateModel, HitOrMissFilterAndTransformer hitOrMissFilterAndTransformer)
            throws IOException {

        List<ResultForMZ> resultsForMZs = new ArrayList<>();

        // Iterate through every mass charge
        // TODO: Consider using a parallel stream here
        for (int i = 0; i < replicateModel.getResults().size(); i++) {
            ResultForMZ representativeMZ = replicateModel.getResults().get(i);
            Double representativeMassCharge = representativeMZ.getMz();
            int totalNumberOfMoleculesInMassChargeResult = representativeMZ.getMolecules().size();

            ResultForMZ resultForMZ = new ResultForMZ(representativeMassCharge);
            resultForMZ.setId(representativeMZ.getId());

            // TODO: Take out the isValid field since it does not convey useful information for such post processing files.
            resultForMZ.setIsValid(representativeMZ.getIsValid());

            // For each mass charge, iterate through each molecule under the mass charge
            for (int j = 0; j < totalNumberOfMoleculesInMassChargeResult; j++) {
                Pair<HitOrMiss, Boolean> transformedAndIsRetainedMolecule;

                List<HitOrMiss> molecules = replicateModel.getResults().get(i).getMolecules();
                HitOrMiss molecule = molecules.get(j);
                transformedAndIsRetainedMolecule = hitOrMissFilterAndTransformer.apply(molecule);

                // Check if the filter function  wants to throw out the molecule. If not, then add the molecule to the final result.
                if (transformedAndIsRetainedMolecule.getRight()) {
                    resultForMZ.addMolecule(transformedAndIsRetainedMolecule.getLeft());
                }
            }

            resultsForMZs.add(resultForMZ);
        }

        IonAnalysisInterchangeModel resultModel = new IonAnalysisInterchangeModel();
        resultModel.setResults(resultsForMZs);
        return resultModel;
    }

    /**
     * This function loads in a serialized IonAnalysisInterchangeModel and deserializes it.
     * @param filepath File path to the serialized IonAnalysisInterchangeModels
     * @return An IonAnalysisInterchangeModel corresponding to the file.
     * @throws IOException
     */
    public static IonAnalysisInterchangeModel loadIonAnalysisInterchangeModelFromFile(String filepath)
            throws IOException {
        IonAnalysisInterchangeModel model = new IonAnalysisInterchangeModel();
        model.loadResultsFromFile(new File(filepath));
        return model;
    }

    /**
     * This function is used to get the superset inchis from various models representing the different lcms ion runs for
     * a given chemical on a single replicate
     * @param models IonAnalysisInterchangeModels for each ionic variant
     * @param snrThreshold The snr threshold
     * @param intensityThreshold The intensity threshold
     * @param timeThreshold The time threshold
     * @return The superset of all inchis in each ionic variant file.
     * @throws IOException
     */
    public static IonAnalysisInterchangeModel getSupersetOfIonicVariants(List<IonAnalysisInterchangeModel> models,
            Double snrThreshold, Double intensityThreshold, Double timeThreshold) throws IOException {

        IonAnalysisInterchangeModel resultModel = new IonAnalysisInterchangeModel();
        List<ResultForMZ> resultForMZList = new ArrayList<>();

        for (IonAnalysisInterchangeModel analysisInterchangeModel : models) {

            for (ResultForMZ resultForMZ : analysisInterchangeModel.getResults()) {
                List<HitOrMiss> allMoleculesThatPass = new ArrayList<>();

                for (HitOrMiss molecule : resultForMZ.getMolecules()) {
                    if (molecule.getIntensity() > intensityThreshold && molecule.getSnr() > snrThreshold
                            && molecule.getTime() > timeThreshold) {
                        allMoleculesThatPass.add(molecule);
                    }
                }

                if (allMoleculesThatPass.size() > 0) {
                    ResultForMZ newResultForMZ = new ResultForMZ(resultForMZ.getMz());
                    newResultForMZ.addMolecules(allMoleculesThatPass);
                    resultForMZList.add(newResultForMZ);
                }
            }
        }

        resultModel.setResults(resultForMZList);
        return resultModel;
    }

    /**
     * This function is used for getting all inchis that are hits in the corpus
     * @param snrThreshold The snr threshold
     * @param intensityThreshold The intensity threshold
     * @param timeThreshold The time threshold
     * @return A set of inchis
     */
    public Set<String> getAllMoleculeHits(Double snrThreshold, Double intensityThreshold, Double timeThreshold) {
        Set<String> resultSet = new HashSet<>();
        for (ResultForMZ resultForMZ : results) {
            for (HitOrMiss hitOrMiss : resultForMZ.getMolecules()) {
                if (hitOrMiss.getIntensity() > intensityThreshold && hitOrMiss.getSnr() > snrThreshold
                        && hitOrMiss.getTime() > timeThreshold) {
                    resultSet.add(hitOrMiss.getInchi());
                }
            }
        }
        return resultSet;
    }

    public IonAnalysisInterchangeModel(List<ResultForMZ> results) {
        this.results = results;
        this.populateInchiToIsHit();
    }

    public List<ResultForMZ> getResults() {
        return results;
    }

    protected void setResults(List<ResultForMZ> results) {
        this.results = results;
    }

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    static {
        OBJECT_MAPPER.enable(SerializationFeature.INDENT_OUTPUT);
    }

    public static class ResultForMZ {
        private static final AtomicLong ID_COUNTER = new AtomicLong(0);

        @JsonProperty("_id")
        private Long id;

        @JsonProperty("mass_charge")
        private Double mz;

        @JsonProperty("valid")
        private Boolean isValid;

        @JsonProperty("molecules")
        private List<HitOrMiss> molecules;

        // For deserialization.
        protected ResultForMZ() {

        }

        protected ResultForMZ(Long id, Double mz, List<HitOrMiss> molecules, Boolean hit) {
            this.id = id;
            this.mz = mz;
            this.molecules = molecules;
            this.isValid = hit;
        }

        public ResultForMZ(Double mz, List<HitOrMiss> molecules, Boolean hit) {
            this.id = ID_COUNTER.incrementAndGet();
            this.mz = mz;
            this.molecules = molecules;
            this.isValid = hit;
        }

        public ResultForMZ(Double mz) {
            this.id = ID_COUNTER.incrementAndGet();
            this.mz = mz;
            this.molecules = new ArrayList<>();
            this.isValid = false;
        }

        public ResultForMZ(Double mz, Boolean hit) {
            this.id = ID_COUNTER.incrementAndGet();
            this.mz = mz;
            this.isValid = hit;
            this.molecules = new ArrayList<>();
        }

        public Long getId() {
            return id;
        }

        protected void setId(Long id) {
            this.id = id;
        }

        public Double getMz() {
            return mz;
        }

        protected void setMz(Double mz) {
            this.mz = mz;
        }

        public List<HitOrMiss> getMolecules() {
            return molecules;
        }

        protected void setMolecules(List<HitOrMiss> hits) {
            this.molecules = new ArrayList<>(hits); // Copy to ensure sole ownership.
        }

        @JsonIgnore
        public void addMolecule(HitOrMiss hit) {
            this.molecules.add(hit);
        }

        @JsonIgnore
        public void addMolecules(List<HitOrMiss> hits) {
            this.molecules.addAll(hits);
        }

        public Boolean getIsValid() {
            return isValid;
        }

        public void setIsValid(Boolean hit) {
            isValid = hit;
        }
    }

    public static class HitOrMiss {
        @JsonProperty("inchi")
        private String inchi;

        @JsonProperty("ion")
        private String ion;

        @JsonProperty("plot")
        private String plot;

        @JsonProperty("snr")
        private Double snr;

        @JsonProperty("time")
        private Double time;

        @JsonProperty("intensity")
        private Double intensity;

        // For deserialization.
        public HitOrMiss() {

        }

        public HitOrMiss(String inchi, String ion, Double snr, Double time, Double intensity, String plot) {
            this.inchi = inchi;
            this.ion = ion;
            this.snr = snr;
            this.time = time;
            this.intensity = intensity;
            this.plot = plot;
        }

        public String getInchi() {
            return inchi;
        }

        public void setInchi(String inchi) {
            this.inchi = inchi;
        }

        public String getIon() {
            return ion;
        }

        public void setIon(String ion) {
            this.ion = ion;
        }

        public Double getSnr() {
            return snr;
        }

        public void setSnr(Double snr) {
            this.snr = snr;
        }

        public Double getTime() {
            return time;
        }

        public void setTime(Double time) {
            this.time = time;
        }

        public Double getIntensity() {
            return intensity;
        }

        public void setIntensity(Double intensity) {
            this.intensity = intensity;
        }

        public String getPlot() {
            return plot;
        }

        public void setPlot(String plot) {
            this.plot = plot;
        }
    }
}