com.act.lcms.v2.MassChargeCalculator.java Source code

Java tutorial

Introduction

Here is the source code for com.act.lcms.v2.MassChargeCalculator.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms.v2;

import com.act.lcms.MS1;
import com.act.lcms.MassCalculator;
import com.act.lcms.db.io.DB;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

public class MassChargeCalculator {
    private static final Logger LOGGER = LogManager.getFormatterLogger(MassChargeCalculator.class);

    /**
     * An MZSource is a handle to the result of ionic mass/charge computation.  Once m/z computation has been completed
     * for a given set of mass sources (like InChIs, arbitrary numeric values, and the results of the very convenient
     * Utils.extractMassFromString), the resulting MZSource object can be used to access all of the m/z values
     * computed from that source of a monoisotopic mass.  In turn, any peaks associated with a particular m/z can be
     * mapped back to the source from which the search window in which the peak was found was originally derived.
     */
    @JsonInclude(JsonInclude.Include.NON_NULL) // Hide unused fields since this is effectively a tagged union/sum type.
    public static class MZSource implements Serializable {
        private static final long serialVersionUID = -6359715907934400901L;

        @JsonIgnore
        private static transient AtomicInteger instanceCounter = new AtomicInteger(0);

        private enum KIND {
            INCHI, // An InChI whose mass to import.
            ARBITRARY_MASS, // Some monoisotopic mass specified without a source molecule.
            PRE_EXTRACTED_MASS, // The output of Utils.extractMassFromString, which is very handy.
            UNKNOWN, // For deserialization.
            ;
        }

        @JsonProperty(value = "kind", required = true)
        KIND kind = KIND.UNKNOWN;
        @JsonProperty(value = "inchi")
        String inchi;
        @JsonProperty(value = "arbitrary_mass")
        Double arbitraryMass;
        // Don't use Pair here, as Jackson 2.6 chokes on it.
        @JsonProperty(value = "pre_extracted_label")
        String preExtractedLabel;
        @JsonProperty(value = "pre_extracted_mass")
        Double preExtractedMass;
        /* Note: this ID is for logging/convenience only.  It is not intended as a durable or consistent pointer to a
         * particular MZSource.  The caller is responsible for understanding the true source/meaning of an MZSource
         * object; by itself, it is just a handle to some computation. */
        @JsonProperty(value = "id", required = true)
        @JsonDeserialize(using = MZSourceIDDeserializer.class)
        Integer objectId;

        /**
         * Construct a source based on an InChI.  Monoisotopic mass will automatically be calculated using MassCalculator.
         * @param inchi The inchi whose mass to use.
         */
        public MZSource(String inchi) {
            this.kind = KIND.INCHI;
            this.inchi = inchi;
            setId();
        }

        /**
         * Construct a source based on an arbitrary monoisotopic mass.  Do not use average mass for this value.
         * @param arbitraryMass Some monoisotopic mass to use as a source.
         */
        public MZSource(Double arbitraryMass) {
            this.kind = KIND.ARBITRARY_MASS;
            this.arbitraryMass = arbitraryMass;
            setId();
        }

        /**
         * Construct a source based on the result of {@link com.act.lcms.db.analysis.Utils#extractMassFromString}.
         * @param extractMassFromStringResult The results of {@link com.act.lcms.db.analysis.Utils#extractMassFromString}.
         */
        public MZSource(Pair<String, Double> extractMassFromStringResult) {
            this.kind = KIND.PRE_EXTRACTED_MASS;
            this.preExtractedLabel = extractMassFromStringResult.getLeft();
            this.preExtractedMass = extractMassFromStringResult.getRight();
            setId();
        }

        private MZSource() { // For de/serialization.

        }

        private void setId() {
            objectId = instanceCounter.getAndIncrement();
        }

        Integer getId() {
            return objectId;
        }

        // No protection specification here = package private.
        KIND getKind() {
            return kind;
        }

        String getInchi() {
            return inchi;
        }

        Double getArbitraryMass() {
            return arbitraryMass;
        }

        Pair<String, Double> getPreExtractedMass() {
            return Pair.of(preExtractedLabel, preExtractedMass);
        }

        /* Note: these methods are more complicated than they need be.  Specifically, since objectId should be unique, it's
         * really the only thing we should need to determine if two MZSource objects are equivalent.  However, given the
         * complexities of serialization, testing, etc., comprehensive implementations of these methods seem like a good
         * idea for now.
         */
        @Override
        public boolean equals(Object o) {
            if (this == o)
                return true;
            if (o == null || getClass() != o.getClass())
                return false;

            MZSource mzSource = (MZSource) o;

            if (kind != mzSource.kind)
                return false;
            if (inchi != null ? !inchi.equals(mzSource.inchi) : mzSource.inchi != null)
                return false;
            if (arbitraryMass != null ? !arbitraryMass.equals(mzSource.arbitraryMass)
                    : mzSource.arbitraryMass != null)
                return false;
            if (preExtractedMass != null ? !preExtractedMass.equals(mzSource.preExtractedMass)
                    : mzSource.preExtractedMass != null)
                return false;
            return objectId.equals(mzSource.objectId);

        }

        @Override
        public int hashCode() {
            int result = kind.hashCode();
            result = 31 * result + (inchi != null ? inchi.hashCode() : 0);
            result = 31 * result + (arbitraryMass != null ? arbitraryMass.hashCode() : 0);
            result = 31 * result + (preExtractedMass != null ? preExtractedMass.hashCode() : 0);
            result = 31 * result + objectId.hashCode();
            return result;
        }

        public static class MZSourceIDDeserializer extends JsonDeserializer<Integer> {
            @Override
            public Integer deserialize(JsonParser p, DeserializationContext ctxt)
                    throws IOException, JsonProcessingException {
                final int thisId = p.getIntValue();
                /* Set the instance counter to the current document's id + 1 if it isn't already higher.  This will (weakly)
                 * ensure that any newly created documents don't collide with whatever we're reading in.
                 *
                 * TODO: these ids are awful, and are certainly something I expect to regret implementing.  We should use
                 * something better, make ids cleanly transient, or try to get away from ids at all.
                 */
                instanceCounter.getAndUpdate(x -> Integer.max(x, thisId + 1));
                return thisId;
            }
        }

        private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
            in.defaultReadObject();
            // See comment in JSON id serializer re: why we need special id handling.
            instanceCounter.getAndUpdate(x -> Integer.max(x, getId() + 1));
        }
    }

    /**
     * A MassChargeMap is built from a list of MZSources, and stores the monoisotopic and ionic mass variants for each
     * of those sources.
     */
    public static class MassChargeMap implements Serializable {
        private static final long serialVersionUID = -332580987490663497L;

        // Tracks the monoisotopic mass for a given source, which can then be used to find collisions in the reverse map.
        Map<MZSource, Double> monoisotopicMasses;
        // Maps each monoisotopic mass to any colliding sources.
        Map<Double, List<MZSource>> reverseMonoisotopicMasses = new HashMap<>();

        // For a given ionic mass, it is a <ION_TYPE> for mass <MONOISOTOPIC MASS>, e.g., M+H for
        Map<Double, List<Pair<String, Double>>> reverseIonicMasses = new HashMap<>();

        // Package-private again.
        MassChargeMap() {
        }

        public List<MZSource> monoMassToMZSources(Double monoisotopicMass) {
            return Collections.unmodifiableList(reverseMonoisotopicMasses.get(monoisotopicMass));
        }

        public List<Pair<String, Double>> ionMZtoMonoMassAndIonName(Double mz) {
            return Collections.unmodifiableList(reverseIonicMasses.get(mz));
        }

        public Set<MZSource> ionMZToMZSources(Double mz) {
            List<Pair<String, Double>> reverseIons = reverseIonicMasses.get(mz);
            if (reverseIons == null) {
                return Collections.emptySet();
            }
            return Collections.unmodifiableSet(reverseIons.stream()
                    .map(p -> reverseMonoisotopicMasses.get(p.getRight())).flatMap(List::stream). // Flattens stream of List<MZSource> into single stream of MZSource.
                    collect(Collectors.toSet()));
        }

        /* Use Iterable -> Stream instead of Iterator, as Iterator doesn't play nicely with streams on its own.
         * Iterator + Stream.generate = java.util.NoSuchElementException = :(
         * http://stackoverflow.com/questions/24511052/how-to-convert-an-iterator-to-a-stream */
        public Iterable<MZSource> mzSourceIter() {
            return () -> monoisotopicMasses.keySet().iterator();
        }

        public Iterable<Double> monoisotopicMassIter() {
            return () -> reverseMonoisotopicMasses.keySet().iterator();
        }

        public Iterable<Double> ionMZIter() {
            return () -> reverseIonicMasses.keySet().iterator();
        }

        // Package private--this one can eat a whole lot of memory, so we prefer the streaming approaches.
        List<Double> ionMZsSorted() {
            List<Double> results = new ArrayList<>(reverseIonicMasses.keySet());
            Collections.sort(results);
            // The keySet has already unique'd all the values, so all we need to do is sort 'em and return 'em.
            return Collections.unmodifiableList(results); // Make the list unmodifiable for safety's sake (we own the masses).
        }

        // Package private again.
        void loadSourcesAndMasses(List<Pair<MZSource, Double>> sourcesAndMasses, Set<String> onlyConsiderIons)
                throws IOException {
            monoisotopicMasses = new LinkedHashMap<>(sourcesAndMasses.size()); // Preserve order for easier testing.

            LOGGER.info("Converting %d m/z sources into ionic masses and resolving collisions",
                    sourcesAndMasses.size());
            int sourceCounter = 0;
            int ionCounter = 0;
            for (Pair<MZSource, Double> sourceAndMass : sourcesAndMasses) {
                MZSource source = sourceAndMass.getLeft();
                Double monoMass = sourceAndMass.getRight();
                monoisotopicMasses.put(source, monoMass);

                sourceCounter++;
                if (sourceCounter % 1000 == 0) {
                    LOGGER.info("Resolved %d sources, handled %d ion m/z's in total", sourceCounter, ionCounter);
                }

                if (reverseMonoisotopicMasses.containsKey(monoMass)) {
                    reverseMonoisotopicMasses.get(monoMass).add(source);
                    /* And we're done!  We've already computed the ions for this monoisotopic mass, so we've just squashed a
                     * duplicate.  The source -> results correspondence will come out when we map the hits over the ionic masses,
                     * then tie those back to their corresponding MZSources. */
                    continue;
                }

                List<MZSource> sourcesWithThisMass = new ArrayList<MZSource>(1) {
                    {
                        add(source);
                    }
                };
                reverseMonoisotopicMasses.put(monoMass, sourcesWithThisMass);

                Map<String, Double> ions = MS1.getIonMasses(monoMass, MS1.IonMode.POS);
                // Filter to only the specified ions if the onlyConsider set is non-empty; allow all ions by default.
                if (onlyConsiderIons.size() != 0) {
                    ions = ions.entrySet().stream().filter(e -> onlyConsiderIons.contains(e.getKey()))
                            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
                }

                for (Map.Entry<String, Double> ion : ions.entrySet()) {
                    List<Pair<String, Double>> reverseMapping = reverseIonicMasses.get(ion.getValue());
                    if (reverseMapping == null) {
                        reverseMapping = new ArrayList<>(1); // Assume few collisions.
                        reverseIonicMasses.put(ion.getValue(), reverseMapping);
                    }
                    reverseMapping.add(Pair.of(ion.getKey(), monoMass));
                    ionCounter++;
                }
            }
            LOGGER.info("Done resolving %d sources, found %d ion m/z's in total", sourceCounter, ionCounter);
        }

        /**
         * Accumulates all of the MZSources associated with this ion and the ion name per each source for a given ion m/z.
         * @param ionicMassCharge The ion's m/z for which to search.
         * @return A list of each corresponding MZSource and the ionic variant the specified m/z represents for this source.
         *         Returns an empty list if no matching sources are found.
         */
        public List<Pair<MZSource, String>> mapIonMZToSources(Double ionicMassCharge) {
            List<Pair<String, Double>> reverseMapping = reverseIonicMasses.get(ionicMassCharge);
            if (reverseMapping == null) {
                return Collections.emptyList();
            }

            // Flattening this list with the stream API is a bit clumsy, so just loop over all the reverse mappings.
            List<Pair<MZSource, String>> results = new ArrayList<>();
            for (Pair<String, Double> ionNameAndMonoMass : reverseMapping) {
                List<MZSource> mzSources = reverseMonoisotopicMasses.get(ionNameAndMonoMass.getRight());
                for (MZSource source : mzSources) {
                    results.add(Pair.of(source, ionNameAndMonoMass.getLeft()));
                }
            }

            return results;
        }
    }

    public static MassChargeMap makeMassChargeMap(List<MZSource> mzSources) throws IOException {
        return makeMassChargeMap(mzSources, Collections.emptySet());
    }

    public static MassChargeMap makeMassChargeMap(List<MZSource> mzSources, Set<String> onlyConsiderIons)
            throws IOException {
        MassChargeMap map = new MassChargeMap();
        /* Map over the sources, extracting or computing the mass as we go to encapsulate any unsafe behavior outside the
         * MassChargeMap constructor. */
        List<Pair<MZSource, Double>> sourcesAndMasses = new ArrayList<>();
        for (MZSource source : mzSources) {
            try {
                sourcesAndMasses.add(Pair.of(source, computeMass(source)));
            } catch (Exception e) {
                LOGGER.error("MZSource %d threw an error during mass calculation, skipping", source.getId());
            }
        }
        LOGGER.info("Consumed %d mzSources, sourcesAndMasses has %d entries", mzSources.size(),
                sourcesAndMasses.size());
        map.loadSourcesAndMasses(sourcesAndMasses, onlyConsiderIons);
        return map;
    }

    protected static Double computeMass(MZSource mzSource) {
        Double mass;
        String msg;
        switch (mzSource.getKind()) {
        case INCHI:
            Pair<Double, Integer> massAndCharge;
            try {
                massAndCharge = MassCalculator.calculateMassAndCharge(mzSource.getInchi());
            } catch (Exception e) {
                LOGGER.error("Calculating mass for molecule %s failed: %s", mzSource.getInchi(), e.getMessage());
                throw e;
            }
            if (massAndCharge.getRight() > 0) {
                LOGGER.warn(
                        "(MZSrc %d) Molecule %s has a positive charge %d; ionization may not have the expected effect",
                        mzSource.getId(), mzSource.getInchi(), massAndCharge.getRight());
            } else if (massAndCharge.getRight() < 0) {
                LOGGER.warn("(MZSrc %d) Molecule %s has a negative charge %d; it may not fly in positive ion mode",
                        mzSource.getId(), mzSource.getInchi(), massAndCharge.getRight());
            }
            mass = massAndCharge.getLeft();
            break;
        case ARBITRARY_MASS:
            mass = mzSource.getArbitraryMass();
            break;
        case PRE_EXTRACTED_MASS:
            mass = mzSource.getPreExtractedMass().getRight();
            break;
        case UNKNOWN:
            msg = String.format("mzSource with id %d has UNKNOWN kind, which is invalid", mzSource.getId());
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        default:
            msg = String.format("mzSource with id %d has unknown kind (should be impossible): %s", mzSource.getId(),
                    mzSource.getKind());
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        }
        return mass;
    }
}