com.act.lcms.v2.MZCollisionCounter.java Source code

Java tutorial

Introduction

Here is the source code for com.act.lcms.v2.MZCollisionCounter.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms.v2;

import com.act.utils.CLIUtil;
import com.act.utils.TSVWriter;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

public class MZCollisionCounter {
    private static final Logger LOGGER = LogManager.getFormatterLogger(MZCollisionCounter.class);

    private static final String OPTION_INPUT_INCHI_LIST = "i";
    private static final String OPTION_OUTPUT_FILE = "o";
    private static final String OPTION_COUNT_WINDOW_INTERSECTIONS = "w";
    private static final String OPTION_WINDOW_HALFWIDTH = "s";
    private static final String OPTION_ONLY_CONSIDER_IONS = "n";
    private static final Double DEFAULT_WINDOW_TOLERANCE = 0.01;

    private static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class calculates the monoisoptic masses of a list of InChIs, computes ion m/z's for those masses, ",
            "and computes the distribution of m/z collisions over these m/z's, producing a histogram as its output." },
            "");

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_INPUT_INCHI_LIST).argName("input-file")
                    .desc("An input file containing just InChIs").hasArg().required().longOpt("input-file"));
            add(Option.builder(OPTION_OUTPUT_FILE).argName("output-file")
                    .desc("Write collision histogram data to an output file").hasArg().required()
                    .longOpt("output-file"));
            add(Option.builder(OPTION_COUNT_WINDOW_INTERSECTIONS).desc(
                    "Count intersections of +/-0.01 Dalton mass charge windows instead of counting exact collisions; "
                            + "counts the number of structures that might fall within each window and bins by count")
                    .longOpt("window-collisions"));
            add(Option.builder(OPTION_WINDOW_HALFWIDTH)
                    .desc(String.format(
                            "Sets the window half-width for collision counting in window mode, default is %.3f",
                            DEFAULT_WINDOW_TOLERANCE))
                    .longOpt("window-half-width"));
            add(Option.builder(OPTION_ONLY_CONSIDER_IONS).argName("ions")
                    .desc("Only consider these ions when computing mass/charges (comma separated list)").hasArgs()
                    .valueSeparator(',').longOpt("only-ions"));
        }
    };

    public static void main(String[] args) throws Exception {
        CLIUtil cliUtil = new CLIUtil(MassChargeCalculator.class, HELP_MESSAGE, OPTION_BUILDERS);
        CommandLine cl = cliUtil.parseCommandLine(args);

        File inputFile = new File(cl.getOptionValue(OPTION_INPUT_INCHI_LIST));
        if (!inputFile.exists()) {
            cliUtil.failWithMessage("Input file at does not exist at %s", inputFile.getAbsolutePath());
        }

        List<MassChargeCalculator.MZSource> sources = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                sources.add(new MassChargeCalculator.MZSource(line));
                if (sources.size() % 1000 == 0) {
                    LOGGER.info("Loaded %d sources from input file", sources.size());
                }
            }
        }

        Set<String> considerIons = Collections.emptySet();
        if (cl.hasOption(OPTION_ONLY_CONSIDER_IONS)) {
            List<String> ions = Arrays.asList(cl.getOptionValues(OPTION_ONLY_CONSIDER_IONS));
            LOGGER.info("Only considering ions for m/z calculation: %s", StringUtils.join(ions, ", "));
            considerIons = new HashSet<>(ions);
        }

        TSVWriter<String, Long> tsvWriter = new TSVWriter<>(Arrays.asList("collisions", "count"));
        tsvWriter.open(new File(cl.getOptionValue(OPTION_OUTPUT_FILE)));

        try {
            LOGGER.info("Loaded %d sources in total from input file", sources.size());

            MassChargeCalculator.MassChargeMap mzMap = MassChargeCalculator.makeMassChargeMap(sources,
                    considerIons);

            if (!cl.hasOption(OPTION_COUNT_WINDOW_INTERSECTIONS)) {
                // Do an exact analysis of the m/z collisions if windowing is not specified.

                LOGGER.info("Computing precise collision histogram.");
                Iterable<Double> mzs = mzMap.ionMZIter();
                Map<Integer, Long> collisionHistogram = histogram(
                        StreamSupport.stream(mzs.spliterator(), false).map(mz -> { // See comment about Iterable below.
                            try {
                                return mzMap.ionMZToMZSources(mz).size();
                            } catch (NoSuchElementException e) {
                                LOGGER.error("Caught no such element exception for mz %f: %s", mz, e.getMessage());
                                throw e;
                            }
                        }));
                List<Integer> sortedCollisions = new ArrayList<>(collisionHistogram.keySet());
                Collections.sort(sortedCollisions);
                for (Integer collision : sortedCollisions) {
                    tsvWriter.append(new HashMap<String, Long>() {
                        {
                            put("collisions", collision.longValue());
                            put("count", collisionHistogram.get(collision));
                        }
                    });
                }
            } else {
                /* After some deliberation (thanks Gil!), the windowed variant of this calculation counts the number of
                 * structures whose 0.01 Da m/z windows (for some set of ions) overlap with each other.
                 *
                 * For example, let's assume we have five total input structures, and are only searching for one ion.  Let's
                 * also assume that three of those structures have m/z A and the remaining two have m/z B.  The windows might
                 * look like this in the m/z domain:
                 * |----A----|
                 *        |----B----|
                 * Because A represents three structures and overlaps with B, which represents two, we assign A a count of 5--
                 * this is the number of structures we believe could fall into the range of A given our current peak calling
                 * approach.  Similarly, B is assigned a count of 5, as the possibility for collision/confusion is symmetric.
                 *
                 * Note that this is an over-approximation of collisions, as we could more precisely only consider intersections
                 * when the exact m/z of B falls within the window around A and vice versa.  However, because we have observed
                 * cases where the MS sensor doesn't report structures at exactly the m/z we predict, we employ this weaker
                 * definition of intersection to give a slightly pessimistic view of what confusions might be possible. */
                // Compute windows for every m/z.  We don't care about the original mz values since we just want the count.
                List<Double> mzs = mzMap.ionMZsSorted();

                final Double windowHalfWidth;
                if (cl.hasOption(OPTION_WINDOW_HALFWIDTH)) {
                    // Don't use get with default for this option, as we want the exact FP value of the default tolerance.
                    windowHalfWidth = Double.valueOf(cl.getOptionValue(OPTION_WINDOW_HALFWIDTH));
                } else {
                    windowHalfWidth = DEFAULT_WINDOW_TOLERANCE;
                }

                /* Window = (lower bound, upper bound), counter of represented m/z's that collide with this window, and number
                 * of representative structures (which will be used in counting collisions). */
                LinkedList<CollisionWindow> allWindows = new LinkedList<CollisionWindow>() {
                    {
                        for (Double mz : mzs) {
                            // CPU for memory trade-off: don't re-compute the window bounds over and over and over and over and over.
                            try {
                                add(new CollisionWindow(mz, windowHalfWidth, mzMap.ionMZToMZSources(mz).size()));
                            } catch (NoSuchElementException e) {
                                LOGGER.error("Caught no such element exception for mz %f: %s", mz, e.getMessage());
                                throw e;
                            }
                        }
                    }
                };

                // Sweep line time!  The window ranges are the interesting points.  We just accumulate overlap counts as we go.
                LinkedList<CollisionWindow> workingSet = new LinkedList<>();
                List<CollisionWindow> finished = new LinkedList<>();

                while (allWindows.size() > 0) {
                    CollisionWindow thisWindow = allWindows.pop();
                    // Remove any windows from the working set that don't overlap with the next window.
                    while (workingSet.size() > 0 && workingSet.peekFirst().getMaxMZ() < thisWindow.getMinMZ()) {
                        finished.add(workingSet.pop());
                    }

                    for (CollisionWindow w : workingSet) {
                        /* Add the size of the new overlapping window's structure count to each of the windows in the working set,
                         * which represents the number of possible confused structures that fall within the overlapping region.
                         * We exclude the window itself as it should already have counted the colliding structures it represents. */
                        w.getAccumulator().add(thisWindow.getStructureCount());

                        /* Reciprocally, add the structure counts of all windows with which the current window overlaps to it. */
                        thisWindow.getAccumulator().add(w.getStructureCount());
                    }

                    // Now that accumulation is complete, we can safely add the current window.
                    workingSet.add(thisWindow);
                }

                // All the interesting events are done, so drop the remaining windows into the finished set.
                finished.addAll(workingSet);

                Map<Long, Long> collisionHistogram = histogram(
                        finished.stream().map(w -> w.getAccumulator().longValue()));
                List<Long> sortedCollisions = new ArrayList<>(collisionHistogram.keySet());
                Collections.sort(sortedCollisions);
                for (Long collision : sortedCollisions) {
                    tsvWriter.append(new HashMap<String, Long>() {
                        {
                            put("collisions", collision);
                            put("count", collisionHistogram.get(collision));
                        }
                    });
                }
            }
        } finally {
            if (tsvWriter != null) {
                tsvWriter.close();
            }
        }
    }

    private static class CollisionWindow {
        Double min;
        Double max;
        LongAdder accumulator = new LongAdder();
        Integer structureCount;

        public CollisionWindow(Double mz, Double windowHalfWidth, Integer structureCount) {
            this.min = mz - windowHalfWidth;
            this.max = mz + windowHalfWidth;
            this.structureCount = structureCount;

            // Set the base
            this.accumulator.add(structureCount);
        }

        Double getMinMZ() {
            return min;
        }

        Double getMaxMZ() {
            return max;
        }

        LongAdder getAccumulator() {
            return accumulator;
        }

        Integer getStructureCount() {
            return structureCount;
        }
    }

    public static <T> Map<T, Long> histogram(Stream<T> stream) {
        Map<T, Long> hist = new HashMap<>();
        // This could be done with reduce (fold) or collector cleverness, but this invocation makes the intention clear.
        //stream.forEach(x -> hist.merge(x, 1l, (acc, one) -> one + acc));
        stream.forEach(x -> {
            try {
                hist.put(x, hist.getOrDefault(x, 0l) + 1);
            } catch (NoSuchElementException e) {
                LOGGER.error("Caught no such element exception for %s: %s", x.toString(), e.getMessage());
                throw e;
            }
        });

        return hist;
    }
}