com.act.lcms.db.analysis.ConfigurableAnalysis.java Source code

Java tutorial

Introduction

Here is the source code for com.act.lcms.db.analysis.ConfigurableAnalysis.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms.db.analysis;

import com.act.lcms.Gnuplotter;
import com.act.lcms.db.io.DB;
import com.act.lcms.db.io.LoadPlateCompositionIntoDB;
import com.act.utils.TSVParser;
import com.act.lcms.db.model.LCMSWell;
import com.act.lcms.db.model.Plate;
import com.act.lcms.db.model.ScanFile;
import com.act.lcms.db.model.StandardWell;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;

import java.io.File;
import java.io.FileOutputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ConfigurableAnalysis {
    public static final String OPTION_DIRECTORY = "d";
    public static final String OPTION_OUTPUT_PREFIX = "o";
    public static final String OPTION_CONFIG_FILE = "c";
    public static final String OPTION_USE_HEATMAP = "e";
    public static final String OPTION_FONT_SCALE = "s";
    public static final String OPTION_USE_SNR = "r";

    /* TODO: this format was based on a TSV sample Chris produced.  We should improve it to better match the format of the
     * data that is available in the DB. */
    public static final String HEADER_SAMPLE = "trigger/sample";
    public static final String HEADER_LABEL = "label";
    public static final String HEADER_EXACT_MASS = "exact_mass";
    public static final String HEADER_PRECISION = "precision";
    public static final String HEADER_INTENSITY_RANGE_GROUP = "normalization_group";
    public static final String[] EXPECTED_HEADER_FIELDS = new String[] { HEADER_SAMPLE, HEADER_LABEL,
            HEADER_EXACT_MASS, HEADER_PRECISION, HEADER_INTENSITY_RANGE_GROUP };

    public static final String HELP_MESSAGE = StringUtils.join(
            new String[] { "LCMS analysis class that takes as its input a configuration TSV and outputs plots ",
                    "of the plates/masses specified in that configuration file. The expected rows in the ",
                    "configuration file are based on a mockup Chris created, and look like:\n",
                    "`<plate barcode>|<well coordinates>\\t<label>\\t<mass or chemical>\\t",
                    "<resolution (only 0.01 or 0.001)>\\t<y axis range group>`\n\n", "Expected TSV fields are:\n",
                    StringUtils.join(EXPECTED_HEADER_FIELDS, ", "), "\n\n" },
            "");
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
    static {
        HELP_FORMATTER.setWidth(100);
    }

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_DIRECTORY).argName("directory")
                    .desc("The directory where LCMS analysis results live").hasArg().required()
                    .longOpt("data-dir"));
            add(Option.builder(OPTION_OUTPUT_PREFIX).argName("output prefix")
                    .desc("A prefix for the output data/pdf files").hasArg().required().longOpt("output-prefix"));
            add(Option.builder(OPTION_CONFIG_FILE).argName("configuration file")
                    .desc("A configuration TSV that determines the layout of the analysis").hasArg().required()
                    .longOpt("config-file"));
            add(Option.builder(OPTION_USE_SNR)
                    .desc("Use signal-to-noise ratio instead of max intensity for peak identification")
                    .longOpt("use-snr"));

            add(Option.builder(OPTION_USE_HEATMAP).desc("Produce a heat map rather than a 2d line plot")
                    .longOpt("heat-map"));

            add(Option.builder(OPTION_FONT_SCALE).argName("font scale").desc(
                    "A Gnuplot fontscale value, should be between 0.1 and 0.5 (0.4 works if the graph text is large")
                    .hasArg().longOpt("font-scale"));
        }
    };
    static {
        // Add DB connection options.
        OPTION_BUILDERS.addAll(DB.DB_OPTION_BUILDERS);
    }

    public static class AnalysisStep {
        public enum KIND {
            HEADER, SEPARATOR_LARGE, SEPARATOR_SMALL, SAMPLE,
        }

        private Integer index;
        private KIND kind;
        private String plateBarcode;
        private String plateCoords;
        private String label;
        private Double exactMass;
        private Boolean useFineGrainedMZTolerance;
        private Integer intensityRangeGroup;

        public AnalysisStep(Integer index, KIND kind, String plateBarcode, String plateCoords, String label,
                Double exactMass, Boolean useFineGrainedMZTolerance, Integer intensityRangeGroup) {
            this.index = index;
            this.kind = kind;
            this.plateBarcode = plateBarcode;
            this.plateCoords = plateCoords;
            this.label = label;
            this.exactMass = exactMass;
            this.useFineGrainedMZTolerance = useFineGrainedMZTolerance;
            this.intensityRangeGroup = intensityRangeGroup;
        }

        public Integer getIndex() {
            return index;
        }

        public KIND getKind() {
            return kind;
        }

        public String getPlateBarcode() {
            return plateBarcode;
        }

        public String getPlateCoords() {
            return plateCoords;
        }

        public String getLabel() {
            return label;
        }

        public Double getExactMass() {
            return exactMass;
        }

        public Boolean getUseFineGrainedMZTolerance() {
            return useFineGrainedMZTolerance;
        }

        public Integer getIntensityRangeGroup() {
            return intensityRangeGroup;
        }
    }

    public static final Pattern SAMPLE_WELL_PATTERN = Pattern.compile("^(.*)\\|([A-Za-z]+[0-9]+)$");

    public static final String EXPECTED_LOW_PRECISION_STRING = "0.01";
    public static final String EXPECTED_HIGH_PRECISION_STRING = "0.001";

    public static AnalysisStep mapToStep(DB db, Integer index, Map<String, String> map) throws SQLException {
        String sample = map.get(HEADER_SAMPLE);
        if (sample == null || sample.isEmpty()) {
            System.err.format("Missing required field %s in line %d, skipping\n", HEADER_SAMPLE, index);
            return null;
        }
        if (sample.startsWith("<header")) {
            return new AnalysisStep(index, AnalysisStep.KIND.HEADER, null, null, map.get(HEADER_LABEL), null, null,
                    null);
        }
        if (sample.equals("<large_line_separator>")) {
            return new AnalysisStep(index, AnalysisStep.KIND.SEPARATOR_LARGE, null, null, null, null, null, null);
        }
        if (sample.equals("<small_line_separator>")) {
            return new AnalysisStep(index, AnalysisStep.KIND.SEPARATOR_SMALL, null, null, null, null, null, null);
        }

        Matcher sampleMatcher = SAMPLE_WELL_PATTERN.matcher(sample);
        if (!sampleMatcher.matches()) {
            System.err.format("Unable to interpret sample field %s in line %d, skipping\n", sample, index);
            return null;
        }
        String plateBarcode = sampleMatcher.group(1);
        String plateCoords = sampleMatcher.group(2);

        String label = map.get(HEADER_LABEL);
        Pair<String, Double> exactMass = Utils.extractMassFromString(db, map.get(HEADER_EXACT_MASS));

        Boolean useFineGrainedMZTolerance = false;
        String precisionString = map.get(HEADER_PRECISION);
        if (precisionString != null && !precisionString.isEmpty()) {
            switch (map.get(HEADER_PRECISION)) {
            case EXPECTED_LOW_PRECISION_STRING:
                useFineGrainedMZTolerance = false;
                break;
            case EXPECTED_HIGH_PRECISION_STRING:
                useFineGrainedMZTolerance = true;
                break;
            case "low":
                useFineGrainedMZTolerance = false;
                break;
            case "high":
                useFineGrainedMZTolerance = true;
                break;
            case "coarse":
                useFineGrainedMZTolerance = false;
                break;
            case "fine":
                useFineGrainedMZTolerance = true;
                break;
            default:
                System.err.format("Invalid precision value %s, defaulting to coarse-grained analysis\n",
                        precisionString);
                useFineGrainedMZTolerance = false;
                break;
            }
        }

        String intensityGroupString = map.get(HEADER_INTENSITY_RANGE_GROUP);
        Integer intensityGroup = -1;
        if (intensityGroupString != null && !intensityGroupString.isEmpty()) {
            intensityGroup = Integer.parseInt(intensityGroupString);
        }

        return new AnalysisStep(index, AnalysisStep.KIND.SAMPLE, plateBarcode, plateCoords, label,
                exactMass.getRight(), useFineGrainedMZTolerance, intensityGroup);
    }

    // TODO: do we want to make this configurable?  Or should we always just search for M?
    public static final Set<String> SEARCH_IONS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList("M+H")));
    public static final Set<String> EMPTY_SET = Collections.unmodifiableSet(new HashSet<>(0));

    public static void runAnalysis(DB db, File lcmsDir, String outputPrefix, List<AnalysisStep> steps,
            boolean makeHeatmaps, Double fontScale, boolean useSNR) throws SQLException, Exception {
        HashMap<String, Plate> platesByBarcode = new HashMap<>();
        HashMap<Integer, Plate> platesById = new HashMap<>();
        Map<Integer, Pair<List<ScanData<LCMSWell>>, Double>> lcmsResults = new HashMap<>();
        Map<Integer, Pair<List<ScanData<StandardWell>>, Double>> standardResults = new HashMap<>();
        Map<Integer, Double> intensityGroupMaximums = new HashMap<>();
        for (AnalysisStep step : steps) {
            // There's no trace analysis to perform for non-sample steps.
            if (step.getKind() != AnalysisStep.KIND.SAMPLE) {
                continue;
            }

            Plate p = platesByBarcode.get(step.getPlateBarcode());
            if (p == null) {
                p = Plate.getPlateByBarcode(db, step.getPlateBarcode());
                if (p == null) {
                    throw new IllegalArgumentException(
                            String.format("Found invalid plate barcode '%s' for analysis component %d",
                                    step.getPlateBarcode(), step.getIndex()));
                }
                platesByBarcode.put(p.getBarcode(), p);
                platesById.put(p.getId(), p);
            }

            Pair<Integer, Integer> coords = Utils.parsePlateCoordinates(step.getPlateCoords());
            List<Pair<String, Double>> searchMZs = Collections
                    .singletonList(Pair.of("Configured m/z value", step.getExactMass()));
            Double maxIntesnsity = null;
            switch (p.getContentType()) {
            case LCMS:
                // We don't know which of the scans are positive samples and which are negatives, so call them all positive.
                List<LCMSWell> lcmsSamples = Collections.singletonList(LCMSWell.getInstance()
                        .getByPlateIdAndCoordinates(db, p.getId(), coords.getLeft(), coords.getRight()));
                Pair<List<ScanData<LCMSWell>>, Double> lcmsScanData = AnalysisHelper.processScans(db, lcmsDir,
                        searchMZs, ScanData.KIND.POS_SAMPLE, platesById, lcmsSamples,
                        step.getUseFineGrainedMZTolerance(), SEARCH_IONS, EMPTY_SET, useSNR);
                lcmsResults.put(step.getIndex(), lcmsScanData);
                maxIntesnsity = lcmsScanData.getRight();
                break;
            case STANDARD:
                List<StandardWell> standardSamples = Collections
                        .singletonList(StandardWell.getInstance().getStandardWellsByPlateIdAndCoordinates(db,
                                p.getId(), coords.getLeft(), coords.getRight()));
                Pair<List<ScanData<StandardWell>>, Double> standardScanData = AnalysisHelper.processScans(db,
                        lcmsDir, searchMZs, ScanData.KIND.STANDARD, platesById, standardSamples,
                        step.getUseFineGrainedMZTolerance(), SEARCH_IONS, EMPTY_SET, useSNR);
                standardResults.put(step.getIndex(), standardScanData);
                maxIntesnsity = standardScanData.getRight();
                break;
            default:
                throw new IllegalArgumentException(
                        String.format("Invalid plate content kind %s for plate %s in analysis component %d",
                                p.getContentType(), p.getBarcode(), step.getIndex()));
            }
            Double existingMax = intensityGroupMaximums.get(step.getIntensityRangeGroup());
            if (existingMax == null || existingMax < maxIntesnsity) { // TODO: is this the right max intensity?
                intensityGroupMaximums.put(step.getIntensityRangeGroup(), maxIntesnsity);
            }
        }

        // Prep the chart labels/types, write out the data, and plot the charts.
        File dataFile = new File(outputPrefix + ".data");
        List<Gnuplotter.PlotConfiguration> plotConfigurations = new ArrayList<>(steps.size());
        int numGraphs = 0;
        try (FileOutputStream fos = new FileOutputStream(dataFile)) {
            for (AnalysisStep step : steps) {
                if (step.getKind() == AnalysisStep.KIND.HEADER) {
                    // TODO: change the Gnuplotter API to add headings and update this.
                    plotConfigurations.add(new Gnuplotter.PlotConfiguration(
                            Gnuplotter.PlotConfiguration.KIND.HEADER, step.getLabel(), null, null));
                    continue;
                }
                if (step.getKind() == AnalysisStep.KIND.SEPARATOR_LARGE
                        || step.getKind() == AnalysisStep.KIND.SEPARATOR_SMALL) {
                    // TODO: change the Gnuplotter API to add headings and update this.
                    plotConfigurations.add(new Gnuplotter.PlotConfiguration(
                            Gnuplotter.PlotConfiguration.KIND.SEPARATOR, "", null, null));
                    continue;
                }
                Plate p = platesByBarcode.get(step.getPlateBarcode());
                Double maxIntensity = intensityGroupMaximums.get(step.getIntensityRangeGroup());
                switch (p.getContentType()) {
                case LCMS:
                    Pair<List<ScanData<LCMSWell>>, Double> lcmsPair = lcmsResults.get(step.getIndex());
                    if (lcmsPair.getLeft().size() > 1) {
                        System.err.format("Found multiple scan files for LCMW well %s @ %s, using first\n",
                                step.getPlateBarcode(), step.getPlateCoords());
                    }
                    AnalysisHelper.writeScanData(fos, lcmsDir, maxIntensity, lcmsPair.getLeft().get(0),
                            makeHeatmaps, false);
                    break;
                case STANDARD:
                    Pair<List<ScanData<StandardWell>>, Double> stdPair = standardResults.get(step.getIndex());
                    if (stdPair.getLeft().size() > 1) {
                        System.err.format("Found multiple scan files for standard well %s @ %s, using first\n",
                                step.getPlateBarcode(), step.getPlateCoords());
                    }
                    AnalysisHelper.writeScanData(fos, lcmsDir, maxIntensity, stdPair.getLeft().get(0), makeHeatmaps,
                            false);
                    break;
                default:
                    // This case represents a bug, so it's a RuntimeException.
                    throw new RuntimeException(
                            String.format("Found unexpected content type %s for plate %s on analysis step %d",
                                    p.getContentType(), p.getBarcode(), step.getIndex()));
                }
                plotConfigurations.add(new Gnuplotter.PlotConfiguration(Gnuplotter.PlotConfiguration.KIND.GRAPH,
                        step.getLabel(), numGraphs, maxIntensity));
                numGraphs++;
            }

            String fmt = "pdf";
            File imgFile = new File(outputPrefix + "." + fmt);
            Gnuplotter plotter = fontScale == null ? new Gnuplotter() : new Gnuplotter(fontScale);
            if (makeHeatmaps) {
                plotter.plotHeatmap(dataFile.getAbsolutePath(), imgFile.getAbsolutePath(), fmt, null, null,
                        plotConfigurations, imgFile + ".gnuplot");
            } else {
                plotter.plot2D(dataFile.getAbsolutePath(), imgFile.getAbsolutePath(), "time", "intensity", fmt,
                        null, null, plotConfigurations, imgFile + ".gnuplot");
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                    true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                    true);
            return;
        }

        File lcmsDir = new File(cl.getOptionValue(OPTION_DIRECTORY));
        if (!lcmsDir.isDirectory()) {
            System.err.format("File at %s is not a directory\n", lcmsDir.getAbsolutePath());
            HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                    true);
            System.exit(1);
        }

        Double fontScale = null;
        if (cl.hasOption("font-scale")) {
            try {
                fontScale = Double.parseDouble(cl.getOptionValue("font-scale"));
            } catch (IllegalArgumentException e) {
                System.err.format("Argument for font-scale must be a floating point number.\n");
                System.exit(1);
            }
        }

        File configFile = new File(cl.getOptionValue(OPTION_CONFIG_FILE));
        if (!configFile.isFile()) {
            throw new IllegalArgumentException(
                    String.format("Not a regular file at %s", configFile.getAbsolutePath()));
        }
        TSVParser parser = new TSVParser();
        parser.parse(configFile);

        try (DB db = DB.openDBFromCLI(cl)) {
            System.out.format("Loading/updating LCMS scan files into DB\n");
            ScanFile.insertOrUpdateScanFilesInDirectory(db, lcmsDir);

            List<AnalysisStep> steps = new ArrayList<>(parser.getResults().size());
            int i = 0;
            for (Map<String, String> row : parser.getResults()) {
                AnalysisStep step = mapToStep(db, i, row);
                if (step != null) {
                    System.out.format("%d: %s '%s' %s %s %f %s\n", step.getIndex(), step.getKind(), step.getLabel(),
                            step.getPlateBarcode(), step.getPlateCoords(), step.getExactMass(),
                            step.getUseFineGrainedMZTolerance());
                }
                steps.add(step);
                i++;
            }

            System.out.format("Running analysis\n");
            runAnalysis(db, lcmsDir, cl.getOptionValue(OPTION_OUTPUT_PREFIX), steps,
                    cl.hasOption(OPTION_USE_HEATMAP), fontScale, cl.hasOption(OPTION_USE_SNR));
        }
    }
}