com.act.biointerpretation.l2expansion.L2ExpansionDriver.java Source code

Introduction

Here is the source code for com.act.biointerpretation.l2expansion.L2ExpansionDriver.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.l2expansion;

import act.server.MongoDB;
import act.shared.Chemical;
import chemaxon.struc.Molecule;
import com.act.biointerpretation.Utils.ReactionProjector;
import com.act.biointerpretation.mechanisminspection.ErosCorpus;
import com.act.biointerpretation.sars.SarCorpus;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

/**
 * Command-line driver for L2 expansion.
 *
 * <p>Reads a list of substrate inchis, optionally filters them by mass, builds an {@link L2Expander} of the
 * requested {@link ExpansionType}, runs it, and writes the resulting prediction corpus as json to the output
 * file. The unique product inchis are additionally written to a sibling file named
 * {@code <output path>.inchis}.
 *
 * <p>{@link #getRunnableOneSubstrateRoExpander(List, File, File)} exposes the one-substrate RO expansion as a
 * {@link JavaRunnable} so that it can be used outside of the command line.
 */
public class L2ExpansionDriver {

    private static final Logger LOGGER = LogManager.getFormatterLogger(L2ExpansionDriver.class);

    // Value handed to filterByMass when no mass threshold option was supplied.
    private static final Integer NO_MASS_THRESHOLD = Integer.MAX_VALUE;

    private static final String OPTION_METABOLITES = "m";
    private static final String OPTION_MASS_THRESHOLD = "M";
    private static final String OPTION_RO_CORPUS = "c";
    private static final String OPTION_RO_IDS = "r";
    private static final String OPTION_SAR_CORPUS = "s";
    private static final String OPTION_OUTPUT_PATH = "o";
    private static final String OPTION_PROGRESS_PATH = "p";
    private static final String OPTION_DB = "db";
    private static final String OPTION_EXPANSION_TYPE = "t";
    // Must differ from OPTION_PROGRESS_PATH: when both shared "p", supplying either option made
    // hasOption/getOptionValue answer for both, so the additional chemicals file was also treated as the
    // progress file and rejected because it already existed.
    private static final String OPTION_ADDITIONAL_CHEMICALS = "a";
    private static final String OPTION_HELP = "h";

    public static final String HELP_MESSAGE = "This class is used to carry out L2 expansion. It first applies every RO from the input RO list to "
            + "every metabolite in the input metabolite list.  Example input lists can be found on the NAS at "
            + "MNT_SHARED_DATA/Gil/resources. This creates a list of predicted reactions, which are augmented "
            + "with chemical ids and names, as well as reaction ids from the database. At the end of the run, "
            + "the predictions are printed to a json file.";

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_METABOLITES).argName("metabolites path name")
                    .desc("The absolute path to the metabolites file.").hasArg().longOpt("metabolite-file")
                    .required(true));
            add(Option.builder(OPTION_MASS_THRESHOLD).argName("mass threshold").desc(
                    "The maximum mass of a substrate, in daltons. Substrates with higher mass will be discarded.")
                    .hasArg().longOpt("mass-threshold").type(Integer.class));
            add(Option.builder(OPTION_RO_CORPUS).argName("ro corpus").desc(
                    "The path to the file containing the eros corpus, if not the validation corpus. Ignored if "
                            + "running a SAR expansion.")
                    .hasArg().longOpt("ro-corpus"));
            add(Option.builder(OPTION_RO_IDS).argName("ro ids path name")
                    .desc("The path to a file containing the RO ids to use. If this option is omitted, "
                            + "all ROs in the corpus are used. Ignored if running a SAR expansion.")
                    .hasArg().longOpt("ro-file"));
            add(Option.builder(OPTION_SAR_CORPUS).argName("sar corpus").desc(
                    "The path to a file containing the sar corpus to use. Ignored if running an RO-only expansion.")
                    .hasArg().longOpt("sar-corpus"));
            // NOTE(review): this is a second registration under the same short option as "ro-file" above.
            // It is kept so that both long names continue to be accepted; both are read through OPTION_RO_IDS.
            // Consider retiring one of the two long names.
            add(Option.builder(OPTION_RO_IDS).argName("ro ids")
                    .desc("The absolute path to the file containing the RO ids to use. If this option is omitted, "
                            + "all ROs in the corpus are used.")
                    .hasArg().longOpt("ro-ids"));
            add(Option.builder(OPTION_OUTPUT_PATH).argName("output file path")
                    .desc("The path to the file to which to write the json file of predicted reactions.").hasArg()
                    .longOpt("output-file-path").required(true));
            add(Option.builder(OPTION_PROGRESS_PATH).argName("progress file path").desc(
                    "The path to the file to which to write the json file of predicted reactions as each projection runs.")
                    .hasArg().longOpt("progress-file-path"));
            add(Option.builder(OPTION_DB).argName("db name").desc("The name of the mongo DB to use.").hasArg()
                    .longOpt("db-name"));
            add(Option.builder(OPTION_EXPANSION_TYPE).argName("type of expansion").desc(
                    "Type can take values: {ONE_SUB, TWO_SUB, SAR}.  ONE_SUB and TWO_SUB operate with only ROs, on one "
                            + "and two substrates, respectively, using only ROs. SAR runs an expansion from a SarCorpus, which "
                            + "still applies ROs but additionally constrains the substrates of each RO based on the supplied SARs.")
                    .hasArg().longOpt("expansion-type").required(true));
            add(Option.builder(OPTION_ADDITIONAL_CHEMICALS).argName("additional chemicals path name")
                    .desc("The absolute path to the additional chemicals file.").hasArg()
                    .longOpt("additional-chemicals-file"));
            add(Option.builder(OPTION_HELP).argName("help").desc("Prints this help message.").longOpt("help"));
        }
    };

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    // Host and port used to connect to mongo for the two substrate expansion.
    public static final String LOCAL_HOST = "localhost";
    public static final Integer PORT_NUMBER = 27017;

    /**
     * The kinds of expansion that can be requested through the expansion type option.
     */
    public enum ExpansionType {
        ONE_SUB, TWO_SUB, SAR,
    }

    /**
     * Parses the command line, runs the requested expansion and writes the results.
     *
     * <p>Any invalid argument or unusable file is logged and terminates the JVM with exit code 1.
     *
     * @param args Command line arguments; see {@link #OPTION_BUILDERS}.
     * @throws Exception If reading the inputs, running the expansion or writing the outputs fails.
     */
    public static void main(String[] args) throws Exception {

        // Build command line parser.
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s", e.getMessage());
            HELP_FORMATTER.printHelp(L2ExpansionDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        // Print help.
        if (cl.hasOption(OPTION_HELP)) {
            HELP_FORMATTER.printHelp(L2ExpansionDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            return;
        }

        // Get output files. Existing files are never overwritten.
        String outputPath = cl.getOptionValue(OPTION_OUTPUT_PATH);
        File outputFile = new File(outputPath);
        if (outputFile.isDirectory() || outputFile.exists()) {
            LOGGER.error("Supplied output file is a directory or already exists.");
            System.exit(1);
        }
        outputFile.createNewFile();
        File inchiOutputFile = new File(outputPath + ".inchis");
        if (inchiOutputFile.isDirectory() || inchiOutputFile.exists()) {
            LOGGER.error("Supplied inchi output file is a directory or already exists.");
            System.exit(1);
        }
        inchiOutputFile.createNewFile();

        Optional<OutputStream> maybeProgressStream = Optional.empty();
        if (cl.hasOption(OPTION_PROGRESS_PATH)) {
            String progressPath = cl.getOptionValue(OPTION_PROGRESS_PATH);
            File progressFile = new File(progressPath);
            LOGGER.info("Writing incremental results to file at %s", progressFile.getAbsolutePath());
            if (progressFile.isDirectory() || progressFile.exists()) {
                LOGGER.error("Supplied progress file is a directory or already exists.");
                System.exit(1);
            }
            maybeProgressStream = Optional.of(new FileOutputStream(progressFile));
        }

        L2PredictionCorpus predictionCorpus;
        // Ensures the progress stream, if one was opened, is closed once the expansion has finished or
        // failed. try-with-resources skips a null resource, which covers the "no progress file" case.
        try (OutputStream progressStream = maybeProgressStream.orElse(null)) {
            // Get metabolite list
            L2InchiCorpus inchiCorpus = getInchiCorpus(cl, OPTION_METABOLITES);
            LOGGER.info("%d substrate inchis.", inchiCorpus.getInchiList().size());

            Integer maxMass = NO_MASS_THRESHOLD;
            if (cl.hasOption(OPTION_MASS_THRESHOLD)) {
                maxMass = Integer.parseInt(cl.getOptionValue(OPTION_MASS_THRESHOLD));
                LOGGER.info("Filtering out substrates with mass more than %d daltons.", maxMass);
            }
            inchiCorpus.filterByMass(maxMass);
            LOGGER.info("%d substrate inchis that are importable as molecules.",
                    inchiCorpus.getInchiList().size());

            PredictionGenerator generator = new AllPredictionsGenerator(new ReactionProjector());

            L2Expander expander = buildExpander(cl, inchiCorpus, generator);
            predictionCorpus = expander.getPredictions(maybeProgressStream);
        }

        LOGGER.info("Done with L2 expansion. Produced %d predictions.", predictionCorpus.getCorpus().size());

        LOGGER.info("Writing corpus to file.");
        predictionCorpus.writePredictionsToJsonFile(outputFile);
        L2InchiCorpus productInchis = new L2InchiCorpus(predictionCorpus.getUniqueProductInchis());
        productInchis.writeToFile(inchiOutputFile);
        LOGGER.info("L2ExpansionDriver complete!");
    }

    /**
     * Builds the expander matching the expansion type given on the command line.
     *
     * @param cl Parsed command line.
     * @param inchiCorpus The substrates to expand on.
     * @param generator The prediction generator handed to the expander.
     * @return The expander for the requested {@link ExpansionType}.
     * @throws IOException If a corpus file cannot be read.
     * @throws IllegalArgumentException If the supplied expansion type is not an {@link ExpansionType} name.
     */
    private static L2Expander buildExpander(CommandLine cl, L2InchiCorpus inchiCorpus,
            PredictionGenerator generator) throws IOException {

        ExpansionType expansionType = ExpansionType.valueOf(cl.getOptionValue(OPTION_EXPANSION_TYPE));

        switch (expansionType) {
        case ONE_SUB:
            LOGGER.info("Running one substrate expansion");
            return new SingleSubstrateRoExpander(getRoCorpus(cl), inchiCorpus.getMolecules(), generator);

        case TWO_SUB:
            LOGGER.info("Running two substrate expansion.");
            LOGGER.warn("This functionality is still experimental as it is not currently tested.");
            if (!cl.hasOption(OPTION_ADDITIONAL_CHEMICALS)) {
                LOGGER.error("Must supply additional chemicals file for two substrate expansion.");
                System.exit(1);
            }
            // NOTE(review): the db name is null when the db option is omitted; confirm how MongoDB treats that.
            MongoDB mongoDB = new MongoDB(LOCAL_HOST, PORT_NUMBER, cl.getOptionValue(OPTION_DB)); // Start mongo instance.
            L2InchiCorpus chemicalInchis = getInchiCorpus(cl, OPTION_ADDITIONAL_CHEMICALS);
            List<Chemical> chemicalsOfInterest = L2ExpansionDriver
                    .convertListOfInchisToMolecules(chemicalInchis.getInchiList(), mongoDB);
            List<Chemical> metaboliteChemicals = L2ExpansionDriver
                    .convertListOfInchisToMolecules(inchiCorpus.getInchiList(), mongoDB);
            return new TwoSubstrateRoExpander(chemicalsOfInterest, metaboliteChemicals, getRoCorpus(cl), generator);

        case SAR:
            LOGGER.info("Running sar-based expansion.");
            // Without this check a missing option would surface as a NullPointerException from new File(null).
            if (!cl.hasOption(OPTION_SAR_CORPUS)) {
                LOGGER.error("Must supply sar corpus file for sar-based expansion.");
                System.exit(1);
            }
            File sarCorpusFile = new File(cl.getOptionValue(OPTION_SAR_CORPUS));
            if (!sarCorpusFile.exists() || sarCorpusFile.isDirectory()) {
                LOGGER.error("Sar corpus is not a valid file.");
                System.exit(1);
            }
            SarCorpus sarCorpus = SarCorpus.readCorpusFromJsonFile(sarCorpusFile);
            return new SingleSubstrateSarExpander(sarCorpus, inchiCorpus.getMolecules(), generator);

        default:
            throw new IllegalArgumentException("Invalid expansion type.");
        }
    }

    /**
     * Loads the RO corpus selected on the command line.
     *
     * <p>The corpus is read from the RO corpus file if that option is present, and from the validation corpus
     * otherwise. If an RO ids file is supplied, the corpus is then filtered by that file.
     *
     * @param cl Parsed command line.
     * @return The loaded, and possibly filtered, RO corpus.
     * @throws IOException If a corpus or ids file cannot be read.
     */
    private static ErosCorpus getRoCorpus(CommandLine cl) throws IOException {
        ErosCorpus eroCorpus = new ErosCorpus();
        if (cl.hasOption(OPTION_RO_CORPUS)) {
            File roCorpusFile = new File(cl.getOptionValue(OPTION_RO_CORPUS));

            if (!roCorpusFile.exists()) {
                LOGGER.error("Ro corpus file does not exist.");
                System.exit(1);
            }
            // Close the stream once the corpus has been loaded instead of leaking the file handle.
            try (FileInputStream roInputStream = new FileInputStream(roCorpusFile)) {
                eroCorpus.loadCorpus(roInputStream);
            }
        } else {
            eroCorpus.loadValidationCorpus();
        }

        if (cl.hasOption(OPTION_RO_IDS)) {
            LOGGER.info("Filtering corpus by RO list from rosFile.");
            File roIdsFile = new File(cl.getOptionValue(OPTION_RO_IDS));

            if (!roIdsFile.exists()) {
                LOGGER.error("Ro ids file does not exist.");
                System.exit(1);
            }

            eroCorpus.filterCorpusByIdFile(roIdsFile);
        } else {
            LOGGER.info("Leaving all ROs in corpus.");
        }

        return eroCorpus;
    }

    /**
     * Loads an inchi corpus from the file named by a command line option.
     *
     * @param cl Parsed command line.
     * @param optionForFileName Option whose value is the path of the inchi file. Either the metabolite list or
     * the additional chemical list. The option must have been supplied.
     * @return The corpus loaded from the file.
     * @throws IOException If the file cannot be read.
     */
    private static L2InchiCorpus getInchiCorpus(CommandLine cl, String optionForFileName) throws IOException {
        File inchisFile = new File(cl.getOptionValue(optionForFileName));
        LOGGER.info("Getting inchi list from %s", inchisFile);
        L2InchiCorpus inchiCorpus = new L2InchiCorpus();
        inchiCorpus.loadCorpus(inchisFile);
        return inchiCorpus;
    }

    /**
     * Looks up the chemical entry for each inchi in the given list.
     *
     * @param inchis A list of inchis
     * @param mongoDB The db from which to get the chemical entries
     * @return A list holding, for each input inchi and in the same order, whatever the db lookup returned.
     * NOTE(review): entries may presumably be null for inchis the db does not know; confirm against MongoDB.
     */
    private static List<Chemical> convertListOfInchisToMolecules(List<String> inchis, MongoDB mongoDB) {
        List<Chemical> result = new ArrayList<>();
        for (String inchi : inchis) {
            result.add(mongoDB.getChemicalFromInChI(inchi));
        }
        return result;
    }

    /**
     * Wraps L2 expansion so that it can be used in a workflow. The inputs are a list of RO IDs to expand on,
     * a file containing the substrates to apply the ROs to, and a file to which to write the output prediction corpus.
     *
     * <p>When run, the returned job checks both files through {@link FileChecker}, filters the validation RO
     * corpus down to the given ids, runs a one-substrate RO expansion over the substrates, and writes the
     * predictions as json to the output file.
     *
     * @param roIds The ids of the ROs to keep from the validation corpus.
     * @param substrateListFile The file from which the substrate inchi corpus is loaded.
     * @param outputFile The file to which the prediction corpus is written.
     * @return A runnable that performs the expansion; nothing is executed until it is run.
     */
    public static JavaRunnable getRunnableOneSubstrateRoExpander(List<Integer> roIds, File substrateListFile,
            File outputFile) {
        return new JavaRunnable() {
            @Override
            public void run() throws IOException {
                // Verify files
                FileChecker.verifyInputFile(substrateListFile);
                FileChecker.verifyAndCreateOutputFile(outputFile);

                // Handle input ros
                ErosCorpus roCorpus = new ErosCorpus();
                roCorpus.loadValidationCorpus();
                roCorpus.filterCorpusById(roIds);

                // Handle input substrates
                L2InchiCorpus inchis = new L2InchiCorpus();
                inchis.loadCorpus(substrateListFile);
                List<Molecule> moleculeList = inchis.getMolecules();

                // Build expander
                PredictionGenerator generator = new AllPredictionsGenerator(new ReactionProjector());
                L2Expander expander = new SingleSubstrateRoExpander(roCorpus, moleculeList, generator);

                // Run expander
                L2PredictionCorpus predictions = expander.getPredictions();

                // Write output
                predictions.writePredictionsToJsonFile(outputFile);
            }

            @Override
            public String toString() {
                return "oneSubstrateRoExpander:" + roIds.toString();
            }
        };
    }
}