com.act.biointerpretation.l2expansion.L2FilteringDriver.java Source code

Introduction

Here is the source code for com.act.biointerpretation.l2expansion.L2FilteringDriver.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.l2expansion;

import act.server.MongoDB;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;

/**
 * Command line driver that post-processes an {@link L2PredictionCorpus} read from a json file.
 *
 * <p>Exactly one of three modes runs per invocation, checked in this order:
 * <ol>
 *   <li>substrate filtering (option {@code -S}): keeps only predictions whose substrate inchis are all
 *       contained in a supplied inchi list, writes the result to the output path and stops;</li>
 *   <li>splitting (option {@code -s}): splits the corpus by projector name and writes one file per key,
 *       named {@code <output path>.<key>}, and stops;</li>
 *   <li>otherwise: optionally runs chemical/reaction DB lookups, applies the chemical and reaction
 *       filters that were requested, and writes the resulting corpus to the output path.</li>
 * </ol>
 */
public class L2FilteringDriver {

    private static final Logger LOGGER = LogManager.getFormatterLogger(L2FilteringDriver.class);

    private static final String OPTION_INPUT_CORPUS = "i";
    private static final String OPTION_OUTPUT_PATH = "o";
    private static final String OPTION_CHEMICAL_FILTER = "c";
    private static final String OPTION_REACTION_FILTER = "r";
    private static final String OPTION_DB_LOOKUP = "d";
    private static final String OPTION_LOOKUP_TYPES = "L";
    private static final String OPTION_SPLIT_BY_RO = "s";
    private static final String OPTION_FILTER_SUBSTRATES = "S";
    private static final String OPTION_HELP = "h";
    private static final String LONG_OPTION_HELP = "help";

    // Accepted values of the chemical/reaction filter options.
    private static final String APPLY_FILTER_POSITIVE = "1";
    private static final String APPLY_FILTER_NEGATED = "0";
    // Accepted values of the lookup types option.
    private static final String LOOKUP_REACTIONS = "r";
    private static final String LOOKUP_CHEMICALS = "c";

    public static final String HELP_MESSAGE = "This class is used to filter an L2PredictionCorpus. An initial corpus is read in from file, processed based on "
            + "the selected options, and then the result is printed in json format.";

    public static final List<Option.Builder> OPTION_BUILDERS = buildOptionBuilders();

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    /**
     * Holds when the prediction has as many product ids as products and as many substrate ids as
     * substrates. NOTE(review): presumably the id collections are populated by the chemical DB lookup,
     * so equal sizes mean every chemical was found -- confirm against ChemicalsTransformer.
     */
    private static final Predicate<L2Prediction> ALL_CHEMICALS_IN_DB = prediction -> prediction.getProductIds()
            .size() == prediction.getProducts().size()
            && prediction.getSubstrateIds().size() == prediction.getSubstrates().size();

    /** Holds when the prediction reports at least one reaction. */
    private static final Predicate<L2Prediction> REACTION_MATCHES_DB = prediction -> prediction
            .getReactionCount() > 0;

    /**
     * Builds the list of option builders. A plain {@link ArrayList} is populated here instead of using
     * double-brace initialization, which would create an anonymous subclass of ArrayList.
     *
     * @return a mutable list holding one builder per supported command line option
     */
    private static List<Option.Builder> buildOptionBuilders() {
        List<Option.Builder> builders = new ArrayList<>();
        builders.add(Option.builder(OPTION_INPUT_CORPUS).argName("input corpus path")
                .desc("The absolute path to the input prediction corpus.").hasArg().longOpt("input-corpus-path")
                .required(true));
        builders.add(Option.builder(OPTION_OUTPUT_PATH).argName("output path")
                .desc("The path to which to write the output.").hasArg().longOpt("output-path").required(true));
        builders.add(Option.builder(OPTION_CHEMICAL_FILTER).argName("chemical db filter")
                .desc("Use the chemical filter.  Input the value " + APPLY_FILTER_POSITIVE
                        + " to keep predictions whose " + "chemicals were all found in the DB, or "
                        + APPLY_FILTER_NEGATED + " to keep those whose chemicals "
                        + "were not all found. This step must either be run on a corpus that already has chemical DB info, or "
                        + "supplied in conjunction with the db-lookup option to populate the chemical info fields before filtering.")
                .hasArg().longOpt("chemical-db-filter"));
        builders.add(Option.builder(OPTION_REACTION_FILTER).argName("reaction db filter")
                .desc("Use the reaction filter.  Input the value " + APPLY_FILTER_POSITIVE
                        + " to keep predictions which " + "match a reaction in the DB, or "
                        + APPLY_FILTER_NEGATED + " to keep those which don't. This step must "
                        + "either be run on a corpus that already has reaction DB info, or supplied in conjunction with the db-lookup "
                        + "option to populate the reaction info fields before filtering.")
                .hasArg().longOpt("reaction-db-filter"));
        builders.add(Option.builder(OPTION_DB_LOOKUP).argName("db name").desc(
                "Mongo DB to use for lookups; needed only if population of chemical and reaction DB info is desired.")
                .hasArg().longOpt("db-name"));
        builders.add(Option.builder(OPTION_LOOKUP_TYPES).argName("db lookup types")
                .desc("This argument specifies which lookup types to use. Use " + LOOKUP_CHEMICALS
                        + " for chemical lookups, " + LOOKUP_REACTIONS
                        + " for reaction lookups, or both. These lookups compare the predictions against our DB "
                        + "and populate the chemical and reaction fields of the L2Predictions accordingly.")
                .hasArgs().valueSeparator(',').longOpt("db-lookup-types"));
        builders.add(Option.builder(OPTION_SPLIT_BY_RO).argName("split by ro").desc(
                "If this argument is selected, the input corpus is read in, split up by ro, and written out into a "
                        + "different output file for each ro found in the corpus. The files have the ro id appended to the end of "
                        + "their names to distinguish them.")
                .longOpt("split-by-ro"));
        builders.add(Option.builder(OPTION_FILTER_SUBSTRATES).argName("filter substrates path").desc(
                "If this argument is selected, a list of substrates to keep is fed in, and the corpus is filtered "
                        + "to preserve only predictions with substrates among that list.")
                .hasArg().longOpt("filter-substrates"));
        builders.add(Option.builder(OPTION_HELP).argName("help").desc("Prints this help message.")
                .longOpt(LONG_OPTION_HELP));
        return builders;
    }

    /**
     * Entry point: parses the command line, reads the input corpus, runs the selected mode and writes
     * the result. Exits with status 1 on invalid arguments, a missing input corpus, or an output path
     * that is a directory.
     *
     * @param args the command line arguments
     * @throws Exception if reading, transforming or writing the corpus fails
     */
    public static void main(String[] args) throws Exception {

        // Build command line parser.
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        // The input and output options are required, so the parser would reject a command line that
        // contains only the help flag. Handle the help request before parsing.
        if (isHelpRequested(args)) {
            printHelp(opts);
            return;
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s", e.getMessage());
            printHelp(opts);
            System.exit(1);
        }

        // Print help (covers forms of the help flag that the pre-parse scan does not match exactly).
        if (cl.hasOption(OPTION_HELP)) {
            printHelp(opts);
            return;
        }

        checkFilterOptionIsValid(OPTION_CHEMICAL_FILTER, cl);
        checkFilterOptionIsValid(OPTION_REACTION_FILTER, cl);

        // Get corpus files.
        File corpusFile = new File(cl.getOptionValue(OPTION_INPUT_CORPUS));
        if (!corpusFile.exists()) {
            LOGGER.error("Input corpus file does not exist: %s", corpusFile.getAbsolutePath());
            // A missing input is a failure; report it through the exit status like the other fatal errors.
            System.exit(1);
        }

        File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_PATH));
        if (outputFile.isDirectory()) {
            LOGGER.error("Output file is directory.");
            System.exit(1);
        }

        boolean filterBySubstrates = cl.hasOption(OPTION_FILTER_SUBSTRATES);
        boolean splitByRo = cl.hasOption(OPTION_SPLIT_BY_RO);

        // Create the output file up front so an unwritable path fails before the corpus is read. The
        // split mode writes only to "<output path>.<key>" files, so creating the bare output path there
        // would just leave a stray empty file behind. Substrate filtering takes precedence over
        // splitting, hence the first operand.
        if (filterBySubstrates || !splitByRo) {
            outputFile.createNewFile();
        }

        LOGGER.info("Reading corpus from file.");
        L2PredictionCorpus predictionCorpus = L2PredictionCorpus.readPredictionsFromJsonFile(corpusFile);
        LOGGER.info("Read in corpus with %d predictions.", predictionCorpus.getCorpus().size());
        LOGGER.info("Corpus has %d distinct substrates.", predictionCorpus.getUniqueSubstrateInchis().size());

        if (filterBySubstrates) {
            LOGGER.info("Filtering by substrates.");
            File substratesFile = new File(cl.getOptionValue(OPTION_FILTER_SUBSTRATES));
            L2InchiCorpus inchis = new L2InchiCorpus();
            inchis.loadCorpus(substratesFile);
            Set<String> inchiSet = new HashSet<>(inchis.getInchiList());

            predictionCorpus = predictionCorpus
                    .applyFilter(prediction -> inchiSet.containsAll(prediction.getSubstrateInchis()));

            predictionCorpus.writePredictionsToJsonFile(outputFile);
            LOGGER.info("Done writing filtered corpus to file.");
            return;
        }

        if (splitByRo) {
            LOGGER.info("Splitting corpus into distinct corpuses for each ro.");
            Map<String, L2PredictionCorpus> corpusMap = predictionCorpus
                    .splitCorpus(prediction -> prediction.getProjectorName());

            for (Map.Entry<String, L2PredictionCorpus> entry : corpusMap.entrySet()) {
                String fileName = cl.getOptionValue(OPTION_OUTPUT_PATH) + "." + entry.getKey();
                File oneOutputFile = new File(fileName);
                entry.getValue().writePredictionsToJsonFile(oneOutputFile);
            }
            LOGGER.info("Done writing split corpuses to file.");
            return;
        }

        predictionCorpus = runDbLookups(cl, predictionCorpus, opts);

        LOGGER.info("Applying filters.");
        predictionCorpus = applyFilter(predictionCorpus, ALL_CHEMICALS_IN_DB, cl, OPTION_CHEMICAL_FILTER);
        predictionCorpus = applyFilter(predictionCorpus, REACTION_MATCHES_DB, cl, OPTION_REACTION_FILTER);
        LOGGER.info("Filtered corpus has %d predictions.", predictionCorpus.getCorpus().size());

        LOGGER.info("Printing final corpus.");
        predictionCorpus.writePredictionsToJsonFile(outputFile);

        LOGGER.info("L2FilteringDriver complete!");
    }

    /**
     * Tells whether the raw arguments contain the short or long help flag.
     *
     * @param args the raw command line arguments
     * @return true if any argument equals {@code -h} or {@code --help}
     */
    private static boolean isHelpRequested(String[] args) {
        String shortFlag = "-" + OPTION_HELP;
        String longFlag = "--" + LONG_OPTION_HELP;
        for (String arg : args) {
            if (shortFlag.equals(arg) || longFlag.equals(arg)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Prints the usage message for this driver.
     *
     * @param opts the options to describe
     */
    private static void printHelp(Options opts) {
        HELP_FORMATTER.printHelp(L2FilteringDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    }

    /**
     * Runs the requested DB lookups on the corpus. Lookups happen only when both a DB name and at least
     * one lookup type were supplied; otherwise the corpus is returned unchanged (with a warning if only
     * one of the two options was given). Exits with status 1 if an unknown lookup type is supplied.
     *
     * @param cl the parsed command line
     * @param predictionCorpus the corpus to transform
     * @param opts the options, used to print help on invalid input
     * @return the corpus after the selected lookups have been applied
     * @throws IOException declared for the corpus transformations
     */
    private static L2PredictionCorpus runDbLookups(CommandLine cl, L2PredictionCorpus predictionCorpus,
            Options opts) throws IOException {

        boolean dbSupplied = cl.hasOption(OPTION_DB_LOOKUP);
        boolean lookupTypesSupplied = cl.hasOption(OPTION_LOOKUP_TYPES);

        if (!dbSupplied) {
            if (lookupTypesSupplied) {
                LOGGER.warn("Lookup types supplied but no Mongo DB name given; skipping DB lookups.");
            }
            return predictionCorpus;
        }
        if (!lookupTypesSupplied) {
            LOGGER.warn("Mongo DB name supplied but no lookup types selected; skipping DB lookups.");
            return predictionCorpus;
        }

        // Validate every lookup type before touching the DB, so bad input never opens a connection.
        Set<String> lookupSet = new HashSet<>();
        for (String option : cl.getOptionValues(OPTION_LOOKUP_TYPES)) {
            if (!option.equals(LOOKUP_CHEMICALS) && !option.equals(LOOKUP_REACTIONS)) {
                LOGGER.error("Invalid lookup option supplied: %s", option);
                printHelp(opts);
                System.exit(1);
            }
            lookupSet.add(option);
        }

        LOGGER.info("Instantiating mongoDB.");
        MongoDB mongoDB = new MongoDB("localhost", 27017, cl.getOptionValue(OPTION_DB_LOOKUP));

        if (lookupSet.contains(LOOKUP_CHEMICALS)) {
            LOGGER.info("Looking up chemicals in DB.");
            predictionCorpus = predictionCorpus.applyTransformation(new ChemicalsTransformer(mongoDB));
        }
        if (lookupSet.contains(LOOKUP_REACTIONS)) {
            LOGGER.info("Looking up reactions in DB.");
            predictionCorpus = predictionCorpus.applyTransformation(new ReactionsTransformer(mongoDB));
        }
        return predictionCorpus;
    }

    /**
     * Verifies that a filter option, if present, carries one of the two accepted values.
     *
     * @param filterOption the short name of the filter option to check
     * @param cl the parsed command line
     * @throws IllegalArgumentException if the option is present with any other value
     */
    private static void checkFilterOptionIsValid(String filterOption, CommandLine cl) {
        if (!cl.hasOption(filterOption)) {
            return;
        }
        String value = cl.getOptionValue(filterOption);
        if (APPLY_FILTER_POSITIVE.equals(value) || APPLY_FILTER_NEGATED.equals(value)) {
            return;
        }
        LOGGER.error("Option %s value not valid.  Must receive value %s or %s", filterOption,
                APPLY_FILTER_POSITIVE, APPLY_FILTER_NEGATED);
        throw new IllegalArgumentException("Command line value invalid.");
    }

    /**
     * Applies a filter to the corpus if the corresponding option was supplied. The negated filter is
     * used when the option value is {@code APPLY_FILTER_NEGATED}; any other value applies the filter
     * as given.
     *
     * @param corpus the corpus to filter
     * @param filter the predicate selecting the predictions to keep
     * @param cl the parsed command line
     * @param filterOption the short name of the option controlling this filter
     * @return the filtered corpus, or the input corpus if the option is absent
     * @throws IOException declared for the corpus filtering
     */
    private static L2PredictionCorpus applyFilter(L2PredictionCorpus corpus, Predicate<L2Prediction> filter,
            CommandLine cl, String filterOption) throws IOException {
        if (!cl.hasOption(filterOption)) {
            return corpus;
        }
        boolean negated = APPLY_FILTER_NEGATED.equals(cl.getOptionValue(filterOption));
        return corpus.applyFilter(negated ? filter.negate() : filter);
    }
}