com.act.biointerpretation.sarinference.LibMcsClustering.java Source code

Java tutorial

Introduction

Here is the source code for com.act.biointerpretation.sarinference.LibMcsClustering.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sarinference;

import chemaxon.clustering.LibraryMCS;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.struc.Molecule;
import com.act.biointerpretation.l2expansion.L2FilteringDriver;
import com.act.biointerpretation.l2expansion.L2Prediction;
import com.act.biointerpretation.l2expansion.L2PredictionCorpus;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import com.act.lcms.db.io.report.IonAnalysisInterchangeModel;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * Command line driver and job factory for building SARs from an L2 prediction corpus.
 *
 * <p>The work is split into two {@link JavaRunnable} jobs: a clusterer (see
 * {@link #getClusterer(File, File)}) that clusters the corpus substrates with LibraryMCS and writes out every node
 * of the resulting tree, and a scorer (see
 * {@link #getSarScorer(File, File, File, File, SarTreeNode.ScoringFunctions, Integer)}) that reads those nodes back,
 * scores them against LCMS results and writes the ranked SARs. {@link #main(String[])} runs the two in sequence.
 */
public class LibMcsClustering {

    private static final Logger LOGGER = LogManager.getFormatterLogger(LibMcsClustering.class);

    private static final String OPTION_PREDICTION_CORPUS = "c";
    private static final String OPTION_POSITIVE_INCHIS = "p";
    private static final String OPTION_SARTREE_PATH = "t";
    private static final String OPTION_SCORED_SAR_PATH = "s";
    private static final String OPTION_HELP = "h";
    private static final String LONG_OPTION_HELP = "help";

    // Populated in the static initializer below. A plain ArrayList is used rather than double-brace initialization
    // so that no anonymous ArrayList subclass is created just to hold the option builders.
    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<>();

    static {
        OPTION_BUILDERS.add(Option.builder(OPTION_PREDICTION_CORPUS).argName("input corpus path")
                .desc("The absolute path to the input prediction corpus.").hasArg().longOpt("input-corpus-path")
                .required());
        OPTION_BUILDERS.add(Option.builder(OPTION_POSITIVE_INCHIS).argName("positive inchis file")
                .desc("The path to a file of positive inchis from LCMS analysis of the prediction corpus.")
                .hasArg().longOpt("input-positive-inchis").required());
        OPTION_BUILDERS.add(Option.builder(OPTION_SARTREE_PATH).argName("sartree path")
                .desc("The path to which to write the intermediate file produced by structure clustering.")
                .hasArg().longOpt("sartree-path").required());
        OPTION_BUILDERS.add(Option.builder(OPTION_SCORED_SAR_PATH).argName("scored sar path")
                .desc("The path to which to write the final output file of scored sars.").hasArg()
                .longOpt("scored-sar-path").required());
        OPTION_BUILDERS.add(Option.builder(OPTION_HELP).argName("help").desc("Prints this help message.")
                .longOpt(LONG_OPTION_HELP));
    }

    public static final String HELP_MESSAGE = "This class is used to build sars from an L2Prediction run and LCMS analysis results.  The inputs are an "
            + "L2PredictionCorpus and a file with all the product inchis that came up as positive in the LCMS analysis. "
            + "The output is a list of Sars with percentageHits scores based on how predictive they are of the "
            + "reactions in the corpus.";

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    // This heuristically balances the consideration of wanting high percentage to rank
    // highly, but not wanting those with only a couple of total matches to rank highly.
    private static final SarTreeNode.ScoringFunctions SAR_SCORING_FUNCTION = SarTreeNode.ScoringFunctions.HIT_MINUS_MISS;

    private static final Integer THRESHOLD_TREE_SIZE = 2; // any SAR that is not simply one specific substrate is allowed

    /**
     * Parses the command line, then runs substrate clustering followed by SAR scoring.
     *
     * @param args Command line arguments; see {@link #OPTION_BUILDERS} for the accepted options.
     * @throws Exception If either the clustering or the scoring job fails.
     */
    public static void main(String[] args) throws Exception {

        // Build command line parser.
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        // Every non-help option is required, so parsing "-h" on its own would fail with a missing-option error.
        // Look for an explicit help request before parsing so that help can be printed cleanly.
        if (isHelpRequested(args)) {
            printHelp(opts);
            return;
        }

        CommandLine cl;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s", e.getMessage());
            exitWithHelp(opts);
            return; // exitWithHelp terminates the JVM; the return keeps cl definitely assigned below.
        }

        // Print help. Kept for spellings of the help option that the pre-parse scan does not recognize.
        if (cl.hasOption(OPTION_HELP)) {
            printHelp(opts);
            return;
        }

        File inputCorpusFile = new File(cl.getOptionValue(OPTION_PREDICTION_CORPUS));
        File positiveInchisFile = new File(cl.getOptionValue(OPTION_POSITIVE_INCHIS));
        File sartreeFile = new File(cl.getOptionValue(OPTION_SARTREE_PATH));
        File scoredSarFile = new File(cl.getOptionValue(OPTION_SCORED_SAR_PATH));

        JavaRunnable clusterer = getClusterer(inputCorpusFile, sartreeFile);
        JavaRunnable scorer = getSarScorer(inputCorpusFile, sartreeFile, positiveInchisFile, scoredSarFile,
                SAR_SCORING_FUNCTION, THRESHOLD_TREE_SIZE);

        LOGGER.info("Running clustering.");
        clusterer.run();

        LOGGER.info("Running sar scoring.");
        scorer.run();

        LOGGER.info("Complete!.");
    }

    /**
     * Checks the raw arguments for the short or long form of the help option.
     *
     * @param args The raw command line arguments.
     * @return True if any argument is exactly the short or long help flag.
     */
    private static boolean isHelpRequested(String[] args) {
        String shortFlag = "-" + OPTION_HELP;
        String longFlag = "--" + LONG_OPTION_HELP;
        for (String arg : args) {
            if (shortFlag.equals(arg) || longFlag.equals(arg)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Prints the usage message for this class.
     *
     * @param opts The options to describe.
     */
    private static void printHelp(Options opts) {
        HELP_FORMATTER.printHelp(LibMcsClustering.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    }

    /**
     * Prints the usage message and terminates the JVM with exit status 1.
     *
     * @param opts The options to describe.
     */
    private static void exitWithHelp(Options opts) {
        printHelp(opts);
        System.exit(1);
    }

    /**
     * Reads in a prediction corpus, containing only one RO's predictions, and builds a clustering tree of the
     * substrates. Writes every SarTreeNode in the tree to the output file.
     * This method is static because it does not rely on any properties of the enclosing class to construct the job.
     * TODO: It would probably make more sense to make this its own class, i.e. <Clusterer implements JavaRunnable>
     *
     * @param predictionCorpusInput The prediction corpus input file.
     * @param sarTreeNodesOutput The file to which to write the SarTreeNodeList of every node in the clustering tree.
     * @return A JavaRunnable to run the appropriate clustering.
     */
    public static JavaRunnable getClusterer(File predictionCorpusInput, File sarTreeNodesOutput) {

        return new JavaRunnable() {
            @Override
            public void run() throws IOException {
                // Verify input and output files
                FileChecker.verifyInputFile(predictionCorpusInput);
                FileChecker.verifyAndCreateOutputFile(sarTreeNodesOutput);

                // Build input corpus and substrate list
                L2PredictionCorpus inputCorpus = L2PredictionCorpus
                        .readPredictionsFromJsonFile(predictionCorpusInput);
                // Get list of molecules, tagged by PredictionId so we can track back to the corresponding predictions later on.
                Collection<Molecule> molecules = importMoleculesWithPredictionIds(inputCorpus);

                // Run substrate clustering
                SarTree sarTree = new SarTree();
                try {
                    sarTree.buildByClustering(new LibraryMCS(), molecules);
                } catch (InterruptedException e) {
                    LOGGER.error("Threw interrupted exception during buildByClustering: %s", e.getMessage());
                    // Restore the interrupt flag so code further up the stack can still observe the interruption.
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }

                // Write output to file
                SarTreeNodeList nodeList = new SarTreeNodeList(new ArrayList<>(sarTree.getNodes()));
                nodeList.writeToFile(sarTreeNodesOutput);
            }

            @Override
            public String toString() {
                return "SarClusterer:" + predictionCorpusInput.getName();
            }

            /**
             * Imports the substrate molecules from the prediction corpus, and tags them with their prediction IDs from
             * the corpus, using Chemaxon's setPropertyObject. These prediction ids can then be recovered from the
             * molecules later, and mapped back to the original inchis, even if the molecule has been changed so that
             * its inchi is no longer the same. In particular, LibraryMCS strips stereo and other layers from the
             * inchis, which would make it hard to track LCMS hits and misses between the output molecules of
             * LibraryMCS and the initial inchis fed into LibraryMCS.
             *
             * Each distinct substrate inchi is imported once; further predictions on the same inchi only append
             * their id to the list already attached to that molecule.
             *
             * @param inputCorpus The prediction corpus.
             * @return A collection of molecules, each tagged with a property object which contains the corresponding
             * list of prediction ids.
             * @throws MolFormatException If a molecule cannot be imported from an inchi.
             */
            private Collection<Molecule> importMoleculesWithPredictionIds(L2PredictionCorpus inputCorpus)
                    throws MolFormatException {
                Map<String, Molecule> inchiToMoleculeMap = new HashMap<>();
                for (L2Prediction prediction : inputCorpus.getCorpus()) {
                    for (String substrateInchi : prediction.getSubstrateInchis()) { // For now this should only be one substrate
                        Molecule existing = inchiToMoleculeMap.get(substrateInchi);
                        if (existing == null) {
                            Molecule mol = importMoleculeWithPredictionId(substrateInchi, prediction.getId());
                            inchiToMoleculeMap.put(substrateInchi, mol);
                        } else {
                            // Safe: the only writer of this property is importMoleculeWithPredictionId, which
                            // always stores a List<Integer>.
                            @SuppressWarnings("unchecked")
                            List<Integer> predictionIds = (List<Integer>) existing
                                    .getPropertyObject(SarTreeNode.PREDICTION_ID_KEY);
                            predictionIds.add(prediction.getId());
                        }
                    }
                }
                return inchiToMoleculeMap.values();
            }

            /**
             * Imports a single molecule from its inchi and attaches a new list holding the given prediction id.
             *
             * @param inchi The inchi to import.
             * @param id The id of the first prediction that uses this inchi as a substrate.
             * @return The imported molecule, tagged with a mutable list containing only the given id.
             * @throws MolFormatException If the inchi cannot be imported.
             */
            private Molecule importMoleculeWithPredictionId(String inchi, Integer id) throws MolFormatException {
                Molecule mol = MolImporter.importMol(inchi, "inchi");
                List<Integer> predictionIds = new ArrayList<>();
                predictionIds.add(id);
                mol.setPropertyObject(SarTreeNode.PREDICTION_ID_KEY, predictionIds);
                return mol;
            }
        };
    }

    /**
     * Reads in an already-built SarTree from a SarTreeNodeList, and scores the SARs based on LCMS results. The LCMS
     * results are loaded from the supplied file into an IonAnalysisInterchangeModel.
     * This method is static because it does not rely on any properties of the enclosing class to construct the job.
     * TODO: It would probably make more sense to make this its own class, i.e. <SarScorer implements JavaRunnable>
     *
     * @param predictionsFile The prediction corpus file whose predictions the SARs are scored against.
     * @param sarTreeInput An input file containing a SarTreeNodeList with all Sars from the clustering tree.
     * @param lcmsInput File with LCMS hits.
     * @param sarTreeNodeOutput The output file to which to write the relevant SARs from the corpus, sorted in decreasing
     * order of confidence.
     * @param scoringFunction The function used to score and rank the SARs.
     * @param subtreeThreshold The minimum number of leaves a SAR should match to be returned.
     * @return A JavaRunnable to run the SAR scoring.
     */
    public static JavaRunnable getSarScorer(File predictionsFile, File sarTreeInput, File lcmsInput,
            File sarTreeNodeOutput, SarTreeNode.ScoringFunctions scoringFunction, Integer subtreeThreshold) {

        return new JavaRunnable() {
            @Override
            public void run() throws IOException {
                // Verify input and output files
                FileChecker.verifyInputFile(predictionsFile);
                FileChecker.verifyInputFile(lcmsInput);
                FileChecker.verifyInputFile(sarTreeInput);
                FileChecker.verifyAndCreateOutputFile(sarTreeNodeOutput);

                L2PredictionCorpus predictionCorpus = L2PredictionCorpus
                        .readPredictionsFromJsonFile(predictionsFile);

                // Build SAR tree from input file
                SarTreeNodeList nodeList = new SarTreeNodeList();
                nodeList.loadFromFile(sarTreeInput);
                SarTree sarTree = new SarTree();
                nodeList.getSarTreeNodes().forEach(sarTree::addNode);

                // Build LCMS results
                IonAnalysisInterchangeModel lcmsResults = new IonAnalysisInterchangeModel();
                lcmsResults.loadResultsFromFile(lcmsInput);

                // Build sar scorer from LCMS and prediction corpus
                SarTreeBasedCalculator sarScorer = new SarTreeBasedCalculator(sarTree, predictionCorpus,
                        lcmsResults);

                // Score SARs
                // Calculate hits and misses
                sarTree.applyToNodes(sarScorer, subtreeThreshold);
                // Retain nodes that are not repeats or leaves, and have more than 0 LCMS hits
                SarTreeNodeList treeNodeList = sarTree.getExplanatoryNodes(subtreeThreshold, 0.0);
                // Sort by the supplied scoring function.
                treeNodeList.sortBy(scoringFunction);

                // Write out output.
                treeNodeList.writeToFile(sarTreeNodeOutput);
            }

            @Override
            public String toString() {
                return "SarScorer:" + sarTreeInput.getName();
            }
        };
    }
}