com.act.reachables.ConditionalReachabilityInterpreter.java Source code

Java tutorial

Introduction

Here is the source code for com.act.reachables.ConditionalReachabilityInterpreter.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.reachables;

import act.installer.bing.BingSearchRanker;
import act.server.NoSQLAPI;
import act.shared.Reaction;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Command-line tool that loads a serialized reachables forest, groups every chemical under the root it
 * descends from, and hands the resulting InChIs to {@link BingSearchRanker} so that a TSV of
 * (chemical, root, depth, Bing search metadata) can be written.
 *
 * <p>Processing steps, as implemented by {@link #main(String[])} and the private {@code run} method:
 * <ol>
 *   <li>deserialize the forest into {@link ActData};</li>
 *   <li>invert the child-to-parent map into parent-to-children and collect the roots;</li>
 *   <li>walk each tree level by level, recording each descendant and its depth;</li>
 *   <li>map each descendant InChI to its root InChI (skipping blacklisted roots);</li>
 *   <li>delegate Bing lookup and TSV writing to {@link BingSearchRanker}.</li>
 * </ol>
 */
public class ConditionalReachabilityInterpreter {

    private static final String GLUCOSE_INCHI = "InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1";
    private static final String ATP_INCHI = "InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(26-10)1-25-30(21,22)28-31(23,24)27-29(18,19)20/h2-4,6-7,10,16-17H,1H2,(H,21,22)(H,23,24)(H2,11,12,13)(H2,18,19,20)/t4-,6-,7-,10-/m1/s1";

    /** Roots whose descendants are NOT attributed to them in the descendant-to-root mapping. */
    private static final Set<String> BLACKLISTED_ROOT_INCHIS = new HashSet<>();

    static {
        // Filled in a static block rather than with double-brace initialization, which would create a
        // needless anonymous HashSet subclass.
        BLACKLISTED_ROOT_INCHIS.add(GLUCOSE_INCHI);
        BLACKLISTED_ROOT_INCHIS.add(ATP_INCHI);
    }

    public static final String OPTION_OUTPUT_FILEPATH = "o";
    public static final String OPTION_INPUT_ACT_FILEPATH = "i";
    public static final String OPTION_DB_NAME = "d";
    private static final Logger LOGGER = LogManager.getFormatterLogger(ConditionalReachabilityInterpreter.class);

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class is used to deserialize a reachable forest and output bing search results of all chemicals within each root",
            "of the forest along with it's root associate." }, " ");

    /** Builders for every command-line option this tool accepts; built into {@link Options} by main. */
    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<>();

    static {
        OPTION_BUILDERS.add(Option.builder(OPTION_OUTPUT_FILEPATH).argName("OUTPUT_FILEPATH")
                .desc("The full path to the output file").hasArg().required().longOpt("output_filepath")
                .type(String.class));
        OPTION_BUILDERS.add(Option.builder(OPTION_INPUT_ACT_FILEPATH).argName("INPUT_ACT_FILEPATH")
                .desc("The full path to the input act file").hasArg().required().longOpt("input_act_filepath")
                .type(String.class));
        OPTION_BUILDERS.add(Option.builder(OPTION_DB_NAME).argName("DB_NAME").desc("The name of the database")
                .hasArg().required().longOpt("db_name").type(String.class));
        OPTION_BUILDERS.add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help"));
    }

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    // Instance state. All of it is assigned exactly once, in the constructor.
    private final ActData actData;
    /** Ids of chemicals that have a null parent in the act tree, i.e. the roots of the forest. */
    private final Set<Long> rootChemicals;
    /** Cache of chemical id (as it appears in the tree, possibly negative) to InChI read from the DB. */
    private final Map<Long, String> chemIdToInchi;
    /** Depth of each descendant below its root, keyed by InChI; direct children of a root have depth 1. */
    private final Map<String, Integer> depthOfMolecule;
    // No field initializer here: the constructor always supplies the handle, so creating a default
    // NoSQLAPI at declaration time would only build an object that is thrown away immediately.
    private final NoSQLAPI db;

    /**
     * Creates an interpreter over an already-deserialized forest.
     *
     * @param actData the reachables data whose act tree will be traversed
     * @param db the database handle used to resolve chemical ids to InChIs
     */
    public ConditionalReachabilityInterpreter(ActData actData, NoSQLAPI db) {
        this.actData = actData;
        this.rootChemicals = new HashSet<>();
        this.chemIdToInchi = new HashMap<>();
        this.depthOfMolecule = new HashMap<>();
        this.db = db;
    }

    /**
     * Command-line entry point: parses the options, deserializes the forest and runs the interpreter.
     *
     * @param args command-line arguments; see {@link #OPTION_BUILDERS}
     * @throws Exception if deserialization or output writing fails
     */
    public static void main(String[] args) throws Exception {
        // Parse the command line options
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        // Usage must be printed under this tool's own name (it was previously reported as BingSearchRanker).
        String programName = ConditionalReachabilityInterpreter.class.getCanonicalName();

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(programName, HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(programName, HELP_MESSAGE, opts, null, true);
            return;
        }

        String inputPath = cl.getOptionValue(OPTION_INPUT_ACT_FILEPATH);
        String outputPath = cl.getOptionValue(OPTION_OUTPUT_FILEPATH);
        String dbName = cl.getOptionValue(OPTION_DB_NAME);

        LOGGER.info("Starting to deserialize reachables forest.");
        // NOTE(review): instance() is fetched again AFTER deserialize() on purpose — presumably
        // deserialization may replace the singleton. Do not collapse these two calls without confirming.
        ActData.instance().deserialize(inputPath);
        ActData actData = ActData.instance();
        LOGGER.info("Finished deserializing reachables forest.");

        // The same database name is passed for both NoSQLAPI constructor arguments.
        NoSQLAPI db = new NoSQLAPI(dbName, dbName);
        ConditionalReachabilityInterpreter conditionalReachabilityInterpreter = new ConditionalReachabilityInterpreter(
                actData, db);
        conditionalReachabilityInterpreter.run(outputPath);
    }

    /**
     * Resolves a tree node id to the InChI stored in the database. Negative ids are first converted
     * with {@link Reaction#reverseNegativeId} before the lookup; non-negative ids are used as-is.
     *
     * @param chemId the chemical id as it appears in the act tree
     * @return the InChI of the chemical read from the database
     */
    private String readInchiFromDb(Long chemId) {
        return db.readChemicalFromInKnowledgeGraph(chemId < 0 ? Reaction.reverseNegativeId(chemId) : chemId)
                .getInChI();
    }

    /**
     * Inverts the act tree's child-to-parent map into a parent-to-children map and, as a side effect,
     * records every node whose parent is null in {@code rootChemicals}.
     *
     * @return a mapping from parent id to the set of its direct children; roots do not appear as values
     */
    private Map<Long, Set<Long>> constructParentToChildAssociationsAndPopulateRootChemicals() {
        Map<Long, Set<Long>> parentToChildrenAssociations = new HashMap<>();
        for (Map.Entry<Long, Long> childIdToParentId : this.actData.getActTree().parents.entrySet()) {
            Long parentId = childIdToParentId.getValue();
            Long childId = childIdToParentId.getKey();

            // A null parent means the node is one of the roots of the forest.
            if (parentId == null) {
                rootChemicals.add(childId);
                continue;
            }

            Set<Long> childIds = parentToChildrenAssociations.get(parentId);
            if (childIds == null) {
                childIds = new HashSet<>();
                parentToChildrenAssociations.put(parentId, childIds);
            }
            childIds.add(childId);
        }

        return parentToChildrenAssociations;
    }

    /**
     * Walks each tree of the forest level by level, starting from the roots collected in
     * {@code rootChemicals}, and gathers all descendants of every root.
     *
     * <p>Side effects: fills {@code chemIdToInchi} for every root and descendant visited, and records in
     * {@code depthOfMolecule} the level (1 for a root's direct children) at which each descendant's
     * InChI was seen.
     *
     * @param parentToDescendantsAssociations a mapping from a parent id to the set of its direct children
     * @return a mapping from root id to all of its descendant ids; roots without children have no entry
     */
    private Map<Long, Set<Long>> constructRootToDescendantMappings(
            Map<Long, Set<Long>> parentToDescendantsAssociations) {
        Map<Long, Set<Long>> rootToSetOfDescendants = new HashMap<>();
        for (Long rootId : rootChemicals) {
            chemIdToInchi.put(rootId, readInchiFromDb(rootId));

            Set<Long> currentLevel = parentToDescendantsAssociations.get(rootId);
            if (currentLevel == null || currentLevel.isEmpty()) {
                // Childless root: nothing to record, and no entry is created for it.
                continue;
            }

            // Created once per root instead of being looked up again on every level.
            Set<Long> descendants = new HashSet<>();
            rootToSetOfDescendants.put(rootId, descendants);

            for (int depth = 1; !currentLevel.isEmpty(); depth++) {
                descendants.addAll(currentLevel);

                // Record the depth of every member of this level and collect the next level.
                Set<Long> nextLevel = new HashSet<>();
                for (Long child : currentLevel) {
                    String childInchi = chemIdToInchi.get(child);
                    if (childInchi == null) {
                        childInchi = readInchiFromDb(child);
                        chemIdToInchi.put(child, childInchi);
                    }

                    // Each child id has exactly one parent in the tree, so it is reached from one root
                    // only. The depth is keyed by InChI, so a later visit of the same InChI overwrites it.
                    depthOfMolecule.put(childInchi, depth);

                    Set<Long> grandChildren = parentToDescendantsAssociations.get(child);
                    if (grandChildren != null) {
                        nextLevel.addAll(grandChildren);
                    }
                }

                currentLevel = nextLevel;
            }
        }

        return rootToSetOfDescendants;
    }

    /**
     * Builds the root-to-descendants structure, derives the descendant-InChI to root-InChI mapping and
     * passes everything to {@link BingSearchRanker}, which adds Bing search results for all InChIs seen
     * and writes the TSV output.
     *
     * @param outputFilePath the output file to write to
     * @throws IOException declared for failures while adding search results or writing the output
     */
    private void run(String outputFilePath) throws IOException {

        LOGGER.info("Create parent to child associations");
        Map<Long, Set<Long>> parentToChildrenAssociations = constructParentToChildAssociationsAndPopulateRootChemicals();

        LOGGER.info("Construct trees from the root chemicals");
        Map<Long, Set<Long>> rootToSetOfDescendants = constructRootToDescendantMappings(
                parentToChildrenAssociations);

        LOGGER.info("Construct reverse mapping from descendant to root chemical");
        Map<String, String> descendantInchiToRootInchi = new HashMap<>();
        for (Map.Entry<Long, Set<Long>> entry : rootToSetOfDescendants.entrySet()) {
            String rootInchi = chemIdToInchi.get(entry.getKey());
            if (BLACKLISTED_ROOT_INCHIS.contains(rootInchi)) {
                // Descendants of blacklisted roots get no root association, but they remain in the
                // set of all InChIs handed to the ranker below.
                continue;
            }
            for (Long descendant : entry.getValue()) {
                descendantInchiToRootInchi.put(chemIdToInchi.get(descendant), rootInchi);
            }
        }

        // Every InChI resolved during traversal: roots and descendants alike.
        Set<String> allInchis = new HashSet<>(chemIdToInchi.values());

        LOGGER.info("Add chemicals to bing search results");
        // Update the Bing Search results in the Installer database
        BingSearchRanker bingSearchRanker = new BingSearchRanker();
        bingSearchRanker.addBingSearchResults(allInchis);

        LOGGER.info("Write chemicals to output file");
        bingSearchRanker.writeBingSearchRanksAsTSVUsingConditionalReachabilityFormat(allInchis,
                descendantInchiToRootInchi, depthOfMolecule, outputFilePath);
    }
}