com.act.reachables.LoadAct.java Source code

Java tutorial

Introduction

Here is the source code for com.act.reachables.LoadAct.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.reachables;

import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Chemical.REFS;
import act.shared.Reaction;
import com.mongodb.BasicDBObject;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.json.JSONArray;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class LoadAct extends SteppedTask {
    // Constants
    private static String _fileloc = "com.act.reachables.LoadAct";

    private static final boolean SET_METADATA_ON_NW_NODES = false;

    private static final String DEFAULT_DB_HOST = "localhost";
    private static final int DEFAULT_PORT = 27017;

    // Fields
    private MongoDB db;
    private Set<String> optional_universal_inchis;
    private Set<String> optional_cofactor_inchis;
    private int loaded, total;
    private List<String> fieldSetForChemicals;

    private LoadAct(String dbToUse, Set<String> optional_universal_inchis, Set<String> optional_cofactor_inchis) {
        this.optional_universal_inchis = optional_universal_inchis;
        this.optional_cofactor_inchis = optional_cofactor_inchis;
        this.fieldSetForChemicals = new ArrayList<>();
        this.db = new MongoDB(DEFAULT_DB_HOST, DEFAULT_PORT, dbToUse);

        ActData.instance().Act = new Network("Act");
        ActData.instance().ActTree = new Network("Act Tree");

        // the following take time, so will be done in init();
        ActData.instance().allrxnids = null;
        ActData.instance().cofactors = null;
        ActData.instance().natives = null;
        ActData.instance().chemInchis = null;
        ActData.instance().chemId2Inchis = null;
        ActData.instance().chemId2ReadableName = null;
        ActData.instance().chemToxicity = null;
        ActData.instance().chemsReferencedInRxns = null;

        GlobalParams._hostOrganismIDs = new Long[GlobalParams._hostOrganisms.length];
        for (int i = 0; i < GlobalParams._hostOrganisms.length; i++) {
            GlobalParams._hostOrganismIDs[i] = this.db.getOrganismId(GlobalParams._hostOrganisms[i]);
        }

        total = -1;
        loaded = -1;
    }

    public static Network getReachablesTree(String dbToUse, Set<String> natives, Set<String> cofactors,
            boolean restrictToSeq, String[] extra_chem_fields) {
        GlobalParams._actTreeOnlyIncludeRxnsWithSequences = restrictToSeq;

        // init loader
        LoadAct act = new LoadAct(dbToUse, natives, cofactors);

        // set fields to include in the tree even if they are not reachables
        if (extra_chem_fields != null)
            for (String f : extra_chem_fields)
                act.setFieldForExtraChemicals(f);

        // load data from db; and compute reachables tree
        act.run();

        return ActData.instance().ActTree;
    }

    public static Network getReachablesTree(String dbToUse, Set<String> natives, Set<String> cofactors,
            boolean restrictToSeq) {
        return getReachablesTree(dbToUse, natives, cofactors, restrictToSeq, null);
    }

    public static Network getReachablesTree(String dbToUse, Set<String> natives, Set<String> cofactors) {
        return getReachablesTree(dbToUse, natives, cofactors, true);
    }

    public static String toInChI(Long id) {
        return ActData.instance().chemId2Inchis.get(id);
    }

    private static void addToNw(Reaction rxn) {
        /**
         * This class takes in a reaction and adds it to ActData.
         *
         * Special handling is afforded to abstract reactions and reactions w/ only cofactors as substrates.
         */
        Long rxnid = (long) rxn.getUUID();

        // Filter out abstract substrates.  These reactions are invalid
        Long[] inputChemicalsArray = ArrayUtils.addAll(rxn.getSubstrates(), rxn.getSubstrateCofactors());

        // Some reactions have coenzymes, so don't forget about those.
        inputChemicalsArray = ArrayUtils.addAll(inputChemicalsArray, rxn.getCoenzymes());

        if (Arrays.stream(inputChemicalsArray).anyMatch(x -> ActData.instance().metaCycBigMolsOrRgrp.contains(x))) {
            logProgress(String.format("Skipping reaction %d as it contains an abstract substrate.", rxnid));
            return;
        }

        // Remove cofactors from the needed substrates.
        List<Long> inputChemicals = Arrays.asList(inputChemicalsArray).stream()
                .filter(c -> !ActData.instance().cofactors.contains(c)).collect(Collectors.toList());
        Long[] outputChemicalsArray = ArrayUtils.addAll(rxn.getProducts(), rxn.getProductCofactors());

        // Output chemicals, only real ones. Also remove all of the cofactors as we don't care if those are produced.
        List<Long> outputChemicals = Arrays.asList(outputChemicalsArray).stream()
                .filter(p -> !ActData.instance().metaCycBigMolsOrRgrp.contains(p)
                        && !ActData.instance().cofactors.contains(p))
                .collect(Collectors.toList());

        // Don't bother if we don't get any new chemicals from this.
        if (outputChemicals.isEmpty()) {
            logProgress(String.format("Skipping reaction %d as it has no non-abstract products.", rxnid));
            return;
        }

        if (inputChemicals.isEmpty()) {
            logProgress(String.format(
                    "Undoing filtering on reaction %d substrates as it has only cofactor substrates.", rxnid));
            // These will be substrate only
            inputChemicals = Arrays.asList(inputChemicalsArray);
        }

        // Load all the nodeMapping.
        inputChemicals.stream().distinct().forEach(s -> {
            Node sub = Node.get(s, true);
            ActData.instance().chemsInAct.put(s, sub);
            ActData.instance().Act.addNode(sub, s);

            outputChemicals.stream().distinct().forEach(p -> {
                Node prd = Node.get(p, true);
                ActData.instance().Act.addNode(prd, p);
                ActData.instance().chemsInAct.put(p, prd);
                Edge r = Edge.get(sub, prd, true);
                ActData.instance().Act.addEdge(r);
            });
        });

        Set<Long> inputSet = new HashSet<>(inputChemicals);
        Set<Long> outputSet = new HashSet<>(outputChemicals);

        /* --------------- Update ActData ------------------ */
        ActData.instance().rxnSubstrates.put(rxnid, inputSet);
        ActData.instance().rxnProducts.put(rxnid, outputSet);

        inputSet.stream().forEach(ActData.instance().chemsReferencedInRxns::add);
        outputSet.stream().forEach(ActData.instance().chemsReferencedInRxns::add);

        inputSet.stream().forEach(s -> {
            Set<Long> pSet = ActData.instance().rxnsThatConsumeChem.getOrDefault(s, new HashSet<>());
            pSet.add(rxnid);
            ActData.instance().rxnsThatConsumeChem.put(s, pSet);
        });

        outputSet.stream().forEach(p -> {
            Set<Long> pSet = ActData.instance().rxnsThatProduceChem.getOrDefault(p, new HashSet<>());
            pSet.add(rxnid);
            ActData.instance().rxnsThatProduceChem.put(p, pSet);
        });

        // add to internal copy of network
        ActData.instance().rxnHasSeq.put(rxnid, rxn.hasProteinSeq());

        /* -------- Reaction Classes ------- */
        Pair<Set<Long>, Set<Long>> rxnClass = Pair.of(inputSet, outputSet);
        if (!ActData.instance().rxnClasses.contains(rxnClass)) {
            // the first reaction that shows up in this class, get to
            // represent the entire class. So we install it in the
            // datasets mirroring the non-class structures...

            ActData.instance().rxnClassesSubstrates.put(rxnid, inputSet);
            ActData.instance().rxnClassesProducts.put(rxnid, outputSet);
            ActData.instance().rxnClasses.add(rxnClass);
        }
    }

    public static void annotateRxnEdges(Reaction rxn, HashSet<Edge> rxn_edges) {
        for (Edge e : rxn_edges) {
            Edge.setAttribute(e, "isRxn", true);
            Edge.setAttribute(e, "datasource", rxn.getDataSource());
            Edge.setAttribute(e, "srcRxnID", rxn.getUUID());
            Edge.setAttribute(e, "srcRxn", rxn.getReactionName());
            if (rxn.getECNum() != null)
                Edge.setAttribute(e, "srcRxnEC", rxn.getECNum());
        }
    }

    public static boolean isCofactor(long m) {
        return ActData.instance().cofactors.contains(m);
    }

    private static void logProgress(String format, Object... args) {
        if (!GlobalParams.LOG_PROGRESS)
            return;

        System.err.format(_fileloc + ": " + format, args);
    }

    private static void logProgress(String msg) {
        if (!GlobalParams.LOG_PROGRESS)
            return;

        System.err.println(_fileloc + ": " + msg);
    }

    private List<Long> getAllIDsSorted() {
        List<Long> allids = this.db.getAllReactionUUIDs();
        Collections.sort(allids);
        return allids;
    }

    private Set<Long> getNatives() {
        Set<Long> natives_ids = new HashSet<>();

        if (this.optional_universal_inchis == null) {

            // pull whatever is in the DB
            List<Chemical> cs = this.db.getNativeMetaboliteChems();
            for (Chemical c : cs)
                natives_ids.add(c.getUuid());

        } else {

            // use the inchis provided to the constructor
            for (String inchi : this.optional_universal_inchis) {
                Chemical c = this.db.getChemicalFromInChI(inchi);

                if (c == null) {
                    logProgress("LoadAct: WARNING: Starting native not in db.");
                    logProgress("LoadAct:        : InChI = " + inchi);
                    continue;
                }

                natives_ids.add(c.getUuid());
            }
        }

        logProgress(String.format("%d natives loaded into Act", natives_ids.size()));

        return natives_ids;
    }

    private Set<Long> getCofactors() {
        Set<Long> cofactor_ids = new HashSet<>();

        if (this.optional_cofactor_inchis == null) {

            // pull whatever is in the DB
            List<Chemical> cs = this.db.getCofactorChemicals();
            // TODO Add filter for chemicals w/o inchis
            for (Chemical c : cs)
                if (c.getInChI() != null) {
                    cofactor_ids.add(c.getUuid());
                }

        } else {

            // use the inchis provided to the constructor
            for (String inchi : this.optional_cofactor_inchis) {
                Chemical c = this.db.getChemicalFromInChI(inchi);

                if (c == null) {
                    logProgress("LoadAct: SEVERE WARNING: Starting cofactor not in db.");
                    logProgress("LoadAct:               : InChI = " + inchi);
                    continue;
                }

                cofactor_ids.add(c.getUuid());
            }
        }

        return cofactor_ids;
    }

    public void setFieldForExtraChemicals(String f) {
        this.fieldSetForChemicals.add(f);
    }

    private HashMap<String, List<Long>> getChemicalWithUserSpecFields() {
        HashMap<String, List<Long>> specials = new HashMap<>();

        for (String f : this.fieldSetForChemicals) {
            List<Chemical> cs = this.db.getChemicalsThatHaveField(f);
            specials.put(f, extractChemicalIDs(cs));
        }
        return specials;
    }

    private Set<Long> getMetaCycBigMolsOrRgrp() {
        HashSet<Long> ids = new HashSet<>();
        DBIterator chemCursor = this.db.getIdCursorForFakeChemicals();
        while (chemCursor.hasNext()) {
            ids.add((Long) chemCursor.next().get("_id"));
        }
        chemCursor.close();
        return ids;
    }

    private List<Long> extractChemicalIDs(List<Chemical> cs) {
        List<Long> cids = new ArrayList<Long>();
        for (Chemical c : cs)
            cids.add(c.getUuid());
        return cids;
    }

    private void addReactionsToNetwork() {
        // Ignore reactions with "protein" in their description. This is only necessary because there are some problems in
        // data parsing where we don't always put the proteins into the data, so sometimes fake data looks real.
        BasicDBObject noFakeReactions = new BasicDBObject("easy_desc",
                new BasicDBObject("$regex", "^((?!protein).)*$"));
        DBIterator iterator = this.db.getIteratorOverReactions(noFakeReactions, null);
        Reaction r;
        Map<Reaction.RxnDataSource, Integer> counts = new HashMap<>();
        for (Reaction.RxnDataSource src : Reaction.RxnDataSource.values())
            counts.put(src, 0);
        // since we are iterating until the end,
        // the getNextReaction call will close the DB cursor...

        while ((r = this.db.getNextReaction(iterator)) != null) {
            // this rxn comes from a datasource, METACYC, BRENDA or KEGG.
            // ensure the configuration tells us to include this datasource...
            Reaction.RxnDataSource src = r.getDataSource();
            Set<Reaction> reactionsWithAccurateDirections = r.correctForReactionDirection();
            counts.put(src, counts.get(src) + reactionsWithAccurateDirections.size());

            // Correct for right-to-left and reversible actions, adding all appropriate directions to the graph.
            for (Reaction directedRxn : reactionsWithAccurateDirections) {
                addToNw(directedRxn);
            }

        }
        logProgress("");

        logProgress("Rxn aggregate into %d classes.\n", ActData.instance().rxnClasses.size());
    }

    @Override
    public double percentDone() {
        return 100.0 * ((double) this.loaded / this.total);
    }

    @Override
    public void doMoreWork() {
        logProgress("Pulling %d reactions from MongoDB:\n", this.total);
        addReactionsToNetwork();
        this.loaded = this.total;
    }

    @Override
    public void init() {
        ActData.instance().allrxnids = getAllIDsSorted();
        total = ActData.instance().allrxnids.size();

        loaded = 0;
        ActData.instance().cofactors = getCofactors();
        ActData.instance().natives = getNatives();
        ActData.instance().chemicalsWithUserField = getChemicalWithUserSpecFields();
        ActData.instance().chemicalsWithUserField_treeOrganic = new HashSet<>();
        ActData.instance().chemicalsWithUserField_treeArtificial = new HashSet<>();
        ActData.instance().metaCycBigMolsOrRgrp = getMetaCycBigMolsOrRgrp();
        ActData.instance().chemsReferencedInRxns = new HashSet<>();
        ActData.instance().chemsInAct = new HashMap<>();
        ActData.instance().rxnsInAct = new HashMap<>();
        ActData.instance().rxnSubstrates = new HashMap<>();
        ActData.instance().rxnsThatConsumeChem = new HashMap<>();
        ActData.instance().rxnsThatProduceChem = new HashMap<>();
        ActData.instance().rxnProducts = new HashMap<>();
        ActData.instance().rxnOrganisms = new HashMap<>();
        ActData.instance().rxnSubstratesCofactors = new HashMap<>();
        ActData.instance().rxnProductsCofactors = new HashMap<>();
        ActData.instance().rxnHasSeq = new HashMap<>();

        ActData.instance().rxnClassesSubstrates = new HashMap<>();
        ActData.instance().rxnClassesProducts = new HashMap<>();
        ActData.instance().rxnClasses = new HashSet<>();
        ActData.instance().rxnClassesThatConsumeChem = new HashMap<>();
        ActData.instance().rxnClassesThatProduceChem = new HashMap<>();

        ActData.instance().noSubstrateRxnsToProducts = new HashMap<>();
        logProgress("Initialization complete.");
    }

    @Override
    public void finalize(TaskMonitor tm) {
        logProgress("ComputeReachablesTree.. starting");
        ActData.instance().chemToxicity = new HashMap<Long, Set<Integer>>();
        ActData.instance().chemInchis = new HashMap<String, Long>();
        ActData.instance().chemId2Inchis = new HashMap<Long, String>();
        ActData.instance().chemId2ReadableName = new HashMap<Long, String>();
        ActData.instance().chemIdIsAbstraction = new HashMap<Long, Boolean>();

        pullChemicalsReferencedInRxns();

        ActData.instance().chemsReferencedInRxns.addAll(ActData.instance().cofactors);
        ActData.instance().chemsReferencedInRxns.addAll(ActData.instance().natives);

        for (String f : ActData.instance().chemicalsWithUserField.keySet())
            ActData.instance().chemsReferencedInRxns.addAll(ActData.instance().chemicalsWithUserField.get(f));

        // computes reachables tree and writes it into ActData.instance().ActTree
        new ComputeReachablesTree(this.db);
    }

    private void pullChemicalsReferencedInRxns() {
        int N = ActData.instance().chemsReferencedInRxns.size();
        int count = 0;
        logProgress("Extracting metadata from chemicals.");
        for (Long id : ActData.instance().chemsReferencedInRxns) {
            logProgress("\t pullChemicalsReferencedInRxns: %d\r", count++);
            Chemical c = this.db.getChemicalFromChemicalUUID(id);
            ActData.instance().chemInchis.put(c.getInChI(), id);
            ActData.instance().chemId2Inchis.put(id, c.getInChI());
            ActData.instance().chemIdIsAbstraction.put(id, isAbstractInChI(c.getInChI()));
            String name = c.getShortestBRENDAName();
            if (name == null) {
                // see if there is a metacyc name:
                Object meta = c.getRef(Chemical.REFS.METACYC, new String[] { "meta" });
                if (meta != null) {
                    // if we are here, the entry was referenced in metacyc, so must
                    // have some name association there. see if we can pull that out.

                    // the xref.METACYC.meta field should *always* be a JSONArray
                    // (even if its an array with a single JSONObject within it)
                    // sanity check that, and abort if the type does not match
                    if (!(meta instanceof JSONArray)) {
                        throw new RuntimeException(
                                "Expect only Arrays in db.chemicals.{xref.METACYC.meta}, but found: "
                                        + meta.getClass());
                    }

                    name = ((JSONObject) ((JSONArray) meta).get(0)).getString("sname");

                    // if failed to pull out a name from metacyc, report it
                    if (name == null)
                        System.out
                                .println("ERROR: Looks like a metacyc entry chemical, but no metacyc name: " + id);
                }
            }
            if (name == null) {
                // Try to use wikipedia
                Object meta = c.getRef(Chemical.REFS.WIKIPEDIA, new String[] { "metadata" });
                if (meta != null) {
                    if (!(meta instanceof JSONObject)) {
                        throw new RuntimeException("Unable to parse Wikipedia metadata.");
                    }
                    name = (String) ((JSONObject) meta).get("article");
                }
            }
            if (name == null) {
                // stuff the inchi into the name
                name = c.getInChI();
            }
            ActData.instance().chemId2ReadableName.put(id, name);

            if (SET_METADATA_ON_NW_NODES) {
                String[] xpath = { "metadata", "toxicity" };
                Object o = c.getRef(REFS.DRUGBANK, xpath);
                if (o == null || !(o instanceof String))
                    continue;
                Set<Integer> ld50s = extractLD50vals((String) o);
                ActData.instance().chemToxicity.put(id, ld50s);

                // set chemical attributes
                String txt = null; // D MongoDB.chemicalAsString(c, id);
                Set<Integer> tox = ActData.instance().chemToxicity.get(id);
                Long n1 = ActData.instance().chemsInAct.get(id).getIdentifier();
                int fanout = ActData.instance().rxnsThatConsumeChem.containsKey(id)
                        ? ActData.instance().rxnsThatConsumeChem.get(id).size()
                        : -1;
                int fanin = ActData.instance().rxnsThatProduceChem.containsKey(id)
                        ? ActData.instance().rxnsThatProduceChem.get(id).size()
                        : -1;

                setMetadata(n1, tox, c, txt, fanout, fanin);

            }
        }
        logProgress("");
    }

    private boolean isAbstractInChI(String inchi) {
        // Create a Pattern object
        Pattern r = Pattern.compile("^InChI=1S\\/[A-Z0-9]*R");
        return r.matcher(inchi).find();
    }

    private Set<Integer> extractLD50vals(String annotation) {
        // an example of what we want to process is: "Oral, mouse: LD50 = 338 mg/kg; Oral, rat: LD50 = 1944 mg/kg"
        // a second example of what to process is:   "Acute oral toxicity (LD<sub>50</sub>) in rats is 264 mg/kg."
        // a third example of what to process is :   "Oral mouse and rat LD<sub>50</sub> are 338 mg/kg and 425 mg/kg respectively"
        // a fourth example of what to process is:   "oral LD<sub>50</sub>s were 1,100-1,550 mg/kg; 1,450 mg/kg; and 1,490 mg/kg; respectively"

        // most of time it is specified as mg/kg but rarely we also have g/kg: "Oral LD50 in rat: >5 g/kg"
        Set<Integer> ld50s = new HashSet<Integer>();
        int idx = 0;
        int len = annotation.length();
        String[] locator = { "LD50", "LD<sub>50</sub>" };
        int[] locator_len = { locator[0].length(), locator[1].length() };
        int delta = 60; // 60 chars +- the locator
        for (int l = 0; l < locator.length; l++) {
            while ((idx = annotation.indexOf(locator[l], idx)) != -1) {
                // look around a few tokens to find numbers that we can use...
                int low = idx - delta < 0 ? 0 : idx - delta;
                int high = idx + delta > len ? len : idx + delta;
                String sub = annotation.substring(low, idx) + annotation.substring(idx + locator_len[l], high);
                Scanner scan = new Scanner(sub).useDelimiter("[^0-9,]+");
                while (scan.hasNext()) {
                    String scanned = scan.next().replaceAll(",", "");
                    try {
                        ld50s.add(Integer.parseInt(scanned));
                    } catch (NumberFormatException e) {
                    }
                }

                idx++; // so that we skip the occurrence that we just added
            }
        }
        return ld50s;
    }

    private void setMetadata(Long n, Set<Integer> tox, Chemical c, String fulltxt, int fanout, int fanin) {
        Node.setAttribute(n, "isNative", c.isNative());
        Node.setAttribute(n, "fanout", fanout);
        Node.setAttribute(n, "fanin", fanin);
        if (c.getCanon() != null)
            Node.setAttribute(n, "canonical", c.getCanon());
        if (c.getInChI() != null)
            Node.setAttribute(n, "InChI", c.getInChI());
        if (c.getSmiles() != null)
            Node.setAttribute(n, "SMILES", c.getSmiles());
        if (c.getShortestName() != null)
            Node.setAttribute(n, "Name", c.getShortestName());
        if (c.getBrendaNames() != null && c.getSynonyms() != null)
            Node.setAttribute(n, "Synonyms", c.getBrendaNames().toString() + c.getSynonyms().toString());
        setXrefs(n, c);
        if (fulltxt != null)
            Node.setAttribute(n, "fulltxt", fulltxt);
        if (tox != null && tox.size() > 0) {
            Node.setAttribute(n, "toxicity_all", tox.toString());
            int max = Integer.MIN_VALUE, min = Integer.MAX_VALUE;
            for (int i : tox) {
                max = max < i ? i : max;
                min = min > i ? i : min;
            }
            Node.setAttribute(n, "toxicity_min", min);
            Node.setAttribute(n, "toxicity_max", max);
        }
    }

    private void setXrefs(Long node, Chemical c) {
        for (REFS typ : Chemical.REFS.values()) {
            if (c.getRef(typ) != null) {
                Double valuation = c.getRefMetric(typ);
                Node.setAttribute(node, typ.name(), c.getRef(typ).toString());
                Node.setAttribute(node, "metric" + typ.name(), valuation == null ? -999999999.0 : valuation);
                Node.setAttribute(node, "log10metric" + typ.name(),
                        valuation == null ? -99.0 : Math.log10(valuation));
                Node.setAttribute(node, "has" + typ.name(), true);
            } else {
                Node.setAttribute(node, "has" + typ.name(), false);
            }
        }
    }

}