org.twentyn.proteintodna.ProteinToDNADriver.java Source code

Introduction

Here is the source code for org.twentyn.proteintodna.ProteinToDNADriver.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package org.twentyn.proteintodna;

import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Reaction;
import act.shared.Seq;
import com.act.biointerpretation.metadata.ProteinMetadata;
import com.act.biointerpretation.metadata.RankPathway;
import com.act.reachables.Cascade;
import com.act.reachables.ReactionPath;
import com.act.utils.CLIUtil;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoInternalException;
import com.mongodb.ServerAddress;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.mongojack.DBCursor;
import org.mongojack.DBQuery;
import org.mongojack.JacksonDBCollection;
import org.mongojack.WriteResult;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class ProteinToDNADriver {

    private static final Logger LOGGER = LogManager.getFormatterLogger(ProteinToDNADriver.class);

    private static final String DEFAULT_DB_HOST = "localhost";
    private static final String DEFAULT_DB_PORT = "27017";
    private static final String DEFAULT_OUTPUT_DB_NAME = "wiki_reachables";
    private static final String DEFAULT_INPUT_DB_NAME = "SHOULD_COME_FROM_CMDLINE"; // "jarvis_2016-12-09";
    public static final String DEFAULT_INPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_jarvis";
    public static final String DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_vijay";
    public static final String DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "dna_designs";

    private static final String OPTION_DB_HOST = "H";
    private static final String OPTION_DB_PORT = "p";
    private static final String OPTION_OUTPUT_DB_NAME = "o";
    private static final String OPTION_INPUT_DB_NAME = "i";
    private static final String OPTION_INPUT_PATHWAY_COLLECTION_NAME = "c";
    private static final String OPTION_OUTPUT_PATHWAY_COLLECTION_NAME = "d";
    private static final String OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME = "e";
    private static final String OPTION_DESIGN_SOME = "m";
    private static final Integer HIGHEST_SCORING_INFERRED_SEQ_INDEX = 0;
    private static final Set<String> BLACKLISTED_WORDS_IN_INFERRED_SEQ = new HashSet<>(Arrays.asList("Fragment"));

    private static final Pattern REGEX_ID = Pattern.compile("^\\d+$");
    private static final Pattern REGEX_INCHI = Pattern.compile("^InChI=1S?/");
    // Based on https://en.wikipedia.org/wiki/International_Chemical_Identifier#InChIKey
    private static final Pattern REGEX_INCHI_KEY = Pattern.compile("^[A-Z]{14}-[A-Z]{10}-[A-Z]$");

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_DB_HOST).argName("hostname")
                    .desc(String.format("The DB host to which to connect (default: %s)", DEFAULT_DB_HOST)).hasArg()
                    .longOpt("db-host"));
            add(Option.builder(OPTION_DB_PORT).argName("port")
                    .desc(String.format("The DB port to which to connect (default: %s)", DEFAULT_DB_PORT)).hasArg()
                    .longOpt("db-port"));
            add(Option.builder(OPTION_OUTPUT_DB_NAME).argName("output-db-name")
                    .desc(String.format("The name of the database to read pathways and write seqs to (default: %s)",
                            DEFAULT_OUTPUT_DB_NAME))
                    .hasArg().longOpt("output-db-name"));
            add(Option.builder(OPTION_INPUT_DB_NAME).argName("input-db-name").desc(String
                    .format("The name of the database to read reactions from (default: %s)", DEFAULT_INPUT_DB_NAME))
                    .hasArg().required().longOpt("source-db-name"));
            add(Option.builder(OPTION_INPUT_PATHWAY_COLLECTION_NAME).argName("collection-name")
                    .desc(String.format("The name of the input pathway collection to read from (default: %s)",
                            DEFAULT_INPUT_PATHWAY_COLLECTION_NAME))
                    .hasArg().required().longOpt("input-pathway-collection"));
            add(Option.builder(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME).argName("collection-name")
                    .desc(String.format("The name of the output pathway collection to write to (default: %s)",
                            DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME))
                    .hasArg().required().longOpt("output-pathway-collection"));
            add(Option.builder(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME).argName("collection-name")
                    .desc(String.format("The name of the output dna seq collection to write to (default: %s)",
                            DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME))
                    .hasArg().required().longOpt("output-dna-seq-collection"));
            add(Option.builder(OPTION_DESIGN_SOME).argName("molecule").desc(
                    "Generate designs for specified molecules; can be a numeric ids, InChIs, or InChI Keys, *separated by '|'* "
                            + "for InChI compatibility.  Molecules must exist in chemical source DB if InChI or InChI Key is used.")
                    .hasArgs().valueSeparator('|').longOpt("design-this"));
        }
    };

    public static final String HELP_MESSAGE = "This class is the driver to extract protein sequences from pathways and construct DNA designs from these proteins.";

    private static final CLIUtil CLI_UTIL = new CLIUtil(ProteinToDNADriver.class, HELP_MESSAGE, OPTION_BUILDERS);

    /**
     * This function gets all protein combinations of a pathway from candidate protein sequences from each reaction on
     * the pathway.
     * @param listOfSetOfProteinSequences A list of sets of candidate protein sequences in the pathway
     * @return A set of all possible combinations of proteins from all the reactions in the pathway.
     */
    public static Set<List<String>> makePermutations(List<Set<String>> listOfSetOfProteinSequences) {
        Set<List<String>> accum = new HashSet<>();
        makePermutationsHelper(accum, listOfSetOfProteinSequences, new ArrayList<>());
        return accum;
    }

    private static void makePermutationsHelper(Set<List<String>> accum, List<Set<String>> input,
            List<String> prefix) {
        // Base case: no more sequences to add.  Accumulate and return.
        if (input.isEmpty()) {
            accum.add(prefix);
            return;
        }

        // Recursive case: iterate through next level of input sequences, appending each to a prefix and recurring.
        Set<String> head = input.get(0);
        // Avoid index out of bounds exception.
        List<Set<String>> rest = input.size() > 1 ? input.subList(1, input.size()) : Collections.emptyList();
        for (String next : head) {
            List<String> newPrefix = new ArrayList<>(prefix);
            newPrefix.add(next);
            makePermutationsHelper(accum, rest, newPrefix);
        }
    }

    public static void main(String[] args) throws Exception {
        CommandLine cl = CLI_UTIL.parseCommandLine(args);

        String reactionDbName = cl.getOptionValue(OPTION_INPUT_DB_NAME, DEFAULT_INPUT_DB_NAME);
        String dbHost = cl.getOptionValue(OPTION_DB_HOST, DEFAULT_DB_HOST);
        Integer dbPort = Integer.valueOf(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_DB_PORT));
        MongoDB reactionDB = new MongoDB(dbHost, dbPort, reactionDbName);

        MongoClient inputClient = new MongoClient(new ServerAddress(dbHost, dbPort));
        String outDB = cl.getOptionValue(OPTION_OUTPUT_DB_NAME, DEFAULT_OUTPUT_DB_NAME);
        DB db = inputClient.getDB(outDB);

        String inputPathwaysCollectionName = cl.getOptionValue(OPTION_INPUT_PATHWAY_COLLECTION_NAME,
                DEFAULT_INPUT_PATHWAY_COLLECTION_NAME);
        String outputPathwaysCollectionName = cl.getOptionValue(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME,
                DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME);
        String outputDnaDeqCollectionName = cl.getOptionValue(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME,
                DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME);

        JacksonDBCollection<ReactionPath, String> inputPathwayCollection = JacksonDBCollection
                .wrap(db.getCollection(inputPathwaysCollectionName), ReactionPath.class, String.class);
        JacksonDBCollection<DNADesign, String> dnaDesignCollection = JacksonDBCollection
                .wrap(db.getCollection(outputDnaDeqCollectionName), DNADesign.class, String.class);
        JacksonDBCollection<ReactionPath, String> outputPathwayCollection = JacksonDBCollection
                .wrap(db.getCollection(outputPathwaysCollectionName), ReactionPath.class, String.class);

        Map<String, Set<ProteinInformation>> proteinSeqToOrgInfo = new HashMap<>();

        ProteinsToDNA2 p2d = ProteinsToDNA2.initiate();

        List<Long> reachableIds = cl.hasOption(OPTION_DESIGN_SOME)
                ? Arrays.stream(cl.getOptionValues(OPTION_DESIGN_SOME)).map(m -> lookupMolecule(reactionDB, m))
                        .collect(Collectors.toList())
                : Collections.emptyList();

        // If there is a list of molecules for which to create designs, only consider their pathways.  Otherwise, find all.
        DBObject query = reachableIds.isEmpty() ? new BasicDBObject() : DBQuery.in("target", reachableIds);

        /* Extract all pathway ids, then read pathways one id at a time.  This will reduce the likelihood of the cursor
         * timing out, which has a tendency to happen when doing expensive operations before advancing (as done here). */
        DBCursor<ReactionPath> cursor = inputPathwayCollection.find(query, new BasicDBObject("_id", true));
        List<String> pathwayIds = new ArrayList<>();
        while (cursor.hasNext()) {
            pathwayIds.add(cursor.next().getId());
        }

        for (String pathwayId : pathwayIds) {
            ReactionPath reactionPath = inputPathwayCollection.findOne(DBQuery.is("_id", pathwayId));

            List<List<Pair<ProteinMetadata, Integer>>> processedP = RankPathway
                    .processSinglePathAsJava(reactionPath, reactionDbName, outDB);
            if (processedP == null) {
                LOGGER.info(String.format(
                        "Process pathway was filtered out possibly because there were more than %s seqs for a given pathway",
                        RankPathway.MAX_PROTEINS_PER_PATH()));
                outputPathwayCollection.insert(reactionPath);
                continue;
            }

            Boolean atleastOneSeqMissingInPathway = false;
            List<Set<String>> proteinPaths = new ArrayList<>();

            for (Cascade.NodeInformation nodeInformation : reactionPath.getPath().stream()
                    .filter(nodeInfo -> nodeInfo.getIsReaction()).collect(Collectors.toList())) {

                Set<String> proteinSeqs = new HashSet<>();

                for (Long id : nodeInformation.getReactionIds()) {

                    // If the id is negative, it is a reaction in the reverse direction. Moreover, the enzyme for this reverse
                    // reaction is the same, so can use the actual positive reaction id's protein seq reference.
                    // TODO: Add a preference for the positive forward direction compared to the negative backward direction seq.
                    if (id < 0) {
                        LOGGER.info("Found a negative reaction id", id);
                        id = Reaction.reverseID(id);
                    }

                    Reaction reaction = reactionDB.getReactionFromUUID(id);

                    for (JSONObject data : reaction.getProteinData()) {
                        // Get the sequences
                        if (data.has("sequences")) {
                            JSONArray seqs = data.getJSONArray("sequences");

                            for (int i = 0; i < seqs.length(); i++) {
                                Long s = seqs.getLong(i);

                                if (s != null) {
                                    Seq sequenceInfo = reactionDB.getSeqFromID(s);
                                    String dnaSeq = sequenceInfo.getSequence();

                                    if (dnaSeq == null) {
                                        LOGGER.info(String.format(
                                                "Sequence string for seq id %d, reaction id %d and reaction path %s is null",
                                                s, id, reactionPath.getId()));
                                        continue;
                                    }

                                    // odd sequence
                                    if (dnaSeq.length() <= 80 || dnaSeq.charAt(0) != 'M') {
                                        JSONObject metadata = sequenceInfo.getMetadata();

                                        if (!metadata.has("inferred_sequences")
                                                || metadata.getJSONArray("inferred_sequences").length() == 0) {
                                            continue;
                                        }

                                        JSONArray inferredSequences = metadata.getJSONArray("inferred_sequences");

                                        // get the first inferred sequence since it has the highest hmmer score
                                        JSONObject object = inferredSequences
                                                .getJSONObject(HIGHEST_SCORING_INFERRED_SEQ_INDEX);

                                        for (String blacklistWord : BLACKLISTED_WORDS_IN_INFERRED_SEQ) {
                                            if (object.getString("fasta_header").contains(blacklistWord)) {
                                                continue;
                                            }
                                        }

                                        dnaSeq = object.getString("sequence");
                                    }

                                    proteinSeqs.add(dnaSeq);
                                    ProteinInformation proteinInformation = new ProteinInformation(
                                            sequenceInfo.getOrgName(), sequenceInfo.getEc(),
                                            sequenceInfo.getSequence(), reaction.getReactionName());

                                    if (!proteinSeqToOrgInfo.containsKey(dnaSeq)) {
                                        proteinSeqToOrgInfo.put(dnaSeq, new HashSet<>());
                                    }
                                    proteinSeqToOrgInfo.get(dnaSeq).add(proteinInformation);
                                }
                            }
                        }
                    }
                }

                if (proteinSeqs.size() == 0) {
                    LOGGER.info("The reaction does not have any viable protein sequences");
                    atleastOneSeqMissingInPathway = true;
                    break;
                }

                // Now we select two representative protein seqs from the reaction. In order to do this deterministically,
                // we sort and pick the first and middle index protein seqs.
                List<String> proteinSeqArray = new ArrayList<>(proteinSeqs);
                Collections.sort(proteinSeqArray);

                int firstIndex = 0;
                int middleIndex = proteinSeqs.size() / 2;

                // get first seq
                Set<String> combination = new HashSet<>();
                combination.add(proteinSeqArray.get(firstIndex));

                // get middle index of the protein seq array
                if (proteinSeqs.size() > 1) {
                    combination.add(proteinSeqArray.get(middleIndex));
                }

                proteinPaths.add(combination);
            }

            if (atleastOneSeqMissingInPathway) {
                LOGGER.info(String.format("There is at least one reaction with no sequence in reaction path id: %s",
                        reactionPath.getId()));
            } else {
                LOGGER.info(String.format("All reactions in reaction path have at least one viable seq: %s",
                        reactionPath.getId()));

                // We only compute the dna design if we can find at least one sequence for each reaction in the pathway.
                Set<List<String>> pathwayProteinCombinations = makePermutations(proteinPaths);
                Set<DNAOrgECNum> dnaDesigns = new HashSet<>();

                for (List<String> proteinsInPathway : pathwayProteinCombinations) {
                    try {
                        Construct dna = p2d.computeDNA(proteinsInPathway, Host.Ecoli);

                        List<Set<ProteinInformation>> seqMetadata = new ArrayList<>();
                        for (String protein : proteinsInPathway) {
                            seqMetadata.add(proteinSeqToOrgInfo.get(protein));
                        }

                        DNAOrgECNum instance = new DNAOrgECNum(dna.toSeq(), seqMetadata, proteinsInPathway.size());
                        dnaDesigns.add(instance);
                    } catch (Exception ex) {
                        LOGGER.error("The error thrown while trying to call computeDNA", ex.getMessage());
                    }
                }

                DNADesign dnaDesignSeq = new DNADesign(dnaDesigns);
                try {
                    WriteResult<DNADesign, String> result = dnaDesignCollection.insert(dnaDesignSeq);
                    String id = result.getSavedId();
                    reactionPath.setDnaDesignRef(id);
                } catch (MongoInternalException e) {
                    // This condition happens whent the protein designs are too big and cannot be inserted in to the JSON object.
                    // TODO: Handle this case without dropping the record
                    LOGGER.error(String.format("Mongo internal exception caught while inserting dna design: %s",
                            e.getMessage()));
                }

            }

            outputPathwayCollection.insert(reactionPath);
        }
    }

    private static Long lookupMolecule(MongoDB db, String someKey) {
        if (REGEX_ID.matcher(someKey).find()) {
            // Note: this doesn't verify that the chemical id is valid.  Maybe we should do that?
            return Long.valueOf(someKey);
        } else if (REGEX_INCHI.matcher(someKey).find()) {
            Chemical c = db.getChemicalFromInChI(someKey);
            if (c != null) {
                return c.getUuid();
            }
        } else if (REGEX_INCHI_KEY.matcher(someKey).find()) {
            Chemical c = db.getChemicalFromInChIKey(someKey);
            if (c != null) {
                return c.getUuid();
            }
        } else {
            String msg = String.format("Unable to find key type for query '%s'", someKey);
            LOGGER.error(msg);
            throw new IllegalArgumentException(msg);
        }
        String msg = String.format("Unable to find matching chemical for query %s", someKey);
        LOGGER.error(msg);
        throw new IllegalArgumentException(msg);
    }
}