com.act.biointerpretation.sars.SeqDBReactionGrouper.java Source code

Java tutorial

Introduction

Here is the source code for com.act.biointerpretation.sars.SeqDBReactionGrouper.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sars;

import act.server.MongoDB;
import act.shared.Seq;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * A sequence grouper that iterates over the seq DB and groups only seq entries that have exactly the same sequence.
 *
 * <p>Each distinct sequence string yields one {@link ReactionGroup}. The group is named after the first Seq entry
 * encountered with that sequence, and collects the reaction ids reported by every Seq entry sharing the sequence.
 */
public class SeqDBReactionGrouper {

    private static final Logger LOGGER = LogManager.getFormatterLogger(SeqDBReactionGrouper.class);

    private static final String OPTION_DB = "db";
    private static final String OPTION_OUTPUT_PATH = "o";
    private static final String OPTION_LIMIT = "l";
    private static final String OPTION_HELP = "h";

    public static final String HELP_MESSAGE = "This class is used to generate reaction groups by scanning the seq DB for sequences that point to multiple "
            + "reactions.  Options are supplied to indicate how far into the DB to scan, which DB to use, and where to "
            + "write the output.";

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_DB).argName("db name").desc("The name of the mongo DB to use.").hasArg()
                    .longOpt("db-name").required(true));
            add(Option.builder(OPTION_OUTPUT_PATH).argName("output file path").desc(
                    "The absolute path to the file to which to write the json file of the reaction group corpus.")
                    .hasArg().longOpt("output-file-path").required(true));
            add(Option.builder(OPTION_LIMIT).argName("seq limit").desc(
                    "The maximum number of seq entries to process. This is useful because running on the entire DB can "
                            + "require a lot of time and memory.")
                    .hasArg().longOpt("seq-limit").type(Integer.class));
            add(Option.builder(OPTION_HELP).argName("help").desc("Prints this help message.").longOpt("help"));
        }
    };

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    private static final String LOCAL_HOST = "localhost";
    private static final Integer MONGO_PORT = 27017;
    // Sentinel meaning "no limit": the scan stops only when the iterator is exhausted (or after MAX_VALUE entries).
    private static final Integer DEFAULT_LIMIT_INFINITY = Integer.MAX_VALUE;

    /**
     * Command line entry point: parses the options, scans the seq DB, and writes the resulting reaction group
     * corpus as json to the requested output file.
     *
     * <p>The process exits with status 1 if the arguments cannot be parsed, if the seq limit is not a non-negative
     * integer, or if the output path already exists.
     *
     * @param args The command line arguments; see {@link #OPTION_BUILDERS}.
     * @throws Exception If the output file cannot be created or an error propagates from scanning or writing.
     */
    public static void main(String[] args) throws Exception {
        // Build command line parser.
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s", e.getMessage());
            HELP_FORMATTER.printHelp(SeqDBReactionGrouper.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
            return; // Not reached at runtime; lets the compiler prove cl is assigned below.
        }

        // Print help.
        if (cl.hasOption(OPTION_HELP)) {
            HELP_FORMATTER.printHelp(SeqDBReactionGrouper.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            return;
        }

        // Validate the limit before connecting to the DB or creating the output file, so that a malformed value
        // fails fast with a readable message and leaves nothing behind.
        boolean limitSupplied = cl.hasOption(OPTION_LIMIT);
        int limit = DEFAULT_LIMIT_INFINITY;
        if (limitSupplied) {
            String rawLimit = cl.getOptionValue(OPTION_LIMIT);
            try {
                limit = Integer.parseInt(rawLimit);
            } catch (NumberFormatException e) {
                LOGGER.error("Seq limit must be an integer, but got: %s", rawLimit);
                System.exit(1);
                return;
            }
            if (limit < 0) {
                LOGGER.error("Seq limit must be non-negative, but got: %d", limit);
                System.exit(1);
                return;
            }
        }

        // Handle arguments
        String mongoDBName = cl.getOptionValue(OPTION_DB);
        MongoDB mongoDB = new MongoDB(LOCAL_HOST, MONGO_PORT, mongoDBName);

        File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_PATH));
        // exists() is also true for directories, so no separate isDirectory() check is needed. createNewFile()
        // returns false if the path came into existence between the check and the creation.
        if (outputFile.exists() || !outputFile.createNewFile()) {
            LOGGER.error("Supplied output file is a directory or already exists.");
            System.exit(1);
            return;
        }

        if (limitSupplied) {
            LOGGER.info("Only processing first %d entries in Seq DB.", limit);
        } else {
            LOGGER.info("No seq limit supplied; processing all entries in Seq DB.");
        }

        SeqDBReactionGrouper enzymeGrouper = new SeqDBReactionGrouper(mongoDB.getSeqIterator(), mongoDBName, limit);

        LOGGER.info("Scanning seq db for reactions with same seq.");
        ReactionGroupCorpus groupCorpus = enzymeGrouper.getReactionGroupCorpus();

        LOGGER.info("Writing output to file.");
        groupCorpus.printToJsonFile(outputFile);

        LOGGER.info("Complete!");
    }

    final Integer limit;
    final String dbName;
    final Iterator<Seq> seqIterator;

    /**
     * Builds a SeqDBReactionGrouper for the given Seq entries.
     *
     * @param seqIterator The Seq entries to group.
     * @param dbName The DB name passed to every ReactionGroup this grouper creates.
     * @param limit The maximum number of entries to process. This can be used to limit memory and time.
     *              A null value is treated as "no limit".
     */
    public SeqDBReactionGrouper(Iterator<Seq> seqIterator, String dbName, Integer limit) {
        this.seqIterator = seqIterator;
        this.dbName = dbName;
        // A null limit would otherwise only surface later as a NullPointerException on unboxing during the scan.
        this.limit = (limit == null) ? DEFAULT_LIMIT_INFINITY : limit;
    }

    /**
     * Builds a SeqDBReactionGrouper for the given Seq entries, with no limit on the number of entries processed.
     *
     * @param seqIterator The Seq entries to group.
     * @param dbName The DB name passed to every ReactionGroup this grouper creates.
     */
    public SeqDBReactionGrouper(Iterator<Seq> seqIterator, String dbName) {
        this(seqIterator, dbName, DEFAULT_LIMIT_INFINITY);
    }

    /**
     * Returns the collection of SeqGroups produced by running this grouper on the Seq entries from the DB.
     * This advances the iterator supplied at construction, so a later call only sees entries not yet consumed.
     * TODO: Implement this in a way that doesn't store the whole map in memory at the same time.
     *
     * @return The collection of produced SeqGroups.
     */
    public ReactionGroupCorpus getReactionGroupCorpus() {
        Map<String, ReactionGroup> sequenceToReactionGroupMap = getSequenceToReactionGroupMap(seqIterator);
        LOGGER.info("Done getting seq group map, found %d distinct SeqGroups.", sequenceToReactionGroupMap.size());
        return new ReactionGroupCorpus(sequenceToReactionGroupMap.values());
    }

    /**
     * Iterates over seq entries and builds a map from unique sequences to ReactionGroup objects that collect the
     * Reaction ids of every Seq entry with that sequence. Stops after {@code limit} entries or when the iterator
     * is exhausted, whichever comes first.
     *
     * @param seqIterator The Seq entries to scan; advanced by at most {@code limit} entries.
     * @return A map from each distinct sequence string to the ReactionGroup built for it.
     */
    private Map<String, ReactionGroup> getSequenceToReactionGroupMap(Iterator<Seq> seqIterator) {
        Map<String, ReactionGroup> sequenceToReactionGroupMap = new HashMap<>();

        // Primitive locals avoid boxing and unboxing the counter on every iteration.
        final int maxEntries = limit;
        int counter = 0;
        // The limit is tested first so the iterator is not queried again once the limit has been reached.
        while (counter < maxEntries && seqIterator.hasNext()) {
            if (counter % 1000 == 0) {
                LOGGER.info("Processed %d seq entries so far", counter);
            }

            Seq seq = seqIterator.next();
            String sequence = seq.getSequence();

            ReactionGroup group = sequenceToReactionGroupMap.get(sequence);

            if (group == null) {
                // The group is named after the first Seq entry seen with this sequence.
                group = new ReactionGroup("SEQ_ID_" + Integer.toString(seq.getUUID()), dbName);
                sequenceToReactionGroupMap.put(sequence, group);
            }

            for (Long reactionId : seq.getReactionsCatalyzed()) {
                group.addReactionId(reactionId);
            }
            counter++;
        }

        return sequenceToReactionGroupMap;
    }
}