gov.jgi.meta.exec.BlastCommand.java Source code

Java tutorial

Introduction

Here is the source code for gov.jgi.meta.exec.BlastCommand.java

Source

/*
 * Copyright (c) 2010, The Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
 * of Energy, nor the names of its contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the
 * features, functionality or performance of the source code ("Enhancements") to anyone; however,
 * if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley
 * National Laboratory, without imposing a separate written license agreement for such Enhancements,
 * then you hereby grant the following license: a  non-exclusive, royalty-free perpetual license to install,
 * use, modify, prepare derivative works, incorporate into other computer software, distribute, and
 * sublicense such enhancements or derivative works thereof, in binary and source code form.
 */

package gov.jgi.meta.exec;

import gov.jgi.meta.MetaUtils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

import com.devdaily.system.SystemCommandExecutor;

/**
 * Class that wraps execution of the command-line Blast program.
 *
 * Use this by creating a new BlastCommand object, then running exec, e.g.:
 *    blastCmd = new BlastCommand(context.getConfiguration());
 *    s = blastCmd.exec(value, geneDBFilePath);
 *    blastCmd.cleanup();
 *
 * value is a Map<String, String> of <readids, sequences> and geneDBFilePath
 * is a file path in hdfs.  This object creates local files, then does a
 * system exec.  stdout is returned as a set of strings representing each
 * line in the output.
 *
 */
public class BlastCommand {
    // Default blastall options:
    // -p program name
    // -m 8 alignment view options - tabular
    // -b 1000000 number of sequences to show alignments of (max number)
    // -a 10 number of processors to use
    // Options appended at execution time:
    // -d blast database directory where formatdb was run
    // -i input sequence
    String DEFAULTCOMMANDLINE = "-m 8 -p tblastn -b 1000000 -a 10";
    String DEFAULTCOMMANDPATH = "blastall"; // by default it should be in the path
    String DEFAULTTMPDIR = "/tmp/blast";

    String DEFAULTFORMATDBCOMMANDLINE = "-o T -p F";
    String DEFAULTFORMATDBCOMMANDPATH = "formatdb"; // by default it should be in the path

    /**
     * logger
     */
    Logger log = Logger.getLogger(BlastCommand.class);

    /**
     * the commandline to execute (all options except the input/output)
     */
    String commandLine = DEFAULTCOMMANDLINE;

    /**
     * the location of the executable in the filesystem
     */
    String commandPath = DEFAULTCOMMANDPATH;

    /**
     * the options passed to formatdb (all options except the input/output)
     */
    String formatdbCommandLine = DEFAULTFORMATDBCOMMANDLINE;

    /**
     * the location of the formatdb executable in the filesystem
     */
    String formatdbCommandPath = DEFAULTFORMATDBCOMMANDPATH;

    /**
     * temporary directory to use for intermediate files
     */
    String tmpDir = DEFAULTTMPDIR;

    /**
     * the per-instance working directory created inside tmpDir; set to
     * null once it has been removed by {@link #cleanup()}
     */
    File tmpDirFile = null;

    /**
     * the shell command string built by the most recent call to exec
     */
    public StringBuffer commandString = null;

    /**
     * the contents of the stdout after executing the command
     */
    String stdout = null;

    /**
     * the contents of the stderr after executing the command
     */
    String stderr = null;

    /**
     * the shell return value after executing the command
     */
    int exitValue = 0;

    /**
     * flag to leave working directories alone (if false) or to remove
     * them after execution (if true)
     */
    Boolean docleanup = true;

    /**
     * the effective blast database size
     */
    long effectiveSize = 0;

    /**
     * size in bytes of the sequence file most recently written for formatdb
     */
    long databasesize = 0;

    // NOTE: useScaledEValue and useEValue are read from the configuration but
    // are not passed to blast; the "-e" option is not part of the command
    // built by exec.
    Boolean useScaledEValue = false;
    Boolean useEffectiveSize = false;
    float useEValue = 10;

    /**
     * new blast command based on default parameters
     *
     * @throws IOException if the working directory can not be created
     */
    public BlastCommand() throws IOException {
        tmpDirFile = MetaUtils.createTempDir("blast_", tmpDir);
    }

    /**
     * new blast command based on values stored in the configuration.
     * <p/>
     * Looks for the following config values: blast.commandline,
     * blast.commandpath, formatdb.commandline, formatdb.commandpath,
     * blast.tmpdir, blast.cleanup, blast.effectivedatabasesize,
     * blast.usescaledevalue, blast.useeffectivesize and blast.useevalue
     *
     * @param config is the hadoop configuration with overriding values
     *               for commandline options and paths
     * @throws IOException if the working directory can not be created
     */
    public BlastCommand(Configuration config) throws IOException {
        log.info("initializing");
        String c;

        log.info("initializing new blast command");

        if ((c = config.get("blast.commandline")) != null) {
            commandLine = c;
        }
        if ((c = config.get("blast.commandpath")) != null) {
            commandPath = c;
        }
        if ((c = config.get("formatdb.commandline")) != null) {
            formatdbCommandLine = c;
        }
        if ((c = config.get("formatdb.commandpath")) != null) {
            formatdbCommandPath = c;
        }

        if ((c = config.get("blast.tmpdir")) != null) {
            tmpDir = c;
        }

        docleanup = config.getBoolean("blast.cleanup", true);

        effectiveSize = config.getLong("blast.effectivedatabasesize", 0);
        useScaledEValue = config.getBoolean("blast.usescaledevalue", false);
        useEffectiveSize = config.getBoolean("blast.useeffectivesize", false);
        useEValue = config.getFloat("blast.useevalue", 10F);

        /*
         * do sanity check to make sure all paths exist.  Only real paths are
         * checked: commandLine is an option string, not a file, so it is not
         * passed to checkFileExists.
         */
        checkFileExists(commandPath);
        checkDirExists(tmpDir);

        /*
         * if all is good, create a working space inside tmpDir
         */
        tmpDirFile = MetaUtils.createTempDir("blast_", tmpDir);

        log.info("done initializing: tmp dir = " + tmpDirFile);
    }

    /**
     * @return the working directory of this command, or null once it has
     *         been removed by {@link #cleanup()}
     */
    public File getTmpDir() {
        return tmpDirFile;
    }

    /**
     * placeholder for a check that a file exists; currently performs no check.
     *
     * @param filePath the path that is meant to be checked
     * @return always 0
     * @throws IOException never thrown by the current implementation
     */
    private int checkFileExists(String filePath) throws IOException {
        // TODO: fill out this function
        return (0);
    }

    /**
     * placeholder for a check that a directory exists; currently performs no check.
     *
     * @param filePath the path that is meant to be checked
     * @return always 0
     * @throws IOException never thrown by the current implementation
     */
    private int checkDirExists(String filePath) throws IOException {
        // TODO: fill out this function
        return (0);
    }

    /**
     * destructor deletes the tmp space if it was created
     *
     * @throws Throwable
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            /*
             * delete the tmp files if they exist
             */
            this.cleanup();
        } finally {
            // always chain to the superclass, even if cleanup fails
            super.finalize();
        }
    }

    /**
     * clean up any tmp files.  Does nothing to the filesystem when the
     * cleanup flag is false or the working directory was already removed.
     */
    public void cleanup() {
        log.info("cleaning up");
        if (this.docleanup && tmpDirFile != null) {
            MetaUtils.recursiveDelete(tmpDirFile);
            tmpDirFile = null;
        }
        log.info("finished cleaning up");
    }

    /**
     * given a list of sequences, creates a db for use with blast using
     * formatdb executable
     *
     * @param seqList is the list of sequences to create the database with
     * @return the full path of the location of the database
     * @throws IOException if it can't read or write to the files, or if
     *                     formatdb exits with a non-zero value
     * @throws InterruptedException if system.exec is interrupted.
     */
    private String execFormatDB(Map<String, String> seqList) throws IOException, InterruptedException {
        log.info("running formatdb");

        log.debug("blastcmd: formating the sequence db using formatdb");
        log.debug("blastcmd: using temp directory " + tmpDirFile.getPath());

        File seqFile = new File(tmpDirFile, "seqfile");

        /*
         * write out the sequences to a temp file in fasta layout; the writer
         * is closed in a finally block so it is not leaked when a write fails
         */
        BufferedWriter out = new BufferedWriter(new FileWriter(seqFile));
        try {
            for (Map.Entry<String, String> entry : seqList.entrySet()) {
                assert (entry.getValue() != null);
                out.write(">" + entry.getKey() + "\n");
                out.write(entry.getValue() + "\n");
            }
        } finally {
            out.close();
        }

        databasesize = seqFile.length();

        log.debug("blastcmd: done writing sequence file");

        /*
         * execute formatdb command
         */
        List<String> commands = new ArrayList<String>();
        commands.add("/bin/sh");
        commands.add("-c");
        commands.add(
                "export TMPDIR=" + tmpDirFile.getPath() + "; cd " + tmpDirFile.getPath() + ";" + formatdbCommandPath
                        + " " + formatdbCommandLine + " -i " + seqFile.getPath() + " -n " + "seqfile");

        log.info("blastcmd: formatdbcommand = " + commands);

        SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands);
        int r = commandExecutor.executeCommand();
        String formatdbStderr = commandExecutor.getStandardErrorFromCommand().toString();

        log.debug("return value = " + r);
        log.debug("stdout = " + commandExecutor.getStandardOutputFromCommand().toString());
        log.debug("stderr = " + formatdbStderr);

        log.info("done with formatdb, exit value = " + r);

        if (r != 0) {
            // without a database blast can not produce a meaningful result,
            // so fail here with the formatdb diagnostics
            log.error("formatdb executable exit value = " + r + ": " + formatdbStderr);
            throw new IOException("formatdb executable exit value = " + r + ": " + formatdbStderr);
        }

        return (seqFile.getPath());
    }

    /**
     * copies a file from DFS to local working directory, line by line.
     * Every line is written with a trailing "\n".
     *
     * @param dfsPath is the pathname to a file in DFS
     * @return the path of the new file in local scratch space
     * @throws IOException if it can't access the files
     */
    private String copyDBFile(String dfsPath) throws IOException {
        log.info("copying database to file");

        // NOTE(review): a fresh Configuration is used here rather than the one
        // given to the constructor - confirm this resolves the intended filesystem
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path filenamePath = new Path(dfsPath);
        File localFile = new File(tmpDirFile, filenamePath.getName());

        if (!fs.exists(filenamePath)) {
            throw new IOException("file not found: " + dfsPath);
        }

        FSDataInputStream in = fs.open(filenamePath);
        BufferedWriter out = null;
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in));
            out = new BufferedWriter(new FileWriter(localFile.getPath()));

            String line;
            while ((line = reader.readLine()) != null) {
                out.write(line + "\n");
            }
        } finally {
            // close both ends even when the copy or the first close fails
            try {
                if (out != null) {
                    out.close();
                }
            } finally {
                in.close();
            }
        }

        log.info("done with data copy");
        return (localFile.getPath());
    }

    /**
     * execute the blast command and return the non-empty lines of its output
     *
     * @param seqMap is the key/value map of sequences that act as reference keyed by name
     * @param cazyEC  is the full path (in DFS) of the cazy database to search against the reference
     * @return the set of non-empty lines that blast wrote to stdout
     * @throws IOException if it can't access files for whatever reason, or if
     *                     formatdb or blast exits with a non-zero value
     * @throws InterruptedException if system.exec is interrupted
     */
    public Set<String> exec(Map<String, String> seqMap, String cazyEC) throws IOException, InterruptedException {
        log.info("starting exec");
        String seqDir = execFormatDB(seqMap);

        if (seqDir == null) {
            /*
             * didn't run formatdb, so return with fail
             */
            throw new IOException("blast can't run... formatdb did not process the inputs");
        }

        String localCazyEC = copyDBFile(cazyEC);

        commandString = new StringBuffer();
        commandString.append("cd " + tmpDirFile.getPath() + "; " + commandPath + " " + commandLine);
        if (this.useEffectiveSize) {
            commandString.append(" -z " + effectiveSize);
        }
        // the e-value cutoff is intentionally not passed ("-e" is not used)
        commandString.append(" -d " + seqDir + " -i " + localCazyEC);

        log.info("command = " + commandString.toString());

        List<String> commands = new ArrayList<String>();
        commands.add("/bin/sh");
        commands.add("-c");
        commands.add(commandString.toString());

        SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands);
        exitValue = commandExecutor.executeCommand();

        // stdout and stderr of the command are returned as StringBuilder objects
        stdout = commandExecutor.getStandardOutputFromCommand().toString();
        stderr = commandExecutor.getStandardErrorFromCommand().toString();

        // keep a copy of the raw output in the working directory
        File blast_output = new File(tmpDirFile.getPath() + "/blast_output.txt");
        FileUtils.writeStringToFile(blast_output, stdout);

        log.debug("exit = " + exitValue);
        log.debug("stderr = " + stderr);

        if (exitValue != 0) {
            log.error("blast executable exit value = " + exitValue + ": " + stderr);
            throw new IOException("blast executable exit value = " + exitValue + ": " + stderr);
        }

        /*
         * now parse the output: one entry per line.  Empty lines are skipped
         * so that empty output yields an empty set instead of a set holding
         * a single empty string.
         */
        Set<String> s = new HashSet<String>();
        for (String line : stdout.split("\n")) {
            if (line.length() > 0) {
                s.add(line);
            }
        }

        log.info("done with exec, exit value = " + exitValue);
        return (s);
    }

}