ubic.gemma.apps.Blat.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.apps.Blat.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.apps;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.net.Socket;
import java.net.UnknownHostException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.gemma.analysis.sequence.SequenceManipulation;
import ubic.gemma.analysis.sequence.SequenceWriter;
import ubic.gemma.loader.genome.BlatResultParser;
import ubic.gemma.model.common.description.DatabaseType;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResult;
import ubic.gemma.util.ConfigUtils;
import ubic.gemma.util.TimeUtil;
import ubic.gemma.util.concurrent.GenericStreamConsumer;

/**
 * Class to manage the gfServer and run BLAT searches.
 * 
 * @author pavlidis
 * @version $Id: Blat.java,v 1.45 2012/06/18 22:58:34 paul Exp $
 */
public class Blat {

    /**
     * The genomes this class knows how to select a gfServer port and sequence files for.
     */
    public static enum BlattableGenome {
        HUMAN, MOUSE, RAT
    }

    /**
     * Minimum alignment length for retention. Passed to gfClient as its <code>-minScore</code> option when the
     * client is run through exec().
     */
    private static final int MIN_SCORE = 16;

    /**
     * This value is basically a threshold fraction of aligned bases in the query. Hits below this score are simply not
     * reported. {@link BlatResult} has implementation of score computation.
     * 
     * @see BlatResult
     */
    public static final double DEFAULT_BLAT_SCORE_THRESHOLD = 0.7;

    /**
     * Value used for the <code>-stepSize</code> option when this class starts a gfServer.
     */
    public static final double STEPSIZE = 7;

    /**
     * How long (in milliseconds) to sleep between progress reports while waiting for a BLAT run to finish.
     */
    private static final int BLAT_UPDATE_INTERVAL_MS = 1000 * 30;

    private static final Log log = LogFactory.getLog(Blat.class);

    /**
     * Lower-cased value of the <code>os.name</code> system property; used to detect Windows.
     */
    private static String os = System.getProperty("os.name").toLowerCase();

    /**
     * Strings of As or Ts at the start or end of a sequence longer than this will be stripped off prior to analysis.
     */
    private static final int POLY_AT_THRESHOLD = 5;

    /**
     * Set once by the static initializer: true only when the native "Blat" library was loaded successfully.
     */
    private static boolean hasNativeLibrary;

    /*
     * Attempt to load the native "Blat" library once, at class initialisation. The attempt is skipped entirely on
     * Windows; a failed load is only a warning because the exec()-based client can be used instead.
     */
    static {
        boolean loaded = false;
        if (!os.startsWith("windows")) {
            try {
                log.debug("Loading gfClient library, looking in " + System.getProperty("java.library.path"));
                System.loadLibrary("Blat");
                log.info("Loaded Blat native library successfully");
                loaded = true;
            } catch (UnsatisfiedLinkError e) {
                log.warn("Unable to locate the native Blat library. "
                        + "This isn't a problem if you have the Blat binaries installed");
            }
        }
        hasNativeLibrary = loaded;
    }

    /**
     * Build a (non-persistent) description of the genome database that is searched for the given taxon.
     * 
     * @param taxon the taxon whose genome is searched
     * @return a new {@link ExternalDatabase} of type {@link DatabaseType#SEQUENCE}, named after the lower-cased
     *         genome constant (for example "mouse")
     * @throws UnsupportedOperationException if no BLAT database is known for the taxon
     */
    public static ExternalDatabase getSearchedGenome(Taxon taxon) {
        String databaseName = inferBlatDatabase(taxon).toString().toLowerCase();

        ExternalDatabase genomeDatabase = ExternalDatabase.Factory.newInstance();
        genomeDatabase.setType(DatabaseType.SEQUENCE);
        genomeDatabase.setName(databaseName);
        return genomeDatabase;
    }

    /**
     * Work out which BLAT genome database corresponds to a taxon, using its NCBI taxonomy id or, failing that, its
     * common name.
     * 
     * @param taxon the taxon to look up; must not be null
     * @return the matching genome
     * @throws IllegalArgumentException if taxon is null
     * @throws UnsupportedOperationException if the taxon is not mouse, rat or human
     */
    private static BlattableGenome inferBlatDatabase(Taxon taxon) {
        if (taxon == null) {
            throw new IllegalArgumentException("Taxon must not be null");
        }

        if (matchesTaxon(taxon, 10090, "mouse")) {
            return BlattableGenome.MOUSE;
        }
        if (matchesTaxon(taxon, 10116, "rat")) {
            return BlattableGenome.RAT;
        }
        if (matchesTaxon(taxon, 9606, "human")) {
            return BlattableGenome.HUMAN;
        }
        throw new UnsupportedOperationException("Cannot determine which database to search for " + taxon);
    }

    /**
     * Null-safe test of a taxon against an NCBI id and a common name. A taxon with no NCBI id and/or no common name
     * simply fails to match rather than causing a NullPointerException.
     */
    private static boolean matchesTaxon(Taxon taxon, int ncbiId, String commonName) {
        // Held as Object so that a null id is tolerated whatever numeric type getNcbiId() is declared to return.
        Object id = taxon.getNcbiId();
        boolean idMatches = id instanceof Number && ((Number) id).intValue() == ncbiId;
        return idMatches || commonName.equals(taxon.getCommonName());
    }

    // Score threshold handed to the BlatResultParser when PSL output is processed.
    private double blatScoreThreshold = DEFAULT_BLAT_SCORE_THRESHOLD;
    // Cleared by startServer when something is already listening on the port; stopServer is a no-op when false.
    private boolean doShutdown = true;

    // typical values. The no-argument constructor replaces these with values read from the configuration.
    private String gfClientExe = "/cygdrive/c/cygwin/usr/local/bin/gfClient.exe";
    private String gfServerExe = "/cygdrive/c/cygwin/usr/local/bin/gfServer.exe";
    private String host = "localhost";
    private String seqDir = "/";

    // Sequence file arguments appended to the gfServer start command, one setting per genome.
    private String humanSeqFiles;
    private String ratSeqFiles;
    private String mouseSeqFiles;

    // Handle on the gfServer process started by startServer, if any.
    private Process serverProcess;

    private int humanServerPort;

    private int mouseServerPort;

    private int ratServerPort;

    // Ports used instead of the ones above when a "sensitive" query is requested.
    private int humanSensitiveServerPort;

    private int mouseSensitiveServerPort;

    // private String humanServerHost;
    //
    // private String mouseServerHost;
    //
    // private String ratServerHost;

    private int ratSensitiveServerPort;

    /**
     * Create a Blat instance whose hosts, ports and executable paths are read from the global configuration.
     * 
     * @throws RuntimeException wrapping the {@link ConfigurationException} if the configuration cannot be read
     */
    public Blat() {
        try {
            init();
        } catch (ConfigurationException configProblem) {
            throw new RuntimeException("Could not load configuration", configProblem);
        }
    }

    /**
     * Create a Blat instance with explicit settings instead of reading the configuration. Only the human server
     * port is set; all other ports keep their default value of 0.
     * 
     * @param host host on which the gfServer runs; must not be null
     * @param humanServerPort port of the gfServer for the human genome; must be positive
     * @param seqDir sequence directory passed to gfClient; must not be null
     * @throws IllegalArgumentException if host or seqDir is null, or the port is not positive
     */
    public Blat(String host, int humanServerPort, String seqDir) {
        if (host == null || seqDir == null) {
            throw new IllegalArgumentException("host and seqDir must be non-null");
        }
        if (humanServerPort <= 0) {
            throw new IllegalArgumentException("humanServerPort must be positive, got " + humanServerPort);
        }
        this.host = host;
        this.humanServerPort = humanServerPort;
        this.seqDir = seqDir;
    }

    /**
     * Run a (non-sensitive) BLAT search for a single sequence, searching the genome of the taxon attached to the
     * sequence.
     * 
     * @param b the query sequence; its taxon must be set
     * @return Collection of BlatResult objects.
     * @throws IllegalArgumentException if the sequence has no taxon
     * @throws IOException
     */
    public Collection<BlatResult> blatQuery(BioSequence b) throws IOException {
        final Taxon sequenceTaxon = b.getTaxon();
        if (sequenceTaxon != null) {
            return blatQuery(b, sequenceTaxon, false);
        }
        throw new IllegalArgumentException("Cannot blat sequence unless taxon is given or inferrable");
    }

    /**
     * Run a BLAT search for a single sequence using the gfClient. The sequence (with leading/trailing poly-A/T runs
     * stripped) is written to a temporary FASTA file, searched, and the temporary files are removed again whether or
     * not the search succeeds.
     * 
     * @param b the query sequence
     * @param taxon the taxon whose genome is searched
     * @param sensitive if true use the more sensitive gfServer, if available.
     * @return Collection of BlatResult objects, each tagged with the searched database.
     * @throws IOException
     */
    public Collection<BlatResult> blatQuery(BioSequence b, Taxon taxon, boolean sensitive) throws IOException {
        assert seqDir != null;

        String seqName = b.getName().replaceAll(" ", "_");

        // File.createTempFile rejects prefixes shorter than three characters, so pad very short names. The FASTA
        // header below still uses the unpadded name.
        String filePrefix = seqName;
        while (filePrefix.length() < 3) {
            filePrefix = filePrefix + "_";
        }

        File querySequenceFile = File.createTempFile(filePrefix, ".fa");
        String outputPath = null;
        try {
            // write the sequence to the temporary file, making sure the writer is closed even on failure.
            BufferedWriter out = new BufferedWriter(new FileWriter(querySequenceFile));
            try {
                String trimmed = SequenceManipulation.stripPolyAorT(b.getSequence(), POLY_AT_THRESHOLD);
                out.write(">" + seqName + "\n" + trimmed);
            } finally {
                out.close();
            }
            log.info("Wrote sequence to " + querySequenceFile.getPath());

            outputPath = getTmpPslFilePath(filePrefix);

            Collection<BlatResult> results = gfClient(querySequenceFile, outputPath,
                    choosePortForQuery(taxon, sensitive));

            ExternalDatabase searchedDatabase = getSearchedGenome(taxon);
            for (BlatResult result : results) {
                result.setSearchedDatabase(searchedDatabase);
            }
            return results;
        } finally {
            // Always remove the temporary files, including when the search or parsing failed.
            if (outputPath != null) {
                cleanUpTmpFiles(querySequenceFile, outputPath);
            } else if (!querySequenceFile.delete()) {
                log.warn("Could not clean up temporary files.");
            }
        }
    }

    /**
     * Run a BLAT search for a batch of sequences in a single gfClient call and group the hits by query sequence.
     * Both temporary files (the query FASTA and the PSL output) are removed before returning, whether or not the
     * search succeeds.
     * 
     * @param sequences the query sequences
     * @param sensitive if true use the more sensitive gfServer, if available.
     * @param taxon The taxon whose database will be searched.
     * @return map of the query sequences (as reported in the results) to their blat result(s); sequences without
     *         hits have no entry
     * @throws IllegalArgumentException if no sequences were written to the query file
     * @throws IllegalStateException if no usable port is configured for the taxon/sensitivity combination
     * @throws IOException
     */
    public Map<BioSequence, Collection<BlatResult>> blatQuery(Collection<BioSequence> sequences, boolean sensitive,
            Taxon taxon) throws IOException {
        Map<BioSequence, Collection<BlatResult>> results = new HashMap<BioSequence, Collection<BlatResult>>();

        File querySequenceFile = File.createTempFile("sequences-for-blat", ".fa");
        String outputPath = null;
        try {
            int count = SequenceWriter.writeSequencesToFile(sequences, querySequenceFile);
            if (count == 0) {
                throw new IllegalArgumentException("No sequences!");
            }

            // The ports are plain ints, so an unconfigured port shows up as 0 rather than null.
            Integer port = choosePortForQuery(taxon, sensitive);
            if (port == null || port.intValue() <= 0) {
                throw new IllegalStateException("Could not locate port for BLAT with settings taxon=" + taxon
                        + ", sensitive=" + sensitive + ", check your configuration.");
            }

            outputPath = getTmpPslFilePath("blat-output");

            Collection<BlatResult> rawresults = gfClient(querySequenceFile, outputPath, port.intValue());

            log.info("Got " + rawresults.size() + " raw blat results");

            ExternalDatabase searchedDatabase = getSearchedGenome(taxon);

            for (BlatResult blatResult : rawresults) {
                blatResult.setSearchedDatabase(searchedDatabase);

                BioSequence query = blatResult.getQuerySequence();
                Collection<BlatResult> hitsForQuery = results.get(query);
                if (hitsForQuery == null) {
                    hitsForQuery = new HashSet<BlatResult>();
                    results.put(query, hitsForQuery);
                }
                hitsForQuery.add(blatResult);
            }
            return results;
        } finally {
            // Attempt both deletions independently; the results have already been parsed into memory.
            boolean queryDeleted = querySequenceFile.delete();
            boolean outputDeleted = outputPath == null || new File(outputPath).delete();
            if (!queryDeleted || !outputDeleted) {
                log.warn("Could not clean up temporary files.");
            }
        }
    }

    /**
     * Convenience overload: batch BLAT search against the regular (non-sensitive) server.
     * 
     * @param sequences the query sequences
     * @param taxon The taxon whose database will be searched.
     * @return map of query sequences to their blat result(s)
     * @throws IOException
     */
    public Map<BioSequence, Collection<BlatResult>> blatQuery(Collection<BioSequence> sequences, Taxon taxon)
            throws IOException {
        final boolean useSensitiveServer = false;
        return blatQuery(sequences, useSensitiveServer, taxon);
    }

    /**
     * @return the score threshold applied when BLAT output is parsed
     */
    public double getBlatScoreThreshold() {
        return blatScoreThreshold;
    }

    /**
     * @return the gfClient executable used when the client is run through exec()
     */
    public String getGfClientExe() {
        return gfClientExe;
    }

    /**
     * @return the gfServer executable used to start and stop servers
     */
    public String getGfServerExe() {
        return gfServerExe;
    }

    /**
     * @return the host name used for gfServer/gfClient calls
     */
    public String getHost() {
        return host;
    }

    /**
     * @return the port of the (non-sensitive) human gfServer
     */
    public int getHumanServerPort() {
        return humanServerPort;
    }

    /**
     * @return the port of the (non-sensitive) mouse gfServer
     */
    public int getMouseServerPort() {
        return mouseServerPort;
    }

    /**
     * @return the port of the (non-sensitive) rat gfServer
     */
    public int getRatServerPort() {
        return ratServerPort;
    }

    /**
     * @return the sequence directory passed to gfClient; also the working directory used when starting gfServer
     */
    public String getSeqDir() {
        return seqDir;
    }

    /**
     * Look up the sequence file setting for a genome.
     * 
     * @param genome the genome of interest
     * @return the sequence files configured for that genome; the human setting is also the fallback
     */
    public String getSeqFiles(BlattableGenome genome) {
        switch (genome) {
        case MOUSE:
            return mouseSeqFiles;
        case RAT:
            return ratSeqFiles;
        case HUMAN:
        default:
            return humanSeqFiles;
        }
    }

    // /**
    // * @param future
    // * @return
    // * @throws InterruptedException
    // * @throws ExecutionException
    // * @throws IOException
    // */
    // private StringBuilder getErrOutput( FutureTask<Process> future ) throws InterruptedException, ExecutionException,
    // IOException {
    // InputStream result = future.get().getErrorStream();
    // BufferedReader br = new BufferedReader( new InputStreamReader( result ) );
    // String l = null;
    // StringBuilder buf = new StringBuilder();
    // while ( ( l = br.readLine() ) != null ) {
    // buf.append( l + "\n" );
    // }
    // br.close();
    // return buf;
    // }

    /**
     * Parse BLAT output supplied as a stream, applying the current score threshold.
     * 
     * @param inputStream to the Blat output file in psl format
     * @param taxon taxon handed to the parser
     * @return processed results.
     * @throws IOException if the stream contains no data at all, or reading fails
     */
    public Collection<BlatResult> processPsl(InputStream inputStream, Taxon taxon) throws IOException {

        // InputStream.available() may legitimately return 0 for a stream that still has data, so detect an empty
        // stream by actually reading one byte and pushing it back for the parser.
        PushbackInputStream peekable = new PushbackInputStream(inputStream, 1);
        int firstByte = peekable.read();
        if (firstByte < 0) {
            throw new IOException("No data from the blat output file. Make sure the gfServer is running");
        }
        peekable.unread(firstByte);

        log.debug("Processing " + inputStream);
        BlatResultParser brp = new BlatResultParser();
        brp.setTaxon(taxon);
        brp.setScoreThreshold(this.blatScoreThreshold);
        brp.parse(peekable);
        return brp.getResults();
    }

    /**
     * Change the score threshold that is handed to the parser when BLAT output is processed.
     * 
     * @param blatScoreThreshold the blatScoreThreshold to set
     */
    public void setBlatScoreThreshold(double blatScoreThreshold) {
        this.blatScoreThreshold = blatScoreThreshold;
    }

    /**
     * Start the server, if the port isn't already being used. If the port is in use, we assume it is a gfServer and
     * remember not to shut it down later.
     * 
     * @param genome genome whose sequence files the server should load
     * @param port port the server should listen on
     * @throws IOException if the server process cannot be launched, or exits with a non-zero value right away
     * @throws RuntimeException if the configured host cannot be resolved
     */
    public void startServer(BlattableGenome genome, int port) throws IOException {
        boolean alreadyRunning;
        try {
            Socket probe = new Socket(host, port);
            try {
                // The connection was only a probe; release it straight away.
                probe.close();
            } catch (IOException ignored) {
                // failing to close the probe says nothing about whether a server is listening.
            }
            alreadyRunning = true;
        } catch (UnknownHostException e) {
            throw new RuntimeException("Unknown host " + host, e);
        } catch (IOException e) {
            // nothing accepted the connection, so there is no server yet.
            alreadyRunning = false;
        }

        if (alreadyRunning) {
            log.info("There is already a server on port " + port);
            this.doShutdown = false;
            return;
        }

        // STEPSIZE is declared as a double; format it without a fractional part ("7", not "7.0") since the
        // option is presumably parsed as an integer by gfServer -- see the gfServer usage message.
        String cmd = this.getGfServerExe() + " -canStop -stepSize=" + (int) STEPSIZE + " start " + this.getHost()
                + " " + port + " " + this.getSeqFiles(genome);
        log.info("Starting gfServer with command " + cmd);
        this.serverProcess = Runtime.getRuntime().exec(cmd, null, new File(this.getSeqDir()));

        try {
            // Give the process a moment; if it has already exited with an error, startup failed.
            Thread.sleep(100);
            int exit = serverProcess.exitValue();
            if (exit != 0) {
                throw new IOException("Could not start server");
            }
        } catch (IllegalThreadStateException e1) {
            // exitValue() throws while the process is still alive, which is the expected case.
            log.info("Server seems to have started");
        } catch (InterruptedException e1) {
            // preserve the interrupt for the caller instead of silently dropping it.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Stop the gfServer, if it was started by this. Does nothing when the server was found already running by
     * {@link #startServer} or when no server process was ever started. Failures are logged, not thrown.
     * 
     * @param port port of the server to stop
     */
    public void stopServer(int port) {
        if (!doShutdown || serverProcess == null) {
            return;
        }
        log.info("Shutting down gfServer");

        try {
            // this doesn't work unless the server was invoked with the option "-canStop"
            Process server = Runtime.getRuntime()
                    .exec(this.getGfServerExe() + " stop " + this.getHost() + " " + port);
            int exit = server.waitFor();
            log.info("Server on port " + port + " shut down with exit value " + exit);
        } catch (InterruptedException e) {
            log.error(e, e);
            // keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
        } catch (IOException e) {
            log.error(e, e);
        }
    }

    /**
     * Select the gfServer port for a query.
     * 
     * @param taxon taxon whose genome is to be searched
     * @param sensitive if true, pick the port of the "sensitive" server for that genome
     * @return the configured port; the human ports are also the fallback
     * @throws UnsupportedOperationException if no BLAT database is known for the taxon
     */
    private Integer choosePortForQuery(Taxon taxon, boolean sensitive) {
        switch (inferBlatDatabase(taxon)) {
        case MOUSE:
            return sensitive ? mouseSensitiveServerPort : mouseServerPort;
        case RAT:
            return sensitive ? ratSensitiveServerPort : ratServerPort;
        case HUMAN:
        default:
            return sensitive ? humanSensitiveServerPort : humanServerPort;
        }
    }

    /**
     * Delete the temporary query and output files of a BLAT run. Both deletions are always attempted; a warning is
     * logged if either fails.
     * 
     * @param querySequenceFile the temporary query file
     * @param outputPath path of the temporary output file
     */
    private void cleanUpTmpFiles(File querySequenceFile, String outputPath) {
        // Evaluate both deletions up front: combining them with a short-circuit || would skip the second file
        // whenever the first could not be deleted.
        boolean queryDeleted = querySequenceFile.delete();
        boolean outputDeleted = new File(outputPath).delete();
        if (!queryDeleted || !outputDeleted) {
            log.warn("Could not clean up temporary files.");
        }
    }

    /**
     * Run a gfClient query, using a call to exec(), and parse the PSL output it writes. While the client runs, the
     * size of the output file is logged every {@link #BLAT_UPDATE_INTERVAL_MS} milliseconds.
     * 
     * @param querySequenceFile file holding the query sequence(s)
     * @param outputPath path gfClient should write its output to
     * @param portToUse port of the gfServer to query
     * @return the parsed results
     * @throws IOException if the client cannot be started, exits with a non-zero value, or its output cannot be
     *         read
     * @throws RuntimeException wrapping the InterruptedException if the waiting thread is interrupted
     */
    private Collection<BlatResult> execGfClient(File querySequenceFile, String outputPath, int portToUse)
            throws IOException {
        // NOTE(review): the command is split on whitespace by Runtime.exec, so paths containing spaces will not
        // survive; left unchanged here.
        final String cmd = gfClientExe + " -nohead -minScore=" + MIN_SCORE + " " + host + " " + portToUse + " "
                + seqDir + " " + querySequenceFile.getAbsolutePath() + " " + outputPath;
        log.info(cmd);

        final Process run = Runtime.getRuntime().exec(cmd);

        // to ensure that we aren't left waiting for these streams
        GenericStreamConsumer gscErr = new GenericStreamConsumer(run.getErrorStream());
        GenericStreamConsumer gscIn = new GenericStreamConsumer(run.getInputStream());
        gscErr.start();
        gscIn.start();

        // Poll often so that a finished run is noticed promptly, but only report progress at the (much longer)
        // update interval.
        final long pollIntervalMs = 250;

        StopWatch overallWatch = new StopWatch();
        overallWatch.start();
        long lastReport = System.currentTimeMillis();

        int exitVal = 0;
        boolean finished = false;
        try {
            while (!finished) {
                try {
                    exitVal = run.exitValue();
                    finished = true;
                } catch (IllegalThreadStateException stillRunning) {
                    // exitValue() throws while the process is alive: wait a little and look again.
                    Thread.sleep(pollIntervalMs);
                    long now = System.currentTimeMillis();
                    if (now - lastReport >= BLAT_UPDATE_INTERVAL_MS) {
                        lastReport = now;
                        long size = new File(outputPath).length();
                        NumberFormat nf = new DecimalFormat();
                        nf.setMaximumFractionDigits(2);
                        String minutes = TimeUtil.getMinutesElapsed(overallWatch);
                        log.info("BLAT output so far: " + nf.format(size / 1024.0) + " kb (" + minutes
                                + " minutes elapsed)");
                    }
                }
            }
        } catch (InterruptedException e) {
            // Don't leave the client running, and keep the interrupt visible to callers.
            run.destroy();
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }

        overallWatch.stop();
        String minutes = TimeUtil.getMinutesElapsed(overallWatch);
        log.info("Blat took a total of " + minutes + " minutes");

        log.debug("blat exit value=" + exitVal);
        if (exitVal != 0) {
            throw new IOException("gfClient exited with value " + exitVal + "; command was: " + cmd);
        }
        log.debug("GfClient Success");

        return processPsl(outputPath, null);
    }

    /**
     * Create an empty temporary ".psl" file in the configured download directory and return its path.
     * 
     * @param base prefix for the file name; "blat-output" is used when blank, and prefixes shorter than three
     *        characters are padded with underscores
     * @return path of the newly created file
     * @throws IOException if the file cannot be created
     */
    private String getTmpPslFilePath(String base) throws IOException {
        File tmpdir = new File(ConfigUtils.getDownloadPath());

        String prefix = StringUtils.isBlank(base) ? "blat-output" : base;
        // File.createTempFile throws IllegalArgumentException for prefixes shorter than three characters.
        while (prefix.length() < 3) {
            prefix = prefix + "_";
        }
        return File.createTempFile(prefix, ".psl", tmpdir).getPath();
    }

    /**
     * Run the BLAT client, through the native library when it was loaded and through exec() otherwise.
     * 
     * @param querySequenceFile file holding the query sequence(s)
     * @param outputPath path the client should write its output to
     * @param portToUse port of the gfServer to query
     * @return processed results.
     * @throws IOException
     */
    private Collection<BlatResult> gfClient(File querySequenceFile, String outputPath, int portToUse)
            throws IOException {
        return hasNativeLibrary ? jniGfClientCall(querySequenceFile, outputPath, portToUse)
                : execGfClient(querySequenceFile, outputPath, portToUse);
    }

    /**
     * Native client call; the implementation is expected to come from the "Blat" library loaded in the static
     * initializer. Arguments as supplied by the caller in this class:
     * 
     * @param h host of the gfServer
     * @param p port of the gfServer, as a string
     * @param dir sequence directory (seqDir)
     * @param input path of the query sequence file
     * @param output path the output is written to
     */
    private native void GfClientCall(String h, String p, String dir, String input, String output);

    /**
     * Populate ports, host, sequence locations and executable paths from the global configuration.
     * 
     * @throws ConfigurationException if no gfClient executable is configured when running under Windows
     */
    private void init() throws ConfigurationException {

        log.debug("Reading global config");

        // regular servers
        humanServerPort = ConfigUtils.getInt("gfClient.humanServerPort");
        mouseServerPort = ConfigUtils.getInt("gfClient.mouseServerPort");
        ratServerPort = ConfigUtils.getInt("gfClient.ratServerPort");

        // "sensitive" servers
        humanSensitiveServerPort = ConfigUtils.getInt("gfClient.sensitive.humanServerPort");
        mouseSensitiveServerPort = ConfigUtils.getInt("gfClient.sensitive.mouseServerPort");
        ratSensitiveServerPort = ConfigUtils.getInt("gfClient.sensitive.ratServerPort");

        // host, sequence locations and executables
        host = ConfigUtils.getString("gfClient.host");
        seqDir = ConfigUtils.getString("gfClient.seqDir");
        mouseSeqFiles = ConfigUtils.getString("gfClient.mouse.seqFiles");
        ratSeqFiles = ConfigUtils.getString("gfClient.rat.seqFiles");
        humanSeqFiles = ConfigUtils.getString("gfClient.human.seqFiles");
        gfClientExe = ConfigUtils.getString("gfClient.exe");
        gfServerExe = ConfigUtils.getString("gfServer.exe");

        if (gfServerExe == null) {
            // Starting the server from here won't ever really work -- it's left over from earlier iterations.
            log.warn("You will not be able to start the server: gfServer.exe is not set in config");
        }

        final boolean onWindows = os.startsWith("windows");
        if (onWindows && gfClientExe == null) {
            throw new ConfigurationException("BLAT client calls will not work under windows.");
        }
    }

    /**
     * Run a gfClient query through the native library on a background thread, logging the size of the output file
     * every {@link #BLAT_UPDATE_INTERVAL_MS} milliseconds while it runs, then parse the PSL output. If the native
     * method turns out to be unavailable, the query is run through exec() instead.
     * 
     * @param querySequenceFile file holding the query sequence(s)
     * @param outputPath path the client should write its output to
     * @param portToUse port of the gfServer to query
     * @return processed results.
     * @throws IOException if the output cannot be read
     * @throws RuntimeException if the waiting thread is interrupted or the native call fails
     */
    private Collection<BlatResult> jniGfClientCall(final File querySequenceFile, final String outputPath,
            final int portToUse) throws IOException {
        log.debug("Starting blat run");

        FutureTask<Boolean> blatThread = new FutureTask<Boolean>(new Callable<Boolean>() {
            @Override
            public Boolean call() {
                GfClientCall(host, Integer.toString(portToUse), seqDir, querySequenceFile.getPath(), outputPath);
                return true;
            }
        });

        ExecutorService executor = Executors.newSingleThreadExecutor();
        executor.execute(blatThread);
        executor.shutdown();

        StopWatch overallWatch = new StopWatch();
        overallWatch.start();

        boolean done = false;
        while (!done) {
            try {
                // get() returns as soon as the task finishes and, unlike polling isDone(), surfaces anything the
                // native call threw on the worker thread.
                blatThread.get(BLAT_UPDATE_INTERVAL_MS, TimeUnit.MILLISECONDS);
                done = true;
            } catch (TimeoutException stillRunning) {
                long size = new File(outputPath).length();
                NumberFormat nf = new DecimalFormat();
                nf.setMaximumFractionDigits(2);
                String minutes = TimeUtil.getMinutesElapsed(overallWatch);
                log.info("BLAT output so far: " + nf.format(size / 1024.0) + " kb (" + minutes
                        + " minutes elapsed)");
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(ie);
            } catch (ExecutionException ee) {
                Throwable cause = ee.getCause();
                if (cause instanceof UnsatisfiedLinkError) {
                    log.error(cause, cause);
                    log.info("Falling back on exec()");
                    // execGfClient already parses the output; return its results rather than parsing twice.
                    return this.execGfClient(querySequenceFile, outputPath, portToUse);
                }
                if (cause instanceof RuntimeException) {
                    throw (RuntimeException) cause;
                }
                if (cause instanceof Error) {
                    throw (Error) cause;
                }
                throw new RuntimeException(cause);
            }
        }

        overallWatch.stop();
        String minutes = TimeUtil.getMinutesElapsed(overallWatch);
        log.info("Blat took a total of " + minutes + " minutes");

        return this.processPsl(outputPath, null);
    }

    /**
     * Parse a BLAT output file, applying the current score threshold.
     * 
     * @param filePath to the Blat output file in psl format
     * @param taxon taxon handed to the parser
     * @return processed results.
     * @throws IOException
     */
    private Collection<BlatResult> processPsl(String filePath, Taxon taxon) throws IOException {
        log.debug("Processing " + filePath);

        final BlatResultParser parser = new BlatResultParser();
        parser.setTaxon(taxon);
        parser.setScoreThreshold(blatScoreThreshold);
        parser.parse(filePath);

        return parser.getResults();
    }

}