ubic.gemma.core.apps.ShellDelegatingBlat.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.apps.ShellDelegatingBlat.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.apps;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import ubic.gemma.core.analysis.sequence.SequenceManipulation;
import ubic.gemma.core.analysis.sequence.SequenceWriter;
import ubic.gemma.core.loader.genome.BlatResultParser;
import ubic.gemma.core.util.TimeUtil;
import ubic.gemma.core.util.concurrent.GenericStreamConsumer;
import ubic.gemma.model.common.description.DatabaseType;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResult;
import ubic.gemma.persistence.util.EntityUtils;
import ubic.gemma.persistence.util.Settings;

import java.io.*;
import java.net.Socket;
import java.net.UnknownHostException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;

/**
 * Class to manage the gfServer and run BLAT searches. Delegates to the command-line shell to run blat.
 *
 * @author pavlidis
 */
@SuppressWarnings("unused") // Possible external use
public class ShellDelegatingBlat implements Blat {

    /**
     * Time, in milliseconds, that the client-running methods sleep between progress log messages while a BLAT run
     * is in flight.
     */
    private static final int BLAT_UPDATE_INTERVAL_MS = 1000 * 30;
    private static final Log log = LogFactory.getLog(ShellDelegatingBlat.class);
    /**
     * Minimum alignment length for retention. Passed to gfClient as the value of its {@code -minScore} option.
     */
    private static final int MIN_SCORE = 16;
    /**
     * Strings of As or Ts at the start or end of a sequence longer than this will be stripped off prior to analysis.
     */
    private static final int POLY_AT_THRESHOLD = 5;
    /**
     * Lower-cased value of the {@code os.name} system property; consulted by {@code init()} to detect Windows.
     */
    private static final String os = System.getProperty("os.name").toLowerCase();
    /**
     * Score threshold handed to the result parser; results below it are skipped.
     */
    private double blatScoreThreshold = Blat.DEFAULT_BLAT_SCORE_THRESHOLD;
    /**
     * Cleared by {@code startServer} when something is already listening on the requested port; while false,
     * {@code stopServer} returns without doing anything.
     */
    private boolean doShutdown = true;
    // Typical values for the executables; the no-argument constructor replaces them with configured values.
    private String gfClientExe = "/cygdrive/c/cygwin/usr/local/bin/gfClient.exe";
    private String gfServerExe = "/cygdrive/c/cygwin/usr/local/bin/gfServer.exe";
    // Host that gfClient connects to and that gfServer is started/stopped on.
    private String host = "localhost";
    // Per-genome settings. The "sensitive" ports are used when a query is run with sensitive=true; the seqFiles
    // values are appended to the gfServer start command for the corresponding genome.
    private int humanSensitiveServerPort;
    private String humanSeqFiles;
    private int humanServerPort;
    private int mouseSensitiveServerPort;
    private String mouseSeqFiles;
    private int mouseServerPort;
    private int ratSensitiveServerPort;
    private String ratSeqFiles;
    private int ratServerPort;
    // Passed to gfClient as its sequence directory argument and used as the working directory of gfServer.
    private String seqDir = "/";
    // The gfServer process launched by startServer; stays null if this instance never launched one.
    private Process serverProcess;

    /**
     * Builds an instance whose ports, host, sequence locations and executable paths are taken from the global
     * configuration.
     *
     * @throws RuntimeException wrapping the {@link ConfigurationException} raised while reading the configuration
     */
    public ShellDelegatingBlat() {
        try {
            init();
        } catch (ConfigurationException configProblem) {
            throw new RuntimeException("Could not load configuration", configProblem);
        }
    }

    /**
     * Create a blat object for an explicitly specified server, bypassing the configuration file. Only the human
     * (non-sensitive) port is set; every other port keeps its default of zero.
     *
     * @param host            host the gfServer runs on; must not be null
     * @param humanServerPort port of the human gfServer; must be positive
     * @param seqDir          directory holding the genome sequence files; must not be null
     * @throws IllegalArgumentException if any argument violates the constraints above
     */
    public ShellDelegatingBlat(String host, int humanServerPort, String seqDir) {
        // One check per argument so the message names the value that was actually rejected.
        if (host == null) {
            throw new IllegalArgumentException("host must be non-null");
        }
        if (humanServerPort <= 0) {
            throw new IllegalArgumentException("humanServerPort must be positive, got " + humanServerPort);
        }
        if (seqDir == null) {
            throw new IllegalArgumentException("seqDir must be non-null");
        }
        this.host = host;
        this.humanServerPort = humanServerPort;
        this.seqDir = seqDir;
    }

    /**
     * Builds a description of the genome database that BLAT searches for the given taxon.
     *
     * @param taxon taxon whose genome is searched
     * @return a new {@link ExternalDatabase} of type {@link DatabaseType#SEQUENCE}, named after the genome in
     * lower case
     * @throws UnsupportedOperationException if no genome can be determined for the taxon
     */
    public static ExternalDatabase getSearchedGenome(Taxon taxon) {
        final String genomeName = inferBlatDatabase(taxon).toString().toLowerCase();

        final ExternalDatabase database = ExternalDatabase.Factory.newInstance();
        database.setType(DatabaseType.SEQUENCE);
        database.setName(genomeName);
        return database;
    }

    /**
     * Works out which genome to search for a taxon, matching on either the NCBI taxonomy id or the common name.
     *
     * @param taxon taxon to look up; must not be null
     * @return the matching genome
     * @throws UnsupportedOperationException if the taxon is not mouse, rat or human
     */
    private static BlattableGenome inferBlatDatabase(Taxon taxon) {
        assert taxon != null;

        if (ShellDelegatingBlat.isTaxon(taxon, 10090, "mouse")) {
            return BlattableGenome.MOUSE;
        }
        if (ShellDelegatingBlat.isTaxon(taxon, 10116, "rat")) {
            return BlattableGenome.RAT;
        }
        if (ShellDelegatingBlat.isTaxon(taxon, 9606, "human")) {
            return BlattableGenome.HUMAN;
        }
        throw new UnsupportedOperationException("Cannot determine which database to search for " + taxon);
    }

    /**
     * Null-safe test of whether a taxon has the given NCBI id or the given common name.
     */
    private static boolean isTaxon(Taxon taxon, int ncbiId, String commonName) {
        // Comparing from the known non-null side means a taxon lacking an id or a common name simply does not
        // match, rather than failing with a NullPointerException.
        return Integer.valueOf(ncbiId).equals(taxon.getNcbiId()) || commonName.equals(taxon.getCommonName());
    }

    /**
     * Runs a non-sensitive BLAT query for one sequence against the genome of the sequence's own taxon.
     *
     * @param b sequence to query with
     * @return the blat results
     * @throws IllegalArgumentException if the sequence has no taxon
     * @throws IOException              if there is an IO problem while running the query
     */
    @Override
    public Collection<BlatResult> blatQuery(BioSequence b) throws IOException {
        final Taxon sequenceTaxon = b.getTaxon();
        if (sequenceTaxon != null) {
            return blatQuery(b, sequenceTaxon, false);
        }
        throw new IllegalArgumentException("Cannot blat sequence unless taxon is given or inferrable");
    }

    /**
     * Runs a BLAT query for a single sequence. The sequence (with leading/trailing poly-A/T runs stripped) is
     * written to a temporary FASTA file, gfClient is run on it, and every result is tagged with the genome that
     * was searched. Both temporary files are removed before returning, whether or not the query succeeded.
     *
     * @param b         sequence to query with
     * @param taxon     taxon whose genome is searched
     * @param sensitive whether to use the port of the "sensitive" server for the taxon
     * @return the blat results
     * @throws IOException if there is an IO problem while writing the query or running the client
     */
    @Override
    public Collection<BlatResult> blatQuery(BioSequence b, Taxon taxon, boolean sensitive) throws IOException {
        assert seqDir != null;
        // write the sequence to a temporary file.
        String seqName = b.getName().replaceAll(" ", "_");
        // File.createTempFile rejects prefixes shorter than three characters, so pad short names.
        String tmpPrefix = StringUtils.rightPad(seqName, 3, '_');
        File querySequenceFile = File.createTempFile(tmpPrefix, ".fa");
        String outputPath = null;

        try {
            try (BufferedWriter out = new BufferedWriter(new FileWriter(querySequenceFile))) {
                String trimmed = SequenceManipulation.stripPolyAorT(b.getSequence(),
                        ShellDelegatingBlat.POLY_AT_THRESHOLD);
                out.write(">" + seqName + "\n" + trimmed);
                ShellDelegatingBlat.log.info("Wrote sequence to " + querySequenceFile.getPath());
            }
            outputPath = this.getTmpPslFilePath(tmpPrefix);

            Collection<BlatResult> results = this.gfClient(querySequenceFile, outputPath,
                    this.choosePortForQuery(taxon, sensitive));

            ExternalDatabase searchedDatabase = ShellDelegatingBlat.getSearchedGenome(taxon);
            for (BlatResult result : results) {
                result.setSearchedDatabase(searchedDatabase);
            }
            return results;
        } finally {
            // Clean up on every exit path so a failed query does not leave files behind.
            if (outputPath != null) {
                this.cleanUpTmpFiles(querySequenceFile, outputPath);
            } else if (!querySequenceFile.delete()) {
                ShellDelegatingBlat.log.warn("Could not clean up temporary files.");
            }
        }
    }

    /**
     * Runs one BLAT query for a whole collection of sequences and groups the results by query sequence. The
     * temporary query and output files are removed before returning, whether or not the query succeeded.
     *
     * @param sequences sequences to query with
     * @param sensitive whether to use the port of the "sensitive" server for the taxon
     * @param taxon     taxon whose genome is searched
     * @return map of query sequence to its blat results; sequences without results have no entry
     * @throws IllegalArgumentException if no sequences were written to the query file
     * @throws IllegalStateException    if no port could be determined
     * @throws IOException              if there is an IO problem while running the query
     */
    @Override
    public Map<BioSequence, Collection<BlatResult>> blatQuery(Collection<BioSequence> sequences, boolean sensitive,
            Taxon taxon) throws IOException {
        Map<BioSequence, Collection<BlatResult>> results = new HashMap<>();

        File querySequenceFile = File.createTempFile("sequences-for-blat", ".fa");
        String outputPath = null;

        try {
            int count = SequenceWriter.writeSequencesToFile(sequences, querySequenceFile);
            if (count == 0) {
                throw new IllegalArgumentException("No sequences!");
            }

            outputPath = this.getTmpPslFilePath("blat-output");

            Integer port = this.choosePortForQuery(taxon, sensitive);

            if (port == null) {
                throw new IllegalStateException("Could not locate port for BLAT with settings taxon=" + taxon
                        + ", sensitive=" + sensitive + ", check your configuration.");
            }

            Collection<BlatResult> rawResults = this.gfClient(querySequenceFile, outputPath, port);

            ShellDelegatingBlat.log.info("Got " + rawResults.size() + " raw blat results");

            ExternalDatabase searchedDatabase = ShellDelegatingBlat.getSearchedGenome(taxon);

            for (BlatResult blatResult : rawResults) {
                blatResult.setSearchedDatabase(searchedDatabase);

                BioSequence query = blatResult.getQuerySequence();

                if (!results.containsKey(query)) {
                    results.put(query, new HashSet<BlatResult>());
                }

                results.get(query).add(blatResult);
            }
            return results;
        } finally {
            // The results are already parsed into memory here, so neither file is needed any longer.
            EntityUtils.deleteFile(querySequenceFile);
            if (outputPath != null) {
                EntityUtils.deleteFile(new File(outputPath));
            }
        }
    }

    /**
     * Runs a non-sensitive BLAT query for a collection of sequences.
     *
     * @param sequences sequences to query with
     * @param taxon     taxon whose genome is searched
     * @return map of query sequence to its blat results
     * @throws IOException if there is an IO problem while running the query
     */
    @Override
    public Map<BioSequence, Collection<BlatResult>> blatQuery(Collection<BioSequence> sequences, Taxon taxon)
            throws IOException {
        final boolean sensitive = false;
        return blatQuery(sequences, sensitive, taxon);
    }

    /**
     * @return the score threshold given to the result parser
     */
    @Override
    public double getBlatScoreThreshold() {
        return blatScoreThreshold;
    }

    /**
     * @param blatScoreThreshold the score threshold to give to the result parser from now on
     */
    @Override
    public void setBlatScoreThreshold(double blatScoreThreshold) {
        this.blatScoreThreshold = blatScoreThreshold;
    }

    /**
     * @return the gfClient executable used to build the client command line
     */
    @Override
    public String getGfClientExe() {
        return gfClientExe;
    }

    /**
     * @return the gfServer executable used to build the server start/stop command lines
     */
    @Override
    public String getGfServerExe() {
        return gfServerExe;
    }

    /**
     * @return the host used for the gfServer
     */
    @Override
    public String getHost() {
        return host;
    }

    /**
     * @return the port of the (non-sensitive) human server
     */
    @Override
    public int getHumanServerPort() {
        return humanServerPort;
    }

    /**
     * @return the port of the (non-sensitive) mouse server
     */
    @Override
    public int getMouseServerPort() {
        return mouseServerPort;
    }

    /**
     * @return the port of the (non-sensitive) rat server
     */
    @Override
    public int getRatServerPort() {
        return ratServerPort;
    }

    /**
     * @return the sequence directory
     */
    @Override
    public String getSeqDir() {
        return seqDir;
    }

    /**
     * Looks up the sequence-file setting for a genome.
     *
     * @param genome genome of interest
     * @return the sequence files configured for the genome; the human setting is also the fallback for any value
     * that is not handled explicitly
     */
    @Override
    public String getSeqFiles(BlattableGenome genome) {
        switch (genome) {
        case MOUSE:
            return mouseSeqFiles;
        case RAT:
            return ratSeqFiles;
        case HUMAN:
        default:
            // human doubles as the fallback
            return humanSeqFiles;
        }
    }

    /**
     * Parses BLAT output in PSL format from a stream, applying the current score threshold.
     *
     * @param inputStream stream of PSL data; it is not closed by this method
     * @param taxon       taxon handed to the parser
     * @return the retained results
     * @throws IOException if the stream is empty or cannot be read
     */
    @Override
    public Collection<BlatResult> processPsl(InputStream inputStream, Taxon taxon) throws IOException {

        // InputStream.available() is only an estimate and may be 0 for a stream that still has data, so detect an
        // empty stream by actually reading one byte and pushing it back for the parser.
        PushbackInputStream pushback = new PushbackInputStream(inputStream);
        int firstByte = pushback.read();
        if (firstByte < 0) {
            throw new IOException("No data from the blat output file. Make sure the gfServer is running");
        }
        pushback.unread(firstByte);

        ShellDelegatingBlat.log.debug("Processing " + inputStream);
        BlatResultParser brp = new BlatResultParser();
        brp.setTaxon(taxon);
        brp.setScoreThreshold(this.blatScoreThreshold);
        brp.parse(pushback);
        ShellDelegatingBlat.log.info(brp.getNumSkipped() + " results were skipped as being below score= "
                + this.blatScoreThreshold + "; " + brp.getResults().size() + " results retained");
        return brp.getResults();
    }

    /**
     * Starts a gfServer for the given genome unless something is already accepting connections on the port. When
     * a listener is found, nothing is started and this instance is flagged so that {@link #stopServer(int)} does
     * nothing.
     *
     * @param genome genome whose sequence files the server should load
     * @param port   port to start the server on
     * @throws IOException      if the server process cannot be launched, or exits with a non-zero value right away
     * @throws RuntimeException if the configured host cannot be resolved
     */
    @Override
    public void startServer(BlattableGenome genome, int port) throws IOException {
        try (Socket ignored = new Socket(host, port)) {
            ShellDelegatingBlat.log.info("There is already a server on port " + port);
            this.doShutdown = false;
            return;
        } catch (UnknownHostException e) {
            throw new RuntimeException("Unknown host " + host, e);
        } catch (IOException connectFailed) {
            // Nothing accepted the connection: this is the expected case, so go on and launch a server.
        }

        String cmd = this.getGfServerExe() + " -canStop -stepSize=" + Blat.STEPSIZE + " start " + this.getHost()
                + " " + port + " " + this.getSeqFiles(genome);
        ShellDelegatingBlat.log.info("Starting gfServer with command " + cmd);
        this.serverProcess = Runtime.getRuntime().exec(cmd, null, new File(this.getSeqDir()));

        try {
            // Give the process a moment; if it has already exited with an error, the start failed.
            Thread.sleep(100);
            int exit = serverProcess.exitValue();
            if (exit != 0) {
                throw new IOException("Could not start server");
            }
        } catch (IllegalThreadStateException stillRunning) {
            // exitValue() throws while the process is alive, which is what a started server looks like.
            ShellDelegatingBlat.log.info("Server seems to have started");
        } catch (InterruptedException interrupted) {
            // Keep the interrupt visible to the caller instead of reporting a start that was never checked.
            Thread.currentThread().interrupt();
            ShellDelegatingBlat.log.warn("Interrupted before the gfServer start could be checked");
        }
    }

    /**
     * Asks the gfServer on the given port to stop by running the server executable with the {@code stop} command.
     * Does nothing if {@code startServer} found a server that was already running, or if this instance never
     * launched a server process. Failures are logged, not thrown.
     *
     * @param port port of the server to stop
     */
    @Override
    public void stopServer(int port) {
        if (!doShutdown) {
            return;
        }
        ShellDelegatingBlat.log.info("Shutting down gfServer");

        if (serverProcess == null) {
            return;
        }

        try {
            // this doesn't work unless the server was invoked with the option "-canStop"
            Process server = Runtime.getRuntime()
                    .exec(this.getGfServerExe() + " stop " + this.getHost() + " " + port);
            int exit = server.waitFor();
            ShellDelegatingBlat.log.info("Server on port " + port + " shut down with exit value " + exit);
        } catch (IOException e) {
            ShellDelegatingBlat.log.error(e, e);
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            ShellDelegatingBlat.log.error(e, e);
        }

    }

    /**
     * Picks the server port for a query from the taxon's genome and the requested sensitivity.
     *
     * @param taxon     taxon whose genome is searched
     * @param sensitive true to pick the "sensitive" server's port
     * @return the port; the human ports are used for any genome not handled explicitly
     * @throws UnsupportedOperationException if no genome can be determined for the taxon
     */
    private Integer choosePortForQuery(Taxon taxon, boolean sensitive) {
        final int port;
        switch (ShellDelegatingBlat.inferBlatDatabase(taxon)) {
        case MOUSE:
            port = sensitive ? mouseSensitiveServerPort : mouseServerPort;
            break;
        case RAT:
            port = sensitive ? ratSensitiveServerPort : ratServerPort;
            break;
        case HUMAN:
        default:
            port = sensitive ? humanSensitiveServerPort : humanServerPort;
            break;
        }
        return port;
    }

    /**
     * Deletes the temporary query file and the temporary output file, logging a warning if either cannot be
     * deleted.
     *
     * @param querySequenceFile query sequence file
     * @param outputPath        path of the output file
     */
    private void cleanUpTmpFiles(File querySequenceFile, String outputPath) {
        // Attempt both deletions unconditionally: a short-circuiting "||" would skip the output file whenever the
        // query file could not be removed.
        boolean queryDeleted = querySequenceFile.delete();
        boolean outputDeleted = new File(outputPath).delete();
        if (!queryDeleted || !outputDeleted) {
            ShellDelegatingBlat.log.warn("Could not clean up temporary files.");
        }
    }

    /**
     * Run a gfClient query, using a call to exec(). Waits for the client to finish, logging the size of the
     * output file every {@link #BLAT_UPDATE_INTERVAL_MS} milliseconds, then parses the output file.
     *
     * @param querySequenceFile query sequence file
     * @param outputPath        output path
     * @param portToUse         port of the gfServer to query
     * @return collection of blat results
     * @throws IOException      if the client cannot be launched, exits with a non-zero value, or its output
     *                          cannot be read
     * @throws RuntimeException wrapping an {@link InterruptedException} if the wait is interrupted
     */
    private Collection<BlatResult> execGfClient(File querySequenceFile, String outputPath, int portToUse)
            throws IOException {
        final String cmd = gfClientExe + " -nohead -minScore=" + ShellDelegatingBlat.MIN_SCORE + " " + host + " "
                + portToUse + " " + seqDir + " " + querySequenceFile.getAbsolutePath() + " " + outputPath;
        ShellDelegatingBlat.log.info(cmd);

        final Process run = Runtime.getRuntime().exec(cmd);

        // to ensure that we aren't left waiting for these streams
        GenericStreamConsumer gscErr = new GenericStreamConsumer(run.getErrorStream());
        GenericStreamConsumer gscIn = new GenericStreamConsumer(run.getInputStream());
        gscErr.start();
        gscIn.start();

        // Poll often so a finished client is noticed promptly; progress is still only logged at the long interval.
        final long pollIntervalMs = 250;

        StopWatch overallWatch = new StopWatch();
        overallWatch.start();

        int exitVal;
        try {
            long lastReport = System.currentTimeMillis();
            while (true) {
                try {
                    exitVal = run.exitValue();
                    break;
                } catch (IllegalThreadStateException stillRunning) {
                    // okay, still waiting.
                }
                Thread.sleep(pollIntervalMs);
                long now = System.currentTimeMillis();
                if (now - lastReport >= ShellDelegatingBlat.BLAT_UPDATE_INTERVAL_MS) {
                    this.outputFile(outputPath, overallWatch);
                    lastReport = now;
                }
            }
        } catch (InterruptedException e) {
            // Do not leave the client running unattended, and keep the interrupt visible to callers.
            run.destroy();
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }

        overallWatch.stop();
        String minutes = TimeUtil.getMinutesElapsed(overallWatch);
        ShellDelegatingBlat.log.info("Blat took a total of " + minutes + " minutes");
        ShellDelegatingBlat.log.debug("blat exit value=" + exitVal);

        if (exitVal != 0) {
            // A failed client leaves empty or partial output; parsing it would silently report too few results.
            throw new IOException("gfClient exited with value " + exitVal + " for command: " + cmd);
        }
        ShellDelegatingBlat.log.debug("GfClient Success");

        return this.processPsl(outputPath, null);
    }

    /**
     * Get a temporary file name. The file is created, empty, in the configured download directory with a
     * {@code .psl} suffix.
     *
     * @param base prefix for the file name; "blat-output" is used if blank, and a prefix shorter than three
     *             characters is padded with underscores
     * @return path of the newly created file
     * @throws IOException if there is an IO problem while accessing the file
     */
    private String getTmpPslFilePath(String base) throws IOException {
        File tmpDir = new File(Settings.getDownloadPath());
        // File.createTempFile throws IllegalArgumentException for prefixes under three characters.
        String prefix = StringUtils.isBlank(base) ? "blat-output" : StringUtils.rightPad(base, 3, '_');
        return File.createTempFile(prefix, ".psl", tmpDir).getPath();
    }

    /**
     * Runs the BLAT client for a query file. Every call is delegated to the exec()-based implementation.
     *
     * @param querySequenceFile query sequence file
     * @param outputPath        output path
     * @param portToUse         port of the gfServer to query
     * @return processed results.
     * @throws IOException if there is an IO problem while accessing the file
     */
    private Collection<BlatResult> gfClient(File querySequenceFile, String outputPath, int portToUse)
            throws IOException {
        // The native-library route (jniGfClientCall) is not wired in here.
        final Collection<BlatResult> parsed = execGfClient(querySequenceFile, outputPath, portToUse);
        return parsed;
    }

    /**
     * Native (JNI) entry point for running the BLAT client. Its only caller in this class, jniGfClientCall,
     * passes the host, the port as a decimal string, the sequence directory, the query file path and the output
     * file path, in that order.
     * NOTE(review): nothing in this class loads a native library, so a call presumably fails with
     * UnsatisfiedLinkError unless the library is loaded elsewhere -- confirm.
     */
    private native void GfClientCall(String h, String p, String dir, String input, String output);

    /**
     * Populates ports, host, sequence locations and executable paths from the global {@link Settings}.
     *
     * @throws ConfigurationException if no client executable is configured while running under Windows
     */
    private void init() throws ConfigurationException {
        log.debug("Reading global config");

        // regular server ports
        humanServerPort = Settings.getInt("gfClient.humanServerPort");
        mouseServerPort = Settings.getInt("gfClient.mouseServerPort");
        ratServerPort = Settings.getInt("gfClient.ratServerPort");

        // "sensitive" server ports
        humanSensitiveServerPort = Settings.getInt("gfClient.sensitive.humanServerPort");
        mouseSensitiveServerPort = Settings.getInt("gfClient.sensitive.mouseServerPort");
        ratSensitiveServerPort = Settings.getInt("gfClient.sensitive.ratServerPort");

        // a single host is shared by all genomes
        host = Settings.getString("gfClient.host");
        seqDir = Settings.getString("gfClient.seqDir");
        mouseSeqFiles = Settings.getString("gfClient.mouse.seqFiles");
        ratSeqFiles = Settings.getString("gfClient.rat.seqFiles");
        humanSeqFiles = Settings.getString("gfClient.human.seqFiles");
        gfClientExe = Settings.getString("gfClient.exe");
        gfServerExe = Settings.getString("gfServer.exe");

        if (gfServerExe == null) {
            // Starting a server this way is a leftover from earlier iterations and will not really work.
            log.warn("You will not be able to start the server: gfServer.exe is not set in config");
        }

        final boolean onWindows = os.startsWith("windows");
        if (onWindows && gfClientExe == null) {
            throw new ConfigurationException("BLAT client calls will not work under windows.");
        }
    }

    /**
     * Runs the BLAT client through the native {@code GfClientCall} on a background thread, logging the size of
     * the output file every {@link #BLAT_UPDATE_INTERVAL_MS} milliseconds while waiting. If the native call fails
     * with an {@link UnsatisfiedLinkError}, falls back on the exec()-based client.
     *
     * @param querySequenceFile query sequence file
     * @param outputPath        output path
     * @param portToUse         port of the gfServer to query
     * @return processed results.
     * @throws IOException      if the native call fails for another reason, or the output cannot be read
     * @throws RuntimeException wrapping an {@link InterruptedException} if the wait is interrupted
     */
    private Collection<BlatResult> jniGfClientCall(final File querySequenceFile, final String outputPath,
            final int portToUse) throws IOException {
        ShellDelegatingBlat.log.debug("Starting blat run");

        FutureTask<Boolean> blatThread = new FutureTask<>(new Callable<Boolean>() {
            @Override
            public Boolean call() {
                ShellDelegatingBlat.this.GfClientCall(host, Integer.toString(portToUse), seqDir,
                        querySequenceFile.getPath(), outputPath);
                return true;
            }
        });

        ExecutorService executor = Executors.newSingleThreadExecutor();
        executor.execute(blatThread);
        executor.shutdown();

        StopWatch overallWatch = new StopWatch();
        overallWatch.start();

        try {
            // get() with a timeout returns as soon as the task finishes and otherwise times out once per update
            // interval, which is when progress is logged.
            while (true) {
                try {
                    blatThread.get(ShellDelegatingBlat.BLAT_UPDATE_INTERVAL_MS,
                            java.util.concurrent.TimeUnit.MILLISECONDS);
                    break;
                } catch (java.util.concurrent.TimeoutException stillRunning) {
                    this.outputFile(outputPath, overallWatch);
                }
            }
        } catch (InterruptedException ie) {
            blatThread.cancel(true);
            Thread.currentThread().interrupt();
            throw new RuntimeException(ie);
        } catch (java.util.concurrent.ExecutionException e) {
            // Anything thrown inside the task, errors included, is captured by the FutureTask and only surfaces
            // here; a catch block around the submission would never see it.
            Throwable cause = e.getCause();
            if (cause instanceof UnsatisfiedLinkError) {
                ShellDelegatingBlat.log.error(cause, cause);
                ShellDelegatingBlat.log.info("Falling back on exec()");
                // execGfClient already parses the output file, so its result is returned as is.
                return this.execGfClient(querySequenceFile, outputPath, portToUse);
            }
            throw new IOException("Native blat call failed", cause);
        }

        overallWatch.stop();
        String minutes = TimeUtil.getMinutesElapsed(overallWatch);
        ShellDelegatingBlat.log.info("Blat took a total of " + minutes + " minutes");

        return this.processPsl(outputPath, null);
    }

    /**
     * Logs how large the BLAT output file currently is, in kilobytes, together with the minutes elapsed so far.
     *
     * @param outputPath   path of the output file being written
     * @param overallWatch stop watch timing the run
     */
    private synchronized void outputFile(final String outputPath, StopWatch overallWatch) {
        final NumberFormat kbFormat = new DecimalFormat();
        kbFormat.setMaximumFractionDigits(2);

        final double kilobytes = new File(outputPath).length() / 1024.0;
        final String elapsed = TimeUtil.getMinutesElapsed(overallWatch);

        log.info("BLAT output so far: " + kbFormat.format(kilobytes) + " kb (" + elapsed + " minutes elapsed)");
    }

    /**
     * Parses a BLAT output file, applying the current score threshold.
     *
     * @param filePath path to the Blat output file in psl format
     * @param taxon    taxon (optional, can be null)
     * @return processed results.
     * @throws IOException if there is an IO problem while reading the file
     */
    private Collection<BlatResult> processPsl(String filePath, Taxon taxon) throws IOException {
        log.debug("Processing " + filePath);

        final BlatResultParser parser = new BlatResultParser();
        parser.setTaxon(taxon);
        parser.setScoreThreshold(blatScoreThreshold);
        parser.parse(filePath);

        final Collection<BlatResult> parsed = parser.getResults();
        return parsed;
    }

    /**
     * Genomes that can be searched. The lower-cased constant name is used as the name of the searched database.
     */
    public enum BlattableGenome {
        /** Human genome. */
        HUMAN,
        /** Mouse genome. */
        MOUSE,
        /** Rat genome. */
        RAT
    }

}