act.installer.reachablesexplorer.WordCloudGenerator.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.reachablesexplorer.WordCloudGenerator.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.reachablesexplorer;

import act.server.DBIterator;
import act.server.MongoDB;
import com.act.biointerpretation.l2expansion.L2InchiCorpus;
import com.act.jobs.FileChecker;
import com.act.utils.CLIUtil;
import com.act.utils.ProcessRunner;
import com.act.workflow.tool_manager.workflow.workflow_mixins.mongo.ChemicalKeywords;
import com.act.workflow.tool_manager.workflow.workflow_mixins.mongo.MongoKeywords;
import com.mongodb.BasicDBObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Generates word clouds, using R, for any InChI that has a Bing reference.
 *
 * <p>It requires an R script (see {@code RSCRIPT_LOCATION}) that is invoked as a child process with the InChI,
 * the output file path, and the database host, port and name as arguments, and that is expected to write the
 * word cloud image to the given output file.
 *
 * <p>The class can also be run standalone through {@link #main(String[])} to render word clouds for a list of
 * InChIs, as a separate process from the Loader.
 */
public class WordCloudGenerator {

    private static final String RSCRIPT_EXE_PATH = "/usr/bin/Rscript"; // TODO: find this using `env` instead.
    private static final String RSCRIPT_LOCATION = "src/main/r/RWordCloudGenerator.R";
    // Formatter logger: log messages use printf-style (%s) placeholders.
    private static final Logger LOGGER = LogManager.getFormatterLogger(WordCloudGenerator.class);
    private static final String PNG_EXTENSION = ".png";
    private static final long CHILD_PROCESS_TIMEOUT_IN_SECONDS = 60; // Thomas thinks this is plenty of time for a cloud.

    private static final String OPTION_DB_HOST = "H";
    private static final String OPTION_DB_PORT = "p";
    private static final String OPTION_INSTALLER_SOURCE_DB = "i";
    private static final String OPTION_RENDERING_CACHE = "e";
    private static final String OPTION_INPUT_INCHIS = "l";
    private static final String OPTION_RSCRIPT_EXE_PATH = "r";

    private static final String DEFAULT_ASSETS_LOCATION = "data/reachables-explorer-rendering-cache";

    // Default host. If running on a laptop, please set a SSH bridge to access speakeasy
    private static final String DEFAULT_HOST = "localhost";
    private static final String DEFAULT_PORT = "27017";
    private static final String DEFAULT_CHEMICALS_DATABASE = "SHOULD_COME_FROM_CMDLINE"; // "jarvis_2016-12-09";

    public static final String HELP_MESSAGE = StringUtils.join(
            new String[] { "This class allows WordCloud generation as a separate process from the Loader" }, " ");

    /**
     * Command line options understood by {@link #main(String[])}.
     *
     * <p>The source database name and the InChI list are mandatory. Every other option falls back to the
     * default quoted in its description; in particular the Rscript executable path is optional, since a
     * default is both documented in its description and applied by {@code main}.
     */
    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<>(Arrays.asList(
            Option.builder(OPTION_DB_HOST).argName("DB host")
                    .desc(String.format("The database host to which to connect (default: %s)", DEFAULT_HOST))
                    .hasArg().longOpt("db-host"),
            Option.builder(OPTION_DB_PORT).argName("DB port")
                    .desc(String.format("The port on which to connect to the database (default: %s)", DEFAULT_PORT))
                    .hasArg().longOpt("db-port"),
            Option.builder(OPTION_INSTALLER_SOURCE_DB).argName("DB name")
                    .desc(String.format(
                            "The name of the database from which to fetch chemicals and reactions (default: %s)",
                            DEFAULT_CHEMICALS_DATABASE))
                    .hasArg().longOpt("source-db-name").required(),
            Option.builder(OPTION_RENDERING_CACHE).argName("path to cache")
                    .desc(String.format(
                            "A directory in which to cache rendered images for reachables documents (default: %s)",
                            DEFAULT_ASSETS_LOCATION))
                    .hasArg().longOpt("cache-dir"),
            Option.builder(OPTION_INPUT_INCHIS).argName("path to inchis list")
                    .desc("A list of input inchis for which to compute word clouds").hasArg().required()
                    .longOpt("inchis-path"),
            Option.builder(OPTION_RSCRIPT_EXE_PATH).argName("rscript exe path").desc(String.format(
                    "The path to the Rscript exe for running R scripts. Default is %s. Can be determined by running \"which Rscript\"",
                    RSCRIPT_EXE_PATH)).hasArg().longOpt("r-location")));

    private final File rScript;
    private final String rScriptExePath;

    private final String host;
    private final Integer port;
    private final String database;
    private final MongoDB bingDb;
    private final Set<String> inchisSet;
    private final File assetLocation;

    /**
     * Creates a generator bound to a chemicals database and a rendering cache directory.
     *
     * <p>Local preconditions (the R script and the cache directory) are checked before the database
     * connection is opened, so that a misconfigured file system does not leave an unused connection behind.
     * On success, the set of InChIs having Bing usage terms is loaded eagerly via {@link #getBingInchis()}.
     *
     * @param host           database host
     * @param port           database port
     * @param database       name of the database holding the chemicals
     * @param assetLocation  path of an existing directory in which word cloud images are stored
     * @param rScriptExePath path of the Rscript executable used to run the R script
     * @throws RuntimeException if the R script cannot be verified or if {@code assetLocation} is not an
     *                          existing directory
     */
    public WordCloudGenerator(String host, Integer port, String database, String assetLocation,
            String rScriptExePath) {
        this.host = host;
        this.port = port;
        this.database = database;
        this.rScriptExePath = rScriptExePath;
        this.rScript = new File(RSCRIPT_LOCATION);
        this.assetLocation = new File(assetLocation);

        try {
            FileChecker.verifyInputFile(this.rScript);
        } catch (IOException e) {
            String msg = String.format("Failed to locate R script at %s", this.rScript.getAbsolutePath());
            LOGGER.error(msg);
            // Keep the underlying I/O failure as the cause so the reason is not lost.
            throw new RuntimeException(msg, e);
        }
        // isDirectory() is false for a missing path, so this also covers the "does not exist" case.
        if (!this.assetLocation.isDirectory()) {
            String msg = String.format("Failed to locate asset location directory at %s",
                    this.assetLocation.getAbsolutePath());
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        }

        this.bingDb = new MongoDB(host, port, database);
        this.inchisSet = getBingInchis();
    }

    /**
     * Creates a generator using the default Rscript executable path.
     *
     * @param host          database host
     * @param port          database port
     * @param database      name of the database holding the chemicals
     * @param assetLocation path of an existing directory in which word cloud images are stored
     */
    public WordCloudGenerator(String host, Integer port, String database, String assetLocation) {
        this(host, port, database, assetLocation, RSCRIPT_EXE_PATH);
    }

    /**
     * Creates a generator using the default cache directory and the default Rscript executable path.
     *
     * @param host     database host
     * @param port     database port
     * @param database name of the database holding the chemicals
     */
    public WordCloudGenerator(String host, Integer port, String database) {
        this(host, port, database, DEFAULT_ASSETS_LOCATION, RSCRIPT_EXE_PATH);
    }

    /**
     * Queries the chemicals collection for every chemical whose Bing cross-reference has at least a first
     * usage term, and collects their InChIs.
     *
     * @return the non-null InChIs of the matching chemicals; never {@code null}, possibly empty
     */
    public Set<String> getBingInchis() {
        String inchiField = ChemicalKeywords.INCHI$.MODULE$.value();

        // Match documents in which element 0 of the usage terms exists, and fetch only the InChI field.
        BasicDBObject query = new BasicDBObject("xref.BING.metadata.usage_terms.0",
                new BasicDBObject(MongoKeywords.EXISTS$.MODULE$.value(), true));
        BasicDBObject keys = new BasicDBObject(inchiField, true);

        Set<String> bingSet = new HashSet<>();
        DBIterator chemicals = bingDb.getIteratorOverChemicals(query, keys);
        while (chemicals.hasNext()) {
            BasicDBObject chemical = (BasicDBObject) chemicals.next();
            String inchi = chemical.getString(inchiField);
            if (inchi != null) {
                bingSet.add(inchi);
            }
        }
        return bingSet;
    }

    /**
     * Computes the location of the word cloud image for an InChI, without creating it.
     *
     * <p>The file name is {@code wordcloud-<md5 hex of the InChI>.png}, inside the asset directory.
     *
     * @param inchi the InChI to locate a word cloud for
     * @return the file at which the word cloud is, or would be, stored
     */
    public File getWordcloudFile(String inchi) {
        String wordcloudFilename = "wordcloud-" + DigestUtils.md5Hex(inchi) + PNG_EXTENSION;
        return Paths.get(this.assetLocation.getPath(), wordcloudFilename).toFile();
    }

    /**
     * Renders the word cloud for an InChI by running the R script, unless it is already cached.
     *
     * <p>Nothing is run when the image file already exists, or when the InChI is not part of the Bing set
     * loaded at construction time. In the latter case the returned file may not exist on disk.
     *
     * @param inchi the InChI to render a word cloud for
     * @return the word cloud file location, or {@code null} if the rendering failed or was interrupted
     */
    public File generateWordCloud(String inchi) {

        // TODO: improve wordcloud generation. Currently, each instance opens a mongo connection on the R side.
        // By doing data manipulation in Java and utilizing Rengine, we could make this much better
        // Wordclouds could be generated ahead of time this way, using the inchi corpus

        File wordcloud = getWordcloudFile(inchi);

        if (Files.exists(wordcloud.toPath()) || !inchisSet.contains(inchi)) {
            return wordcloud;
        }

        try {
            ProcessRunner.runProcess(rScriptExePath,
                    // TODO: remove hardcoded database from R script
                    Arrays.asList(rScript.getAbsolutePath(), inchi, wordcloud.getAbsolutePath(), host,
                            port.toString(), database),
                    CHILD_PROCESS_TIMEOUT_IN_SECONDS);
            // The script is expected to have produced the file; treat a missing output as a failure.
            FileChecker.verifyInputFile(wordcloud);
        } catch (IOException e) {
            LOGGER.error(String.format("Unable to generate wordcloud for %s at location %s", inchi,
                    wordcloud.toPath().toString()), e);
            return null;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so that callers can observe the interruption.
            Thread.currentThread().interrupt();
            LOGGER.error(String.format("Child process was interrupted: %s", e.getMessage()), e);
            return null;
        }
        return wordcloud;
    }

    /**
     * Command line entry point: loads a list of InChIs from a file and generates a word cloud for each.
     *
     * @param args command line arguments, see {@link #OPTION_BUILDERS}
     */
    public static void main(String[] args) {

        CLIUtil cliUtil = new CLIUtil(WordCloudGenerator.class, HELP_MESSAGE, OPTION_BUILDERS);
        CommandLine cl = cliUtil.parseCommandLine(args);

        // TODO add possibility to run wordcloud generation as a post processing step, from a loaded reachables database
        File inchisFile = new File(cl.getOptionValue(OPTION_INPUT_INCHIS));
        L2InchiCorpus inchiCorpus = new L2InchiCorpus();
        try {
            inchiCorpus.loadCorpus(inchisFile);
        } catch (IOException e) {
            cliUtil.failWithMessage("Could not load inchi corpus from input file %s", inchisFile.getAbsolutePath());
            // NOTE(review): failWithMessage presumably terminates the JVM; return regardless so that no work
            // is attempted on a corpus that failed to load.
            return;
        }

        String portValue = cl.getOptionValue(OPTION_DB_PORT, DEFAULT_PORT);
        int port;
        try {
            port = Integer.parseInt(portValue);
        } catch (NumberFormatException e) {
            cliUtil.failWithMessage("Invalid database port '%s': expected an integer", portValue);
            return;
        }

        WordCloudGenerator wordCloudGenerator = new WordCloudGenerator(
                cl.getOptionValue(OPTION_DB_HOST, DEFAULT_HOST),
                port,
                cl.getOptionValue(OPTION_INSTALLER_SOURCE_DB, DEFAULT_CHEMICALS_DATABASE),
                cl.getOptionValue(OPTION_RENDERING_CACHE, DEFAULT_ASSETS_LOCATION),
                cl.getOptionValue(OPTION_RSCRIPT_EXE_PATH, RSCRIPT_EXE_PATH));
        inchiCorpus.getInchiList().forEach(wordCloudGenerator::generateWordCloud);
    }
}