act.installer.bing.BingSearchRanker.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.bing.BingSearchRanker.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.bing;

import act.server.MongoDB;
import com.act.utils.TSVWriter;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.utils.URIBuilder;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * This module provides a command line interface to update and export Bing Search results and ranks from the Installer
 * database. It supports two types of input: a raw list of InChIs and a TSV file with an InChI header.
 * Usage (raw input):
 *       sbt 'runMain act.installer.bing.BingSearchRanker
 *                -i MNT_SHARED_DATA/Thomas/bing_ranker/l2chemicalsProductFiltered.txt
 *                -o MNT_SHARED_DATA/Thomas/bing_ranker/l2chemicalsProductFiltered_BingSearchRanker_results.tsv'
 * Usage (TSV input):
 *       sbt 'runMain act.installer.bing.BingSearchRanker
 *                -i MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_20160617T1723.txt.hits
 *                -o MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_BingSearchRanker_results.tsv
 *                -t'
 * Usage (TSV input & all extra options, including force update):
 *       sbt 'runMain act.installer.bing.BingSearchRanker
 *                -i MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_20160617T1723.txt.hits
 *                -o MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_BingSearchRanker_results.tsv
 *                -t -c -w -u -f'
 */

public class BingSearchRanker {

    // Formatter logger: log messages in this class use printf-style placeholders (%s, %d).
    private static final Logger LOGGER = LogManager.getFormatterLogger(BingSearchRanker.class);
    // Value written to optional TSV columns when the corresponding cross-reference is absent.
    private static final String EMPTY_STRING = "";

    // Default configuration for the Installer database
    public static final String DEFAULT_HOST = "localhost";
    public static final int DEFAULT_PORT = 27017;
    public static final String DEFAULT_INSTALLER_DATABASE = "actv01";

    // Configuration for usage explorer UI
    // (host and port used to build the USAGE_EXPLORER_URL column, see getUsageExplorerURLStringFromInchi)
    public static final String HOST_USAGE_EXPLORER = "usage-explorer";
    public static final int PORT_USAGE_EXPLORER = 8080;

    // Define options for CLI
    // (short option names; the matching long names and descriptions are declared in OPTION_BUILDERS)
    public static final String OPTION_INPUT_FILEPATH = "i";
    public static final String OPTION_OUTPUT_FILEPATH = "o";
    public static final String OPTION_TSV_INPUT = "t";
    public static final String OPTION_FORCE_UPDATE = "f";
    public static final String OPTION_INCLUDE_CHEBI_APPLICATIONS = "c";
    public static final String OPTION_INCLUDE_WIKIPEDIA_URL = "w";
    public static final String OPTION_INCLUDE_USAGE_EXPLORER_URL = "u";

    // Other static variables
    // DEFAULT_COUNT: search-result count reported for a root molecule that has no database object.
    public static final Integer DEFAULT_COUNT = 0;
    // INCHI_CHUNK_SIZE: maximum number of InChIs handled per chunk when exporting results to TSV.
    private static final Integer INCHI_CHUNK_SIZE = 10000;

    // Description printed at the top of the command line help (sentences are joined with single spaces).
    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class adds Bing Search results for a list of molecules in the Installer (actv01) database",
            "and exports the results in a TSV format for easy import in Google spreadsheets.",
            "It supports two different input formats: raw list of InChI strings and TSV file with an InChI column.",
            "Default input format (with only options -i and -o) is raw list of InChI." }, " ");

    /**
     * Builders for every command line option understood by {@link #main(String[])}.
     * The input (-i) and output (-o) paths are the only required options and the only ones taking an argument;
     * all other options are plain flags.
     */
    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_INPUT_FILEPATH).argName("INPUT_FILEPATH")
                    .desc("The full path to the input file").hasArg().required().longOpt("input_filepath")
                    .type(String.class));
            add(Option.builder(OPTION_OUTPUT_FILEPATH).argName("OUTPUT_PATH")
                    .desc("The full path where to write the output.").hasArg().required().longOpt("output_path")
                    .type(String.class));
            add(Option.builder(OPTION_TSV_INPUT).argName("TSV_INPUT")
                    .desc("Whether the input is a TSV file with an InChI column.").longOpt("tsv")
                    .type(boolean.class));
            // Help text typo fixed: "exisitng" -> "existing".
            add(Option.builder(OPTION_FORCE_UPDATE).argName("FORCE_UPDATE")
                    .desc("Whether existing BING cross-references in the Installer database should be overwritten.")
                    .longOpt("force_update").type(boolean.class));
            add(Option.builder(OPTION_INCLUDE_CHEBI_APPLICATIONS).argName("INCLUDE_CHEBI_APPLICATIONS")
                    .desc("Whether to include (when applicable) ChEBI applications in the output file.")
                    .longOpt("include_chebi").type(boolean.class));
            add(Option.builder(OPTION_INCLUDE_WIKIPEDIA_URL).argName("INCLUDE_WIKIPEDIA_URL")
                    .desc("Whether to include (when applicable) the Wikipedia URL in the output file.")
                    .longOpt("include_wikipedia").type(boolean.class));
            add(Option.builder(OPTION_INCLUDE_USAGE_EXPLORER_URL).argName("INCLUDE_USAGE_EXPLORER_URL")
                    .desc("Whether to include (when applicable) the usage explorer UI URL in the output file.")
                    .longOpt("include_usage").type(boolean.class));
            add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help"));
        }
    };

    // Shared formatter used by main() to print the usage/help text.
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        // Widen the help output to 100 columns.
        HELP_FORMATTER.setWidth(100);
    }

    /**
     * Names of the per-molecule TSV columns. The constant names are used verbatim as column headers.
     * Note that the column order in the output is decided where the header list is built, not by the
     * declaration order below.
     */
    public enum BingRankerHeaderFields {
        INCHI,
        BEST_NAME,
        TOTAL_COUNT_SEARCH_RESULTS,
        ALL_NAMES,
        WIKIPEDIA_URL,
        CHEBI_MAIN_APPLICATIONS,
        CHEBI_DIRECT_APPLICATIONS,
        USAGE_EXPLORER_URL
    }

    /**
     * Names of the additional TSV columns written by the conditional reachability export
     * (depth of the molecule and information about its root molecule).
     * The constant names are used verbatim as column headers.
     */
    public enum ConditionalReachabilityHeaderFields {
        DEPTH,
        ROOT_MOLECULE_BEST_NAME,
        ROOT_INCHI,
        TOTAL_COUNT_SEARCH_RESULTS_ROOT
    }

    // Instance variables (all assigned exactly once, in the four-argument constructor)
    private final MongoDB mongoDB;
    private final BingSearcher bingSearcher;
    private final Boolean includeChebiApplications;
    private final Boolean includeWikipediaUrl;
    private final Boolean includeUsageExplorerUrl;

    /**
     * Creates a ranker connected to the default Installer database, with every optional output column
     * disabled and without forcing an update of existing Bing cross-references.
     */
    public BingSearchRanker() {
        this(false, false, false, false);
    }

    /**
     * Creates a ranker connected to the default Installer database
     * ({@link #DEFAULT_HOST}:{@link #DEFAULT_PORT}, database {@link #DEFAULT_INSTALLER_DATABASE}).
     * @param includeChebiApplications whether the ChEBI application columns are exported (null is treated as false)
     * @param includeWikipediaUrl whether the Wikipedia URL column is exported (null is treated as false)
     * @param includeUsageExplorerUrl whether the Usage Explorer URL column is exported (null is treated as false)
     * @param forceUpdate passed through unchanged to the {@link BingSearcher}
     */
    public BingSearchRanker(Boolean includeChebiApplications, Boolean includeWikipediaUrl,
            Boolean includeUsageExplorerUrl, Boolean forceUpdate) {
        this.mongoDB = new MongoDB(DEFAULT_HOST, DEFAULT_PORT, DEFAULT_INSTALLER_DATABASE);
        // NOTE(review): the meaning of the third BingSearcher argument (false) is not visible here - confirm.
        this.bingSearcher = new BingSearcher(this.mongoDB, forceUpdate, false);
        // Normalize the flags so that a null Boolean cannot trigger a NullPointerException later,
        // when the flags are unboxed in `if (flag)` conditions.
        this.includeChebiApplications = Boolean.TRUE.equals(includeChebiApplications);
        this.includeWikipediaUrl = Boolean.TRUE.equals(includeWikipediaUrl);
        this.includeUsageExplorerUrl = Boolean.TRUE.equals(includeUsageExplorerUrl);
    }

    /**
     * Command line entry point. Reads the input molecule corpus (raw InChIs or TSV), updates the Bing Search
     * results for these molecules in the Installer database, then exports the results to a TSV file.
     * @param args command line arguments, see {@link #OPTION_BUILDERS}
     * @throws Exception if reading the corpus, updating the database or writing the output fails
     */
    public static void main(final String[] args) throws Exception {

        // Parse the command line options
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        // The input and output options are required, so the parser rejects a lone "-h"/"--help" with a
        // "missing required options" error. Detect the help request before parsing so that asking for
        // help prints the usage and terminates normally.
        for (String arg : args) {
            if ("-h".equals(arg) || "--help".equals(arg)) {
                HELP_FORMATTER.printHelp(BingSearchRanker.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
                return;
            }
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(BingSearchRanker.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        // Still honor the help flag when it reaches the parser in another accepted form.
        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(BingSearchRanker.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            return;
        }

        String inputPath = cl.getOptionValue(OPTION_INPUT_FILEPATH);
        String outputPath = cl.getOptionValue(OPTION_OUTPUT_FILEPATH);
        boolean isTSVInput = cl.hasOption(OPTION_TSV_INPUT);

        // Read the molecule corpus
        LOGGER.info("Reading the input molecule corpus");
        MoleculeCorpus moleculeCorpus = new MoleculeCorpus();
        if (isTSVInput) {
            LOGGER.info("Input format is TSV");
            moleculeCorpus.buildCorpusFromTSVFile(inputPath);
        } else {
            LOGGER.info("Input format is raw InChIs");
            moleculeCorpus.buildCorpusFromRawInchis(inputPath);
        }

        // Get the inchi set
        Set<String> inchis = moleculeCorpus.getMolecules();
        LOGGER.info("Found %d molecules in the input corpus", inchis.size());

        // Update the Bing Search results in the Installer database
        BingSearchRanker bingSearchRanker = new BingSearchRanker(cl.hasOption(OPTION_INCLUDE_CHEBI_APPLICATIONS),
                cl.hasOption(OPTION_INCLUDE_WIKIPEDIA_URL), cl.hasOption(OPTION_INCLUDE_USAGE_EXPLORER_URL),
                cl.hasOption(OPTION_FORCE_UPDATE));
        LOGGER.info("Updating the Bing Search results in the Installer database");
        bingSearchRanker.addBingSearchResults(inchis);
        LOGGER.info("Done updating the Bing Search results");

        // Write the results in a TSV file
        LOGGER.info("Writing results to output file");
        bingSearchRanker.writeBingSearchRanksAsTSV(inchis, outputPath);
        LOGGER.info("Bing Search ranker is done. \"I'm tired, boss.\"");
    }

    /**
     * Builds the Usage Explorer link exported in the TSV output for a given molecule.
     * The link targets http://{@link #HOST_USAGE_EXPLORER}:{@link #PORT_USAGE_EXPLORER} and carries the
     * InChI in the "inchi" query parameter.
     * @param inchi the InChI string representation of the molecule
     * @return the Usage Explorer URL as a String, or null if the URI could not be built (the error is logged)
     */
    public String getUsageExplorerURLStringFromInchi(String inchi) {
        URIBuilder usageExplorerUriBuilder = new URIBuilder()
                .setScheme("http")
                .setHost(HOST_USAGE_EXPLORER)
                .setPort(PORT_USAGE_EXPLORER)
                .setParameter("inchi", inchi);

        String usageExplorerUrl = null;
        try {
            usageExplorerUrl = usageExplorerUriBuilder.build().toString();
        } catch (URISyntaxException e) {
            LOGGER.error("An error occurred when trying to build the Usage Explorer URI", e);
        }
        return usageExplorerUrl;
    }

    /**
     * Adds the Bing Search results to the Installer database for a set of InChI strings.
     * This is a thin wrapper that delegates to {@code bingSearcher.addBingSearchResultsForInchiSet}.
     * @param inchis set of InChI string representations
     * @throws IOException declared by this method; presumably raised by the underlying searcher - confirm
     */
    public void addBingSearchResults(Set<String> inchis) throws IOException {
        bingSearcher.addBingSearchResultsForInchiSet(inchis);
    }

    /**
     * Appends the per-molecule column headers to a header list: the four mandatory columns first
     * (InChI, best name, total search result count, all names), then the optional ChEBI, Wikipedia and
     * Usage Explorer columns, each only when the matching option is enabled on this ranker.
     * @param headerFields List of headers to be populated
     */
    private void addChemicalHeaders(List<String> headerFields) {
        // Columns that are always exported, in output order.
        BingRankerHeaderFields[] mandatoryFields = {
                BingRankerHeaderFields.INCHI,
                BingRankerHeaderFields.BEST_NAME,
                BingRankerHeaderFields.TOTAL_COUNT_SEARCH_RESULTS,
                BingRankerHeaderFields.ALL_NAMES
        };
        for (BingRankerHeaderFields mandatoryField : mandatoryFields) {
            headerFields.add(mandatoryField.name());
        }

        // Optional columns, appended in a fixed order.
        if (includeChebiApplications) {
            String mainApplicationsHeader = BingRankerHeaderFields.CHEBI_MAIN_APPLICATIONS.name();
            String directApplicationsHeader = BingRankerHeaderFields.CHEBI_DIRECT_APPLICATIONS.name();
            headerFields.add(mainApplicationsHeader);
            headerFields.add(directApplicationsHeader);
        }
        if (includeWikipediaUrl) {
            String wikipediaHeader = BingRankerHeaderFields.WIKIPEDIA_URL.name();
            headerFields.add(wikipediaHeader);
        }
        if (includeUsageExplorerUrl) {
            String usageExplorerHeader = BingRankerHeaderFields.USAGE_EXPLORER_URL.name();
            headerFields.add(usageExplorerHeader);
        }
    }

    /**
     * Updates a TSV row (actually a Map from header to value) with InChI, names and usage information.
     * The InChI, best name, total search result count and names are always written. The ChEBI, Wikipedia and
     * Usage Explorer columns are written only when the matching option is enabled; when enabled they are
     * always set, falling back to an empty string if the cross-reference (or one of its fields) is missing.
     * The object is expected to carry an "InChI" field and an "xref.BING.metadata" sub-document; a
     * NullPointerException is thrown otherwise.
     * @param o BasicDBObject containing InChI, and xrefs.{BING, CHEBI, WIKIPEDIA} info
     * @param row TSV row (map from TSV header to value) to be updated
     */
    private void updateRowWithChemicalInformation(BasicDBObject o, Map<String, String> row) {
        String inchi = o.get("InChI").toString();
        row.put(BingRankerHeaderFields.INCHI.name(), inchi);
        BasicDBObject xref = (BasicDBObject) o.get("xref");
        BasicDBObject bing = (BasicDBObject) xref.get("BING");
        BasicDBObject bingMetadata = (BasicDBObject) bing.get("metadata");
        row.put(BingRankerHeaderFields.BEST_NAME.name(), bingMetadata.get("best_name").toString());
        row.put(BingRankerHeaderFields.TOTAL_COUNT_SEARCH_RESULTS.name(),
                bingMetadata.get("total_count_search_results").toString());
        NamesOfMolecule namesOfMolecule = mongoDB.getNamesFromBasicDBObject(o);
        Set<String> names = namesOfMolecule.getAllNames();
        row.put(BingRankerHeaderFields.ALL_NAMES.name(), names.toString());

        if (includeChebiApplications) {
            // Start from empty values so that both ChEBI columns are set on every path, including the
            // "no ChEBI cross-reference" one which previously left them out of the row.
            String mainApplications = EMPTY_STRING;
            String directApplications = EMPTY_STRING;
            BasicDBObject chebi = (BasicDBObject) xref.get("CHEBI");
            if (chebi == null) {
                LOGGER.debug("No ChEBI cross-reference found for %s", inchi);
            } else {
                BasicDBObject chebiMetadata = (BasicDBObject) chebi.get("metadata");
                BasicDBObject chebiApplications =
                        chebiMetadata == null ? null : (BasicDBObject) chebiMetadata.get("applications");
                if (chebiApplications == null) {
                    LOGGER.debug("ChEBI cross-reference found, but no ChEBI applications for %s", inchi);
                } else {
                    mainApplications = chemicalFieldToString(chebiApplications.get("main_applications"));
                    directApplications = chemicalFieldToString(chebiApplications.get("direct_applications"));
                }
            }
            row.put(BingRankerHeaderFields.CHEBI_MAIN_APPLICATIONS.name(), mainApplications);
            row.put(BingRankerHeaderFields.CHEBI_DIRECT_APPLICATIONS.name(), directApplications);
        }

        if (includeWikipediaUrl) {
            BasicDBObject wikipedia = (BasicDBObject) xref.get("WIKIPEDIA");
            if (wikipedia != null) {
                row.put(BingRankerHeaderFields.WIKIPEDIA_URL.name(), chemicalFieldToString(wikipedia.get("dbid")));
            } else {
                LOGGER.debug("No Wikipedia cross-reference found for %s", inchi);
                row.put(BingRankerHeaderFields.WIKIPEDIA_URL.name(), EMPTY_STRING);
            }
        }

        if (includeUsageExplorerUrl) {
            row.put(BingRankerHeaderFields.USAGE_EXPLORER_URL.name(), getUsageExplorerURLStringFromInchi(inchi));
        }
    }

    /**
     * Converts an optional database field to its String form for TSV export.
     * @param value the field value, possibly null
     * @return {@code value.toString()}, or an empty string when the value is null
     */
    private static String chemicalFieldToString(Object value) {
        return value == null ? EMPTY_STRING : value.toString();
    }

    /**
     * Splits a set of Strings into consecutive chunks holding at most `chunkSize` elements each.
     * Elements are distributed following the iteration order of the input set; only the last chunk can be
     * smaller than `chunkSize`, and an empty input yields an empty list.
     * @param inchis set of String (possibly representing InChIs)
     * @param chunkSize (Integer) the size of resulting chunks
     * @return inchiChunks: a list of "chunks", smaller sets of strings
     */
    private List<Set<String>> getInchiChunks(Set<String> inchis, Integer chunkSize) {
        List<Set<String>> chunks = new ArrayList<>();
        // Chunk currently being filled; null means the next element has to start a new chunk.
        Set<String> openChunk = null;
        for (String element : inchis) {
            if (openChunk == null) {
                openChunk = new HashSet<>();
                chunks.add(openChunk);
            }
            openChunk.add(element);
            if (openChunk.size() == chunkSize) {
                // The chunk is full: the next element (if any) goes to a fresh one.
                openChunk = null;
            }
        }
        return chunks;
    }

    /**
     * Writes the Bing Search ranks for one chunk of InChIs to a TSV file, one row per database object
     * returned for the chunk. The database cursor is always closed, including when writing a row fails.
     * @param inchis (Set<String>) set of InChI string representations
     * @param outputPath (String) path indicating the output file
     * @param appendOutput (Boolean) whether to append the results to the output file
     * @throws IOException if the output file cannot be opened or written
     */
    private void writeBingSearchRanksAsTSVForInchiChunk(Set<String> inchis, String outputPath, Boolean appendOutput)
            throws IOException {

        // Define headers
        List<String> bingRankerHeaderFields = new ArrayList<>();
        addChemicalHeaders(bingRankerHeaderFields);

        // Open TSV writer
        try (TSVWriter<String, String> tsvWriter = new TSVWriter<>(bingRankerHeaderFields)) {
            tsvWriter.open(new File(outputPath), appendOutput);

            int counter = 0;
            DBCursor cursor = mongoDB.fetchNamesAndUsageForInchis(inchis);
            try {
                // Iterate through the target chemicals
                while (cursor.hasNext()) {
                    counter++;
                    BasicDBObject o = (BasicDBObject) cursor.next();
                    Map<String, String> row = new HashMap<>();
                    updateRowWithChemicalInformation(o, row);
                    tsvWriter.append(row);
                    tsvWriter.flush();
                }
            } finally {
                // Release the cursor even if building or writing a row throws mid-iteration.
                cursor.close();
            }
            LOGGER.info("Wrote %d Bing Search results to %s", counter, outputPath);
        }
    }

    /**
     * Writes the Bing Search ranks for a specific set of InChIs in a TSV file.
     * The InChIs are processed in chunks of at most {@code INCHI_CHUNK_SIZE} elements: the first chunk
     * creates/overwrites the output file and the following chunks are appended to it.
     * When the input set is empty nothing is written and the method simply returns (it used to terminate
     * the whole JVM, which is not acceptable for a method callable from other code).
     * @param inchis set of InChI string representations
     * @param outputPath path indicating the output file
     * @throws IOException if the output file cannot be opened or written
     */
    public void writeBingSearchRanksAsTSV(Set<String> inchis, String outputPath) throws IOException {

        List<Set<String>> inchiChunks = getInchiChunks(inchis, INCHI_CHUNK_SIZE);
        LOGGER.info("%d chunks of maximum size %d were found!", inchiChunks.size(), INCHI_CHUNK_SIZE);
        if (inchiChunks.isEmpty()) {
            LOGGER.warn("No chunks found. Nothing to write to %s", outputPath);
            return;
        }
        for (int chunkIndex = 0; chunkIndex < inchiChunks.size(); chunkIndex++) {
            // Only the first chunk starts a new file; every later chunk is appended.
            boolean appendOutput = chunkIndex > 0;
            writeBingSearchRanksAsTSVForInchiChunk(inchiChunks.get(chunkIndex), outputPath, appendOutput);
        }
    }

    /**
     * This function is used to write out the conditional reachability results with data on target chemical, root chemical,
     * depth of steps from root to target chemical, the bing search results, all the other names associated with the target
     * and inchi of the target in a tsv file. This function is not scalable since it has to have an in-memory representation
     * of the target and root molecule's bing results to input the data into the TSV file.
     * Descendants without a database object are skipped. A root molecule without a database object, or whose
     * Bing metadata is missing, is reported with its InChI as best name and {@link #DEFAULT_COUNT} as count.
     * A descendant without a recorded depth gets an empty DEPTH value.
     * @param inchisToProcess the InChIs (descendants and roots) whose database objects are fetched
     * @param descendantInchiToRootInchi mapping of chemical to its root chemical in the conditional reachability tree
     * @param depthOfPathFromRootToMolecule Since a chemical can be associated with only one root, there is a unique mapping between
     *                        the chemical and its depth from the root. This structure holds that information.
     * @param outputPath The output path of the tsv file.
     * @throws IOException if the output file cannot be opened or written
     */
    public void writeBingSearchRanksAsTSVUsingConditionalReachabilityFormat(Set<String> inchisToProcess,
            Map<String, String> descendantInchiToRootInchi, Map<String, Integer> depthOfPathFromRootToMolecule,
            String outputPath) throws IOException {

        // Define headers
        List<String> bingRankerHeaderFields = new ArrayList<>();
        addChemicalHeaders(bingRankerHeaderFields);
        bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.DEPTH.name());
        bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.ROOT_MOLECULE_BEST_NAME.name());
        bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.TOTAL_COUNT_SEARCH_RESULTS_ROOT.name());
        bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.ROOT_INCHI.name());

        LOGGER.info("The total number of inchis are: %d", inchisToProcess.size());

        LOGGER.info("Creating mappings between inchi and it's DB object");

        // TODO: We have to do an in-memory calculation of all the inchis since we need to pair up the descendant and root
        // db objects. This can take up a lot of memory.
        Map<String, BasicDBObject> inchiToDBObject = new HashMap<>();

        int cursorCounter = 0;
        DBCursor cursor = mongoDB.fetchNamesAndUsageForInchis(inchisToProcess);
        try {
            while (cursor.hasNext()) {
                cursorCounter++;
                BasicDBObject o = (BasicDBObject) cursor.next();

                // Check the raw field before calling toString(): the previous null check ran after
                // toString() and could therefore never trigger.
                Object inchiValue = o.get("InChI");
                if (inchiValue == null) {
                    LOGGER.error("Inchi could not be parsed.");
                    continue;
                }

                inchiToDBObject.put(inchiValue.toString(), o);
            }
        } finally {
            // Release the cursor even if iteration fails.
            cursor.close();
        }

        LOGGER.info("The total number of inchis found in the db is: %d", cursorCounter);

        LOGGER.info("Going to write to TSV file.");
        try (TSVWriter<String, String> tsvWriter = new TSVWriter<>(bingRankerHeaderFields)) {
            tsvWriter.open(new File(outputPath));

            int counter = 0;

            for (Map.Entry<String, String> descendantToRoot : descendantInchiToRootInchi.entrySet()) {
                String descendantInchi = descendantToRoot.getKey();
                String rootInchi = descendantToRoot.getValue();

                // Add all the descendant field results
                BasicDBObject descendentDBObject = inchiToDBObject.get(descendantInchi);
                if (descendentDBObject == null) {
                    LOGGER.info("Could not find info on inchi %s", descendantInchi);
                    continue;
                }

                // Add all descendant molecule fields
                Map<String, String> row = new HashMap<>();
                updateRowWithChemicalInformation(descendentDBObject, row);

                // Add all the root molecule fields
                row.put(ConditionalReachabilityHeaderFields.ROOT_INCHI.name(), rootInchi);
                BasicDBObject rootMetadata = getRootBingMetadataOrNull(inchiToDBObject.get(rootInchi));
                Object rootBestName = rootMetadata == null ? null : rootMetadata.get("best_name");
                Object rootCount = rootMetadata == null ? null : rootMetadata.get("total_count_search_results");

                // Fall back to the root InChI when there is no (or an empty) best name for the root.
                boolean hasRootBestName = rootBestName != null && !rootBestName.toString().equals("");
                row.put(ConditionalReachabilityHeaderFields.ROOT_MOLECULE_BEST_NAME.name(),
                        hasRootBestName ? rootBestName.toString() : rootInchi);
                row.put(ConditionalReachabilityHeaderFields.TOTAL_COUNT_SEARCH_RESULTS_ROOT.name(),
                        rootCount == null ? DEFAULT_COUNT.toString() : rootCount.toString());

                Integer depth = depthOfPathFromRootToMolecule.get(descendantInchi);
                if (depth == null) {
                    LOGGER.warn("No depth found for inchi %s", descendantInchi);
                }
                row.put(ConditionalReachabilityHeaderFields.DEPTH.name(),
                        depth == null ? EMPTY_STRING : depth.toString());

                tsvWriter.append(row);
                tsvWriter.flush();
                counter++;
            }

            LOGGER.info("Wrote %d rows to %s", counter, outputPath);
        }
    }

    /**
     * Extracts the "xref.BING.metadata" sub-document of a root molecule's database object.
     * @param rootDBObject database object of the root molecule, possibly null
     * @return the Bing metadata, or null if the object or any level of the path is missing
     */
    private static BasicDBObject getRootBingMetadataOrNull(BasicDBObject rootDBObject) {
        if (rootDBObject == null) {
            return null;
        }
        BasicDBObject rootXref = (BasicDBObject) rootDBObject.get("xref");
        if (rootXref == null) {
            return null;
        }
        BasicDBObject rootBing = (BasicDBObject) rootXref.get("BING");
        if (rootBing == null) {
            return null;
        }
        return (BasicDBObject) rootBing.get("metadata");
    }
}