Java tutorial
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.bing; import act.server.MongoDB; import com.act.utils.TSVWriter; import com.mongodb.BasicDBObject; import com.mongodb.DBCursor; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.lang3.StringUtils; import org.apache.http.client.utils.URIBuilder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * This module provide a command line interface to update and export Bing Search results and ranks from the Installer * database. It supports two types of input: raw list of InChI and TSV file with an InChI header. * Usage (raw input): * sbt 'runMain act.installer.bing.BingSearchRanker * -i MNT_SHARED_DATA/Thomas/bing_ranker/l2chemicalsProductFiltered.txt * -o MNT_SHARED_DATA/Thomas/bing_ranker/l2chemicalsProductFiltered_BingSearchRanker_results.tsv' * Usage (TSV input): * sbt 'runMain act.installer.bing.BingSearchRanker * -i MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_20160617T1723.txt.hits * -o MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_BingSearchRanker_results.tsv' * -t * Usage (TSV input & all extra options, including force update): * sbt 'runMain act.installer.bing.BingSearchRanker * -i MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_20160617T1723.txt.hits * -o MNT_SHARED_DATA/Thomas/bing_ranker/benzene_search_results_wikipedia_BingSearchRanker_results.tsv' * -t -c -w -u -f */ public class BingSearchRanker { private static final Logger LOGGER = LogManager.getFormatterLogger(BingSearchRanker.class); private static final String EMPTY_STRING = ""; // Default configuration for the Installer database public static final String DEFAULT_HOST = "localhost"; public static final int DEFAULT_PORT = 27017; public static final String DEFAULT_INSTALLER_DATABASE = "actv01"; // Configuration for usage explorer UI public static final String HOST_USAGE_EXPLORER = "usage-explorer"; public static final int PORT_USAGE_EXPLORER = 8080; // Define options for CLI public static final String OPTION_INPUT_FILEPATH = "i"; public static final String OPTION_OUTPUT_FILEPATH = "o"; public static final String OPTION_TSV_INPUT = "t"; public static final String OPTION_FORCE_UPDATE = "f"; public static final String OPTION_INCLUDE_CHEBI_APPLICATIONS = "c"; public static final String OPTION_INCLUDE_WIKIPEDIA_URL = "w"; public static final String OPTION_INCLUDE_USAGE_EXPLORER_URL = "u"; // Other static variables public static final Integer DEFAULT_COUNT = 0; private static final Integer INCHI_CHUNK_SIZE = 10000; public static final String HELP_MESSAGE = StringUtils.join(new String[] { "This class adds Bing Search results for a list of molecules in the Installer (actv01) database", "and exports the results in a TSV format for easy import in Google spreadsheets.", "It supports two different input formats: raw list of InChI strings and TSV file with an InChI column.", "Default input format (with only options -i and -o) is raw list of InChI." }, " "); public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() { { add(Option.builder(OPTION_INPUT_FILEPATH).argName("INPUT_FILEPATH") .desc("The full path to the input file").hasArg().required().longOpt("input_filepath") .type(String.class)); add(Option.builder(OPTION_OUTPUT_FILEPATH).argName("OUTPUT_PATH") .desc("The full path where to write the output.").hasArg().required().longOpt("output_path") .type(String.class)); add(Option.builder(OPTION_TSV_INPUT).argName("TSV_INPUT") .desc("Whether the input is a TSV file with an InChI column.").longOpt("tsv") .type(boolean.class)); add(Option.builder(OPTION_FORCE_UPDATE).argName("FORCE_UPDATE") .desc("Whether exisitng BING cross-references in the Installer database should be overwritten.") .longOpt("force_update").type(boolean.class)); add(Option.builder(OPTION_INCLUDE_CHEBI_APPLICATIONS).argName("INCLUDE_CHEBI_APPLICATIONS") .desc("Whether to include (when applicable) ChEBI applications in the output file.") .longOpt("include_chebi").type(boolean.class)); add(Option.builder(OPTION_INCLUDE_WIKIPEDIA_URL).argName("INCLUDE_WIKIPEDIA_URL") .desc("Whether to include (when applicable) the Wikipedia URL in the output file.") .longOpt("include_wikipedia").type(boolean.class)); add(Option.builder(OPTION_INCLUDE_USAGE_EXPLORER_URL).argName("INCLUDE_USAGE_EXPLORER_URL") .desc("Whether to include (when applicable) the usage explorer UI URL in the output file.") .longOpt("include_usage").type(boolean.class)); add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help")); } }; public static final HelpFormatter HELP_FORMATTER = new HelpFormatter(); static { HELP_FORMATTER.setWidth(100); } public enum BingRankerHeaderFields { INCHI, BEST_NAME, TOTAL_COUNT_SEARCH_RESULTS, ALL_NAMES, WIKIPEDIA_URL, CHEBI_MAIN_APPLICATIONS, CHEBI_DIRECT_APPLICATIONS, USAGE_EXPLORER_URL } public enum ConditionalReachabilityHeaderFields { DEPTH, ROOT_MOLECULE_BEST_NAME, ROOT_INCHI, TOTAL_COUNT_SEARCH_RESULTS_ROOT } // Instance variables private MongoDB mongoDB; private BingSearcher bingSearcher; private Boolean includeChebiApplications; private Boolean includeWikipediaUrl; private Boolean includeUsageExplorerUrl; public BingSearchRanker() { this(false, false, false, false); } public BingSearchRanker(Boolean includeChebiApplications, Boolean includeWikipediaUrl, Boolean includeUsageExplorerUrl, Boolean forceUpdate) { this.mongoDB = new MongoDB(DEFAULT_HOST, DEFAULT_PORT, DEFAULT_INSTALLER_DATABASE); this.bingSearcher = new BingSearcher(this.mongoDB, forceUpdate, false); this.includeChebiApplications = includeChebiApplications; this.includeWikipediaUrl = includeWikipediaUrl; this.includeUsageExplorerUrl = includeUsageExplorerUrl; } public static void main(final String[] args) throws Exception { // Parse the command line options Options opts = new Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build()); } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { System.err.format("Argument parsing failed: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(BingSearchRanker.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(BingSearchRanker.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } String inputPath = cl.getOptionValue(OPTION_INPUT_FILEPATH); String outputPath = cl.getOptionValue(OPTION_OUTPUT_FILEPATH); Boolean isTSVInput = cl.hasOption(OPTION_TSV_INPUT); // Read the molecule corpus LOGGER.info("Reading the input molecule corpus"); MoleculeCorpus moleculeCorpus = new MoleculeCorpus(); if (isTSVInput) { LOGGER.info("Input format is TSV"); moleculeCorpus.buildCorpusFromTSVFile(inputPath); } else { LOGGER.info("Input format is raw InChIs"); moleculeCorpus.buildCorpusFromRawInchis(inputPath); } // Get the inchi set Set<String> inchis = moleculeCorpus.getMolecules(); LOGGER.info("Found %d molecules in the input corpus", inchis.size()); // Update the Bing Search results in the Installer database BingSearchRanker bingSearchRanker = new BingSearchRanker(cl.hasOption(OPTION_INCLUDE_CHEBI_APPLICATIONS), cl.hasOption(OPTION_INCLUDE_WIKIPEDIA_URL), cl.hasOption(OPTION_INCLUDE_USAGE_EXPLORER_URL), cl.hasOption(OPTION_FORCE_UPDATE)); LOGGER.info("Updating the Bing Search results in the Installer database"); bingSearchRanker.addBingSearchResults(inchis); LOGGER.info("Done updating the Bing Search results"); // Write the results in a TSV file LOGGER.info("Writing results to output file"); bingSearchRanker.writeBingSearchRanksAsTSV(inchis, outputPath); LOGGER.info("Bing Search ranker is done. \"I'm tired, boss.\""); } /** * This function constructs the Usage Explorer URL for TSV export * @param inchi the InChI string representation of the molecule * @return a String with the link to access the Usage Explorer app. */ public String getUsageExplorerURLStringFromInchi(String inchi) { try { URI uri = new URIBuilder().setScheme("http").setHost(HOST_USAGE_EXPLORER).setPort(PORT_USAGE_EXPLORER) .setParameter("inchi", inchi).build(); return uri.toString(); } catch (URISyntaxException e) { LOGGER.error("An error occurred when trying to build the Usage Explorer URI", e); } return null; } /** * This function add the Bing Search results to the installer database from a set of InChI strings * @param inchis set of InChI string representations */ public void addBingSearchResults(Set<String> inchis) throws IOException { bingSearcher.addBingSearchResultsForInchiSet(inchis); } /** * Add InChI, names and usage information related headers to a list of header fields. * @param headerFields List of headers to be populated */ private void addChemicalHeaders(List<String> headerFields) { headerFields.add(BingRankerHeaderFields.INCHI.name()); headerFields.add(BingRankerHeaderFields.BEST_NAME.name()); headerFields.add(BingRankerHeaderFields.TOTAL_COUNT_SEARCH_RESULTS.name()); headerFields.add(BingRankerHeaderFields.ALL_NAMES.name()); if (includeChebiApplications) { headerFields.add(BingRankerHeaderFields.CHEBI_MAIN_APPLICATIONS.name()); headerFields.add(BingRankerHeaderFields.CHEBI_DIRECT_APPLICATIONS.name()); } if (includeWikipediaUrl) { headerFields.add(BingRankerHeaderFields.WIKIPEDIA_URL.name()); } if (includeUsageExplorerUrl) { headerFields.add(BingRankerHeaderFields.USAGE_EXPLORER_URL.name()); } } /** * Updates a TSV row (actually a Map from header to value) with InChI, names and usage information. * @param o BasicDBObject containing InChI, and xrefs.{BING, CHEBI, WIKIPEDIA} info * @param row TSV row (map from TSV header to value) to be updated */ private void updateRowWithChemicalInformation(BasicDBObject o, Map<String, String> row) { String inchi = o.get("InChI").toString(); row.put(BingRankerHeaderFields.INCHI.name(), inchi); BasicDBObject xref = (BasicDBObject) o.get("xref"); BasicDBObject bing = (BasicDBObject) xref.get("BING"); BasicDBObject bingMetadata = (BasicDBObject) bing.get("metadata"); row.put(BingRankerHeaderFields.BEST_NAME.name(), bingMetadata.get("best_name").toString()); row.put(BingRankerHeaderFields.TOTAL_COUNT_SEARCH_RESULTS.name(), bingMetadata.get("total_count_search_results").toString()); NamesOfMolecule namesOfMolecule = mongoDB.getNamesFromBasicDBObject(o); Set<String> names = namesOfMolecule.getAllNames(); row.put(BingRankerHeaderFields.ALL_NAMES.name(), names.toString()); if (includeChebiApplications) { BasicDBObject chebi = (BasicDBObject) xref.get("CHEBI"); if (chebi != null) { BasicDBObject chebiMetadata = (BasicDBObject) chebi.get("metadata"); BasicDBObject chebiApplications = (BasicDBObject) chebiMetadata.get("applications"); if (chebiApplications != null) { row.put(BingRankerHeaderFields.CHEBI_MAIN_APPLICATIONS.name(), chebiApplications.get("main_applications").toString()); row.put(BingRankerHeaderFields.CHEBI_DIRECT_APPLICATIONS.name(), chebiApplications.get("direct_applications").toString()); } else { LOGGER.debug("ChEBI cross-reference found, but no ChEBI applications for %s", inchi); row.put(BingRankerHeaderFields.CHEBI_MAIN_APPLICATIONS.name(), EMPTY_STRING); row.put(BingRankerHeaderFields.CHEBI_DIRECT_APPLICATIONS.name(), EMPTY_STRING); } } else { LOGGER.debug("No ChEBI cross-reference found for %s", inchi); } } if (includeWikipediaUrl) { BasicDBObject wikipedia = (BasicDBObject) xref.get("WIKIPEDIA"); if (wikipedia != null) { row.put(BingRankerHeaderFields.WIKIPEDIA_URL.name(), wikipedia.get("dbid").toString()); } else { LOGGER.debug("No Wikipedia cross-reference found for %s", inchi); row.put(BingRankerHeaderFields.WIKIPEDIA_URL.name(), EMPTY_STRING); } } if (includeUsageExplorerUrl) { row.put(BingRankerHeaderFields.USAGE_EXPLORER_URL.name(), getUsageExplorerURLStringFromInchi(inchi)); } } /** * Divide a large set of Strings into a list of smaller sets (chunks) of size `chunkSize` * @param inchis set of String (possibly representing InChIs) * @param chunkSize (Integer) the size of resulting chunks * @return inchiChunks: a list of "chunks", smaller sets of strings */ private List<Set<String>> getInchiChunks(Set<String> inchis, Integer chunkSize) { List<Set<String>> inchiChunks = new ArrayList<>(); Set<String> inchiChunk = new HashSet<>(); for (String inchi : inchis) { inchiChunk.add(inchi); if (inchiChunk.size() == chunkSize) { inchiChunks.add(inchiChunk); inchiChunk = new HashSet<>(); } } if (inchiChunk.size() > 0) { inchiChunks.add(inchiChunk); } return inchiChunks; } /** * This function writes the Bing Search ranks for a chunk of inchis in a TSV file, append only option. * @param inchis (Set<String>) set of InChI string representations * @param outputPath (String) path indicating the output file * @param appendOutput (Boolean) whether to append the results to the output file * @throws IOException */ private void writeBingSearchRanksAsTSVForInchiChunk(Set<String> inchis, String outputPath, Boolean appendOutput) throws IOException { // Define headers List<String> bingRankerHeaderFields = new ArrayList<>(); addChemicalHeaders(bingRankerHeaderFields); // Open TSV writer try (TSVWriter<String, String> tsvWriter = new TSVWriter<>(bingRankerHeaderFields)) { tsvWriter.open(new File(outputPath), appendOutput); int counter = 0; DBCursor cursor = mongoDB.fetchNamesAndUsageForInchis(inchis); // Iterate through the target chemicals while (cursor.hasNext()) { counter++; BasicDBObject o = (BasicDBObject) cursor.next(); Map<String, String> row = new HashMap<>(); updateRowWithChemicalInformation(o, row); tsvWriter.append(row); tsvWriter.flush(); } LOGGER.info("Wrote %d Bing Search results to %s", counter, outputPath); } } /** * This function writes the Bing Search ranks for a specific set of inchis in a TSV file. * @param inchis set of InChI string representations * @param outputPath path indicating the output file * @throws IOException */ public void writeBingSearchRanksAsTSV(Set<String> inchis, String outputPath) throws IOException { List<Set<String>> inchiChunks = getInchiChunks(inchis, INCHI_CHUNK_SIZE); LOGGER.info("%d chunks of maximum size %d were found!", inchiChunks.size(), INCHI_CHUNK_SIZE); if (inchiChunks.size() == 0) { LOGGER.info("No chunks found. Exiting!"); System.exit(1); } writeBingSearchRanksAsTSVForInchiChunk(inchiChunks.get(0), outputPath, false); for (int chunkIndex = 1; chunkIndex < inchiChunks.size(); chunkIndex++) { writeBingSearchRanksAsTSVForInchiChunk(inchiChunks.get(chunkIndex), outputPath, true); } } /** * This function is used to write out the conditional reachability results with data on target chemical, root chemical, * depth of steps from root to target chemical, the bing search results, all the other names associated with the target * and inchi of the target in a tsv file. This function is not scalable since it has to have an in-memory representation * of the target and root molecule's bing results to input the data into the TSV file. * @param descendantInchiToRootInchi mapping of chemical to its root chemical in the conditional reachability tree * @param depthOfPathFromRootToMolecule Since a chemical can be associated with only one root, there is a unique mapping between * the chemical and it's depth from the root. This structure holds that information. * @param outputPath The output path of the tsv file. * @throws IOException */ public void writeBingSearchRanksAsTSVUsingConditionalReachabilityFormat(Set<String> inchisToProcess, Map<String, String> descendantInchiToRootInchi, Map<String, Integer> depthOfPathFromRootToMolecule, String outputPath) throws IOException { // Define headers List<String> bingRankerHeaderFields = new ArrayList<>(); addChemicalHeaders(bingRankerHeaderFields); bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.DEPTH.name()); bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.ROOT_MOLECULE_BEST_NAME.name()); bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.TOTAL_COUNT_SEARCH_RESULTS_ROOT.name()); bingRankerHeaderFields.add(ConditionalReachabilityHeaderFields.ROOT_INCHI.name()); LOGGER.info("The total number of inchis are: %d", inchisToProcess.size()); LOGGER.info("Creating mappings between inchi and it's DB object"); DBCursor cursor = mongoDB.fetchNamesAndUsageForInchis(inchisToProcess); // TODO: We have to do an in-memory calculation of all the inchis since we need to pair up the descendant and root // db objects. This can take up a lot of memory. Map<String, BasicDBObject> inchiToDBObject = new HashMap<>(); int cursorCounter = 0; while (cursor.hasNext()) { cursorCounter++; BasicDBObject o = (BasicDBObject) cursor.next(); String inchi = o.get("InChI").toString(); if (inchi == null) { LOGGER.error("Inchi could not be parsed."); continue; } inchiToDBObject.put(inchi, o); } LOGGER.info("The total number of inchis found in the db is: %d", cursorCounter); LOGGER.info("Going to write to TSV file."); try (TSVWriter<String, String> tsvWriter = new TSVWriter<>(bingRankerHeaderFields)) { tsvWriter.open(new File(outputPath)); int counter = 0; for (String descendantInchi : descendantInchiToRootInchi.keySet()) { // Add all the descendant field results BasicDBObject descendentDBObject = inchiToDBObject.get(descendantInchi); if (descendentDBObject == null) { LOGGER.info("Could not find info on inchi %s", descendantInchi); continue; } // Add all descendant molecule fields Map<String, String> row = new HashMap<>(); updateRowWithChemicalInformation(descendentDBObject, row); // Add all the root molecule fields String rootInchi = descendantInchiToRootInchi.get(descendantInchi); row.put(ConditionalReachabilityHeaderFields.ROOT_INCHI.name(), rootInchi); BasicDBObject rootDBObject = inchiToDBObject.get(rootInchi); if (rootDBObject != null) { BasicDBObject rootXref = (BasicDBObject) rootDBObject.get("xref"); BasicDBObject rootBing = (BasicDBObject) rootXref.get("BING"); BasicDBObject rootMetadata = (BasicDBObject) rootBing.get("metadata"); String bestNameForRootMolecule = rootMetadata.get("best_name").toString(); row.put(ConditionalReachabilityHeaderFields.ROOT_MOLECULE_BEST_NAME.name(), bestNameForRootMolecule.equals("") ? rootInchi : bestNameForRootMolecule); row.put(ConditionalReachabilityHeaderFields.TOTAL_COUNT_SEARCH_RESULTS_ROOT.name(), rootMetadata.get("total_count_search_results").toString()); } else { row.put(ConditionalReachabilityHeaderFields.ROOT_MOLECULE_BEST_NAME.name(), rootInchi); row.put(ConditionalReachabilityHeaderFields.TOTAL_COUNT_SEARCH_RESULTS_ROOT.name(), DEFAULT_COUNT.toString()); } row.put(ConditionalReachabilityHeaderFields.DEPTH.name(), depthOfPathFromRootToMolecule.get(descendantInchi).toString()); tsvWriter.append(row); tsvWriter.flush(); counter++; } LOGGER.info("Wrote %d rows to %s", counter, outputPath); } } }