act.installer.bing.BingSearcher.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.bing.BingSearcher.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.bing;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import com.act.utils.CLIUtil;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.mongodb.BasicDBObject;

import act.server.MongoDB;

/**
 * Annotates chemicals of the Installer DB with Bing Search results.
 *
 * <p>For each InChI the class fetches the known names of the molecule, picks the best name via
 * {@link BingSearchResults}, retrieves the total hit count and top search results for that name
 * (from the cache only, or from the cache and the search API, depending on {@code cacheOnly}),
 * intersects the results with the usage terms corpus and writes the resulting metadata back to
 * the chemical's database entry.
 *
 * <p>InChIs containing the marker {@code "FAKE"} are never processed.
 */
public class BingSearcher {

    // Formatter logger: every message is a printf-style format string, so variable data must be
    // passed as arguments rather than concatenated into the message.
    private static final Logger LOGGER = LogManager.getFormatterLogger(BingSearcher.class);
    private static final String USAGE_TERMS_FILENAME = "usage_terms.txt";
    // InChIs containing this marker are skipped by every entry point of this class.
    private static final String FAKE_INCHI_MARKER = "FAKE";

    public static final String HELP_MESSAGE = "This class contains the main logic for installing Bing Search results in the Installer DB.";

    public static final String OPTION_DB_NAME = "n";
    public static final String OPTION_DB_PORT = "p";
    public static final String OPTION_DB_HOST = "h";
    public static final String OPTION_CACHE_ONLY = "c";

    public static final String DEFAULT_HOST = "localhost";
    public static final String DEFAULT_PORT = "27017";
    public static final String DEFAULT_DATABASE = "actv01";

    /** Command line options understood by {@link #main(String[])}. */
    public static final List<Option.Builder> OPTION_BUILDERS = buildOptionBuilders();

    private final MongoDB db;
    private final BingSearchResults bingSearchResults;
    private final Set<String> usageTerms;
    private final boolean forceUpdate;
    private final boolean cacheOnly;

    /**
     * Builds the list of command line option builders.
     *
     * <p>A plain {@link ArrayList} is returned (instead of a double-brace initialised anonymous
     * subclass) so that no extra anonymous class is created for the sake of initialisation.
     * Only compile-time constants are referenced here, so the method is safe to call from a
     * static field initialiser.
     *
     * @return a new, mutable list holding one builder per supported option
     */
    private static List<Option.Builder> buildOptionBuilders() {
        List<Option.Builder> builders = new ArrayList<>();
        builders.add(Option.builder(OPTION_DB_NAME).argName("db name")
                .desc(String.format("The name of the database from which to fetch chemicals inchis and "
                        + "write Bing cross-references (default: %s). It needs to contains a 'chemicals' collection.",
                        DEFAULT_DATABASE))
                .hasArg().longOpt("db").type(String.class));
        builders.add(Option.builder(OPTION_DB_HOST).argName("DB host")
                .desc(String.format("The database host to which to connect (default: %s)", DEFAULT_HOST))
                .hasArg().longOpt("db-host"));
        builders.add(Option.builder(OPTION_DB_PORT).argName("DB port")
                .desc(String.format("The port on which to connect to the database (default: %s)", DEFAULT_PORT))
                .hasArg().longOpt("db-port"));
        builders.add(Option.builder(OPTION_CACHE_ONLY).argName("CACHE_ONLY").desc(
                "Use the cache only. If that option is used, they will not be any queries against the Bing search API.")
                .longOpt("cache-only").type(String.class));
        return builders;
    }

    /**
     * Command line entry point: connects to the database described by the options (falling back
     * to {@link #DEFAULT_HOST}, {@link #DEFAULT_PORT} and {@link #DEFAULT_DATABASE}) and
     * annotates every chemical returned by the database iterator, with {@code forceUpdate}
     * enabled.
     *
     * @param args command line arguments, see {@link #OPTION_BUILDERS}
     */
    public static void main(String[] args) {
        CLIUtil cliUtil = new CLIUtil(BingSearcher.class, HELP_MESSAGE, OPTION_BUILDERS);
        CommandLine cl = cliUtil.parseCommandLine(args);
        MongoDB db = new MongoDB(cl.getOptionValue(OPTION_DB_HOST, DEFAULT_HOST),
                Integer.parseInt(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_PORT)),
                cl.getOptionValue(OPTION_DB_NAME, DEFAULT_DATABASE));
        BingSearcher bingSearcher = new BingSearcher(db, true, cl.hasOption(OPTION_CACHE_ONLY));
        bingSearcher.addBingSearchResultsForEntireDatabase();
    }

    /**
     * Creates a searcher that does not force updates and is not restricted to the cache.
     *
     * @param db the database to read chemicals from and write annotations to
     */
    public BingSearcher(MongoDB db) {
        this(db, false, false);
    }

    /**
     * Creates a searcher and loads the usage terms corpus from {@code usage_terms.txt}.
     *
     * <p>If building the corpus fails with an {@link IOException}, the error is logged and the
     * JVM is terminated with exit status 1.
     *
     * @param db the database to read chemicals from and write annotations to
     * @param forceUpdate when {@code false}, chemicals that already have Bing search results
     *        are skipped; when {@code true}, they are processed again
     * @param cacheOnly when {@code true}, hit counts and top results are read through the
     *        {@code ...FromCache} accessors of {@link BingSearchResults} only
     */
    public BingSearcher(MongoDB db, boolean forceUpdate, boolean cacheOnly) {
        this.db = db;
        this.forceUpdate = forceUpdate;
        this.cacheOnly = cacheOnly;
        this.bingSearchResults = new BingSearchResults(true);
        // Get the usage terms
        LOGGER.debug("Getting usage terms corpus.");
        UsageTermsCorpus usageTermsCorpus = new UsageTermsCorpus(USAGE_TERMS_FILENAME);
        try {
            usageTermsCorpus.buildCorpus();
        } catch (IOException e) {
            // The trailing throwable is not consumed by the format string; it is attached to
            // the log event so the underlying cause is not lost.
            LOGGER.error("Usage term corpus source file not found in class resources: %s", USAGE_TERMS_FILENAME, e);
            System.exit(1);
        }
        this.usageTerms = usageTermsCorpus.getUsageTerms();
    }

    /**
     * Tells whether an InChI should be processed.
     *
     * @param inchi the InChI to test, may be {@code null}
     * @return {@code false} for {@code null} and for InChIs containing the fake marker,
     *         {@code true} otherwise
     */
    private static boolean isProcessableInchi(String inchi) {
        return inchi != null && !inchi.contains(FAKE_INCHI_MARKER);
    }

    /**
     * Annotates every processable InChI returned by the database iterator. Failures on
     * individual InChIs are logged and do not stop the iteration.
     */
    private void addBingSearchResultsForEntireDatabase() {
        Iterator<String> it = db.getIteratorOverInchis(new BasicDBObject());
        while (it.hasNext()) {
            String inchi = it.next();
            if (!isProcessableInchi(inchi)) {
                continue;
            }
            // Same skip rule as addBingSearchResultsForInchiSet, so both paths honour forceUpdate.
            if (!forceUpdate && db.hasBingSearchResultsFromInchi(inchi)) {
                LOGGER.debug("Existing Bing search results found for %s. Skipping.", inchi);
                continue;
            }
            try {
                addBingSearchResultsForInChI(inchi);
            } catch (IOException e) {
                LOGGER.error("Bing Search results could not be added for: %s", inchi, e);
            }
        }
    }

    /**
     * Annotates a single chemical with Bing Search results.
     *
     * @param inchi the InChI of the chemical to annotate
     * @return {@code true} if the chemical was updated in the database; {@code false} if no
     *         names were found for the InChI or no best name could be determined
     * @throws IOException if one of the calls made while selecting the best name or retrieving
     *         the search results fails with an I/O error
     */
    private boolean addBingSearchResultsForInChI(String inchi) throws IOException {
        // The InChI is passed as an argument: concatenating it would make the formatter logger
        // interpret its content as a format string.
        LOGGER.debug("Processing InChI %s", inchi);
        // Fetches the names (Brenda, Metacyc, Chebi, Drugbank)
        NamesOfMolecule namesOfMolecule = db.fetchNamesFromInchi(inchi);
        if (namesOfMolecule == null) {
            LOGGER.debug("Molecule corresponding to %s was not found in the database. Skipping.", inchi);
            return false;
        }
        // Chooses the best name according to Bing search results
        String bestName = bingSearchResults.findBestMoleculeName(namesOfMolecule);
        if (bestName == null || bestName.isEmpty()) {
            return false;
        }

        // Get the total number of hits and the top search results
        Long totalCountSearchResults;
        Set<SearchResult> topSearchResults;
        if (cacheOnly) {
            totalCountSearchResults = bingSearchResults.getTotalCountSearchResultsFromCache(bestName);
            topSearchResults = bingSearchResults.getTopSearchResultsFromCache(bestName);
        } else {
            totalCountSearchResults = bingSearchResults.getAndCacheTotalCountSearchResults(bestName);
            topSearchResults = bingSearchResults.getAndCacheTopSearchResults(bestName);
        }
        NameSearchResults nameSearchResults = new NameSearchResults(bestName);
        nameSearchResults.setTotalCountSearchResults(totalCountSearchResults);
        nameSearchResults.setTopSearchResults(topSearchResults);

        // Intersect usage names with search results; only terms with at least one URL are kept
        Set<UsageTermUrlSet> moleculeUsageTerms = new HashSet<>();
        for (String usageTerm : usageTerms) {
            UsageTermUrlSet usageTermUrlSet = new UsageTermUrlSet(usageTerm);
            usageTermUrlSet.populateUrlsFromNameSearchResults(nameSearchResults);
            if (usageTermUrlSet.getUrlSet().size() > 0) {
                moleculeUsageTerms.add(usageTermUrlSet);
            }
        }

        // Annotate the chemical with Bing Search Results
        BasicDBObject doc = db.createBingMetadataDoc(moleculeUsageTerms, totalCountSearchResults, bestName);
        db.updateChemicalWithBingSearchResults(inchi, bestName, doc);
        return true;
    }

    /**
     * Annotates the given chemicals with Bing Search results and usage terms.
     *
     * <p>{@code null} entries and InChIs containing the fake marker are ignored. Unless
     * {@code forceUpdate} is set, chemicals that already have Bing search results are skipped.
     * Failures on individual InChIs are logged and do not stop the processing. A progress
     * message is logged every 100 chemicals that were actually updated.
     *
     * @param inchis the InChIs of the chemicals to annotate
     */
    public void addBingSearchResultsForInchiSet(Set<String> inchis) {
        Set<String> filteredInchis = inchis.stream().filter(BingSearcher::isProcessableInchi)
                .collect(Collectors.toSet());
        LOGGER.info("Annotating %d chemicals with Bing Search results and usage terms.", filteredInchis.size());
        int addedCount = 0;
        for (String inchi : filteredInchis) {
            if (!forceUpdate && db.hasBingSearchResultsFromInchi(inchi)) {
                LOGGER.debug("Existing Bing search results found for %s. Skipping.", inchi);
                continue;
            }
            boolean added;
            try {
                added = addBingSearchResultsForInChI(inchi);
            } catch (IOException e) {
                LOGGER.error("Could not add bing results for %s. Skipping.", inchi, e);
                continue;
            }
            // Count only chemicals that were really updated, so the "Added" message stays truthful.
            if (added && ++addedCount % 100 == 0) {
                LOGGER.info("Added Bing Search results for %d chemicals (total %d)", addedCount,
                        filteredInchis.size());
            }
        }
    }
}