act.installer.reachablesexplorer.PatentFinder.java Source code

Java tutorial

Introduction

Here is the source code for the class act.installer.reachablesexplorer.PatentFinder (file PatentFinder.java).

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.reachablesexplorer;

import act.installer.pubchem.PubchemSynonymType;
import com.act.utils.CLIUtil;
import com.twentyn.patentSearch.Searcher;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.mongojack.DBCursor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class PatentFinder {
    private static final Logger LOGGER = LogManager.getFormatterLogger(PatentFinder.class);

    private static final String OPTION_DB_HOST = "H";
    private static final String OPTION_DB_PORT = "p";
    private static final String OPTION_TARGET_DB = "t";
    private static final String OPTION_TARGET_REACHABLES_COLLECTION = "c";
    private static final String OPTION_PATENT_INDEX_DIR = "i";

    // Default host. If running on a laptop, please set a SSH bridge to access speakeasy
    private static final String DEFAULT_HOST = "localhost";
    private static final Integer DEFAULT_PORT = 27017;

    // Target database and collection. We populate these with reachables
    // TODO These should all be turned into more long-term collections
    private static final String DEFAULT_TARGET_DATABASE = "wiki_reachables";
    private static final String DEFAULT_TARGET_COLLECTION = "reachablesv6_test_thomas";

    private static final String UNUSED_SEQUENCES_COLLECTION = null; // "dummy_sequences_v0"; // We won't touch these but need
    private static final String UNUSED_ASSETS_DIR = "/tmp"; // them for Loader's constructor.
    private static final String UNUSED_SOURCE_DB = null; // "dummy_source_db"; 

    // A directory of directories.  Each directory is one year's index, and ends in `.index`.
    private static final String DEFAULT_PATENT_INDEX_LOCATION = "data/patents";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class searches for patents related to molecules in a reachables DB, and updates the reachable documents ",
            "with references to those patents.  Patents are filtered by a manually selected relevance threshold." },
            " ");

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_DB_HOST).argName("DB host")
                    .desc(String.format("The database host to which to connect (default: %s)", DEFAULT_HOST))
                    .hasArg().longOpt("db-host"));
            add(Option.builder(OPTION_DB_PORT).argName("DB port")
                    .desc(String.format("The port on which to connect to the database (default: %d)", DEFAULT_PORT))
                    .hasArg().longOpt("db-port"));
            add(Option.builder(OPTION_PATENT_INDEX_DIR).argName("path").desc(String.format(
                    "A path to a directory of per-year indexes (directories) of patents ending in `.index` (default: %s)",
                    DEFAULT_PATENT_INDEX_LOCATION)).hasArg().longOpt("indexes-dir"));
            add(Option.builder(OPTION_TARGET_DB).argName("DB name")
                    .desc(String.format(
                            "The name of the DB into which to write reachable molecule documents (default: %s)",
                            DEFAULT_TARGET_DATABASE))
                    .hasArg().longOpt("dest-db-name"));
            add(Option.builder(OPTION_TARGET_REACHABLES_COLLECTION).argName("collection name").desc(String.format(
                    "The name of the collection in the dest DB to which to add patent references (default: %s)",
                    DEFAULT_TARGET_COLLECTION)).hasArg().longOpt("reachables-collection"));
        }
    };

    private static final List<PubchemSynonymType> SYNONYM_TYPE_PREFERENCE = Collections
            .unmodifiableList(Arrays.asList(PubchemSynonymType.TRIVIAL_NAME,
                    PubchemSynonymType.INTL_NONPROPRIETARY_NAME, PubchemSynonymType.DEPOSITORY_NAME, // Beware: this list can be huge.  TODO: be clever and shorten it?
                    PubchemSynonymType.DRUG_TRADE_NAME, PubchemSynonymType.IUPAC_NAME));

    public static void main(String[] args) throws Exception {
        CLIUtil cliUtil = new CLIUtil(Loader.class, HELP_MESSAGE, OPTION_BUILDERS);
        CommandLine cl = cliUtil.parseCommandLine(args);

        String host = cl.getOptionValue(OPTION_DB_HOST, DEFAULT_HOST);
        Integer port = Integer.parseInt(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_PORT.toString()));
        String targetDB = cl.getOptionValue(OPTION_TARGET_DB, DEFAULT_TARGET_DATABASE);
        String collection = cl.getOptionValue(OPTION_TARGET_REACHABLES_COLLECTION, DEFAULT_TARGET_COLLECTION);
        LOGGER.info("Connecting to %s:%d/%s, using collection %s", host, port, targetDB, collection);

        Loader loader = new Loader(host, port, UNUSED_SOURCE_DB, targetDB, collection, UNUSED_SEQUENCES_COLLECTION,
                UNUSED_ASSETS_DIR);

        File indexesTopDir = new File(cl.getOptionValue(OPTION_PATENT_INDEX_DIR, DEFAULT_PATENT_INDEX_LOCATION));
        if (!indexesTopDir.exists() || !indexesTopDir.isDirectory()) {
            cliUtil.failWithMessage("Index top-level directory at %s is not a directory",
                    indexesTopDir.getAbsolutePath());
        }

        LOGGER.info("Using index top level dir: %s", indexesTopDir.getAbsolutePath());

        PatentFinder finder = new PatentFinder();
        try (Searcher searcher = Searcher.Factory.getInstance().build(indexesTopDir)) {
            finder.run(loader, searcher);
        }
    }

    private void run(Loader loader, Searcher searcher) throws IOException {
        DBCursor<Reachable> reachableDBCursor = loader.getJacksonReachablesCollection().find();

        while (reachableDBCursor.hasNext()) {
            Reachable reachable = reachableDBCursor.next();

            SynonymData synonyms = reachable.getSynonyms();
            Set<String> preferredSynonyms = null;
            if (synonyms != null) {
                Map<PubchemSynonymType, Set<String>> pubchemSynonyms = synonyms.getPubchemSynonyms();
                /* Search for different kinds of synonyms in order of preference (where preference tries to strike a balance
                 * between verbosity and specificity).  Stop when we've found a type of synonym that is available for this
                 * molecule, and use that in the patent search. */
                for (PubchemSynonymType type : SYNONYM_TYPE_PREFERENCE) {
                    if (pubchemSynonyms.containsKey(type)) {
                        preferredSynonyms = pubchemSynonyms.get(type);
                        break;
                    }
                }
            }

            if (preferredSynonyms == null) {
                LOGGER.warn("No synonyms for molecule %s", reachable.getInchi());
                preferredSynonyms = Collections.emptySet();
            }

            List<String> allNames = new ArrayList<>(reachable.getNames());
            allNames.addAll(preferredSynonyms);

            allNames.removeIf(s -> s == null || s.length() < 3); // Eliminate potential garbage rankings for short names.
            // Note: stop words should not appear in the index, so no need to filter on terms.
            Collections.sort(allNames);

            LOGGER.info("Running query with terms: %s", StringUtils.join(allNames, ", "));

            List<Searcher.SearchResult> results = searcher.searchInClaims(allNames);

            if (results.size() > 0) {
                LOGGER.info("Results (%d) for %s:", results.size(), reachable.getPageName());
                List<PatentSummary> summaries = new ArrayList<>(results.size());
                for (Searcher.SearchResult result : results) {
                    LOGGER.info("(%.3f) %s: %s", result.getRelevanceScore(), result.getId(), result.getTitle());
                    summaries.add(new PatentSummary(result.getId(), result.getTitle(), result.getRelevanceScore()));
                }

                reachable.setPatentSummaries(summaries);
                loader.upsert(reachable);

            } else {
                LOGGER.info("No results for %s", reachable.getPageName());
            }

        }
    }
}