com.twentyn.patentSearch.Searcher.java Source code

Introduction

Here is the source code for com.twentyn.patentSearch.Searcher.java, a Lucene-based searcher from the 20n/act project that scores patent claims for chemical synonyms and biosynthesis-related keywords.

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentSearch;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class Searcher implements AutoCloseable {
    public static final Logger LOGGER = LogManager.getFormatterLogger(Searcher.class);

    private static final List<String> KEYWORDS = Collections
            .unmodifiableList(Arrays.asList("yeast", "cerevisiae", "coli", "biosynthesis", "biogenesis",
                    "anabolism", "catalysis", "ferment", "fermenter", "fermentor", "fermentation", "fermentive"));
    private static final String CLAIMS_FIELD = "claims";
    private static final int MAX_RESULTS_PER_QUERY = 100;
    // Note: this score is likely dependent on the set of keywords above.  Adjust this if KEYWORDS change.
    private static final float DEFAULT_SCORE_THRESHOLD = 0.1f;

    private List<Pair<IndexReader, IndexSearcher>> indexReadersAndSearchers = new ArrayList<>();
    private float scoreThreshold = DEFAULT_SCORE_THRESHOLD;

    private Searcher() {

    }

    private Searcher(float scoreThreshold) {
        this.scoreThreshold = scoreThreshold;
    }

    private void init(List<File> indexDirectories) throws IOException {
        for (File indexDirectory : indexDirectories) {
            LOGGER.info("Opening index dir at %s", indexDirectory.getAbsolutePath());
            Directory indexDir = FSDirectory.open(indexDirectory.toPath());
            IndexReader indexReader = DirectoryReader.open(indexDir);
            IndexSearcher searcher = new IndexSearcher(indexReader);
            // Only add to the list if both of these calls work.
            indexReadersAndSearchers.add(Pair.of(indexReader, searcher));
        }
    }

    @Override
    public void close() throws IOException {
        for (IndexReader reader : indexReadersAndSearchers.stream().map(Pair::getLeft)
                .collect(Collectors.toList())) {
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error("Unable to close index reader, but continuing to try closing others: %s",
                        e.getMessage());
            }
        }
    }

    public static class Factory {
        private static final Factory INSTANCE = new Factory();

        private Factory() {

        }

        public static Factory getInstance() {
            return INSTANCE;
        }

        public Searcher build(File indexTopDir, float scoreThreshold) throws IOException {
            Searcher s = new Searcher(scoreThreshold);
            runInit(indexTopDir, s);
            return s;
        }

        public Searcher build(File indexTopDir) throws IOException {
            Searcher s = new Searcher();
            runInit(indexTopDir, s);
            return s;
        }

        private void runInit(File indexTopDir, Searcher s) throws IOException {
            if (!indexTopDir.isDirectory()) {
                String msg = String.format("Top level directory at %s is not a directory",
                        indexTopDir.getAbsolutePath());
                LOGGER.error(msg);
                throw new IOException(msg);
            }

            List<File> individualIndexes = Arrays.stream(indexTopDir.listFiles())
                    .filter(f -> f.getName().endsWith(".index")).collect(Collectors.toList());
            if (individualIndexes.size() == 0) {
                String msg = String.format("Top level directory at %s contains no index sub-directories",
                        indexTopDir.getAbsolutePath());
                LOGGER.error(msg);
                throw new IOException(msg);
            }

            s.init(individualIndexes);
        }
    }

    /**
     * Search for patents that contain any of the specified chemical synonyms, scored based on synonym and biosynthesis
     * keyword occurrence.  Results are filtered by score.
     * @param synonyms A list of chemical synonyms to use in the search.
     * @return A list of search results whose relevance scores are above the searcher's score threshold.
     * @throws IOException if an error occurs while searching the underlying Lucene indices.
     */
    public List<SearchResult> searchInClaims(List<String> synonyms) throws IOException {
        if (synonyms.size() == 0) {
            LOGGER.info("Not running search for no synonyms!");
            return Collections.emptyList();
        }

        // Make queries for all synonyms.
        final List<BooleanQuery> queries = makeQueries(synonyms, CLAIMS_FIELD).collect(Collectors.toList());

        // Reuse the compiled queries for all indices.
        try {
            Set<Triple<Float, String, String>> uniqueResults = indexReadersAndSearchers.stream()
                    .map(p -> runSearch(p, queries)) // Search to get per-query streams...
                    .flatMap(Function.identity()) // combine all the streams into one...
                    .collect(Collectors.toSet()); // and collect the merged results in a set.

            /* Uniq-ify!  It is completely reasonable for a patent to appear for multiple queries.
             * TODO: we haven't seen results appear multiple times with different scores.  We should probably unique-ify
             * on id and take the result with the best score just to be safe. */
            List<Triple<Float, String, String>> results = new ArrayList<>(uniqueResults);
            Collections.sort(results);

            return results.stream().map(t -> new SearchResult(t.getMiddle(), t.getRight(), t.getLeft()))
                    .collect(Collectors.toList());
        } catch (UncheckedIOException e) {
            throw e.getCause(); // Promote back to a regular exception for handling by the caller.
        }
    }

    // Run a set of queries over a single reader + searcher.
    private Stream<Triple<Float, String, String>> runSearch(Pair<IndexReader, IndexSearcher> readerSearcher,
            List<BooleanQuery> queries) throws UncheckedIOException {

        // With hints from http://stackoverflow.com/questions/22382453/java-8-streams-flatmap-method-example
        return queries.stream().map(q -> executeQuery(readerSearcher, q)).flatMap(Collection::stream);
    }

    // Run a single query on a single reader + searcher.
    private List<Triple<Float, String, String>> executeQuery(Pair<IndexReader, IndexSearcher> readerSearcher,
            BooleanQuery query) throws UncheckedIOException {
        TopDocs topDocs;
        try {
            topDocs = readerSearcher.getRight().search(query, MAX_RESULTS_PER_QUERY);
        } catch (IOException e) {
            LOGGER.error("Caught IO exception when trying to run search for %s: %s", query, e.getMessage());
            /* Wrap `e` in an unchecked exception to allow it to escape our call stack.  The top-level function will catch
             * and rethrow it as a normal IOException. */
            throw new UncheckedIOException(e);
        }

        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        if (scoreDocs.length == 0) {
            LOGGER.debug("Search returned no results.");
            return Collections.emptyList();
        }
        // ScoreDoc just contains a score and an id.  We need to retrieve the documents' content using that id.

        /* Crux of the next bit:
         * Filter by score and convert from scoreDocs to document features.
         * No need to use `limit` here since we already had Lucene cap the result set size. */
        return Arrays.stream(scoreDocs).filter(scoreDoc -> scoreDoc.score >= scoreThreshold).map(scoreDoc -> {
            try {
                Pair<String, String> features = this
                        .extractDocFeatures(readerSearcher.getLeft().document(scoreDoc.doc));
                // Put the score first so the natural sort order is based on score.
                return Triple.of(scoreDoc.score, features.getLeft(), features.getRight());
            } catch (IOException e) {
                // Yikes, this is v. bad.
                LOGGER.error("Caught IO exception when trying to read doc id %d: %s", scoreDoc.doc, e.getMessage());
                throw new UncheckedIOException(e); // Same as above.
            }
        }).collect(Collectors.toList());
    }

    // Just extract the id and title for now.  The id contains the patent number, and the title is enough for display.
    private Pair<String, String> extractDocFeatures(Document doc) {
        return Pair.of(doc.get("id"), doc.get("title"));
    }

    private Stream<BooleanQuery> makeQueries(List<String> synonyms, String field) {
        return synonyms.stream().filter(syn -> syn != null && !syn.isEmpty()).map(syn -> makeQuery(syn, field));
    }

    private BooleanQuery makeQuery(String synonym, String field) {
        BooleanQuery bq = new BooleanQuery();

        // Set the synonym as a required phrase query.  Phrase queries handle multi-word synonyms, but must be built term by term.
        String queryString = synonym.trim().toLowerCase();
        String[] parts = queryString.split("\\s+");
        PhraseQuery query = new PhraseQuery();
        Arrays.stream(parts).forEach(p -> query.add(new Term(field, p)));
        bq.add(query, BooleanClause.Occur.MUST);

        // Append all keywords as optional clauses.  The more of these we find, the higher the score will be.
        KEYWORDS.forEach(term -> bq.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD));

        return bq;
    }

    public static class SearchResult {
        String id;
        String title;
        Float relevanceScore;

        public SearchResult(String id, String title, Float relevanceScore) {
            this.id = id;
            this.title = title;
            /* Relevance scores are defined by Apache Lucene, and are dependent on the structure of the query.
             * See the scoring docs at
             * https://lucene.apache.org/core/5_2_1/core/org/apache/lucene/search/package-summary.html#package_description
             * for details. */
            this.relevanceScore = relevanceScore;
        }

        public String getId() {
            return id;
        }

        public String getTitle() {
            return title;
        }

        public Float getRelevanceScore() {
            return relevanceScore;
        }
    }
}
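
Usage Example

Below is a minimal usage sketch, not part of the original file. The index directory path and the synonym list are hypothetical; the top-level directory is assumed to contain the *.index sub-directories that Factory.runInit expects. The Searcher is built through its Factory and closed with try-with-resources, since it implements AutoCloseable.

import com.twentyn.patentSearch.Searcher;

import java.io.File;
import java.util.Arrays;
import java.util.List;

public class SearcherExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical top-level directory containing one or more *.index sub-directories.
        File indexTopDir = new File("/path/to/patent-indexes");

        // Build a Searcher with the default score threshold; try-with-resources
        // closes all of its index readers when the block exits.
        try (Searcher searcher = Searcher.Factory.getInstance().build(indexTopDir)) {
            // Hypothetical chemical synonyms to look for in patent claims.
            List<String> synonyms = Arrays.asList("farnesene", "alpha-farnesene");

            // Print id, title, and relevance score for each result above the threshold.
            for (Searcher.SearchResult result : searcher.searchInClaims(synonyms)) {
                System.out.printf("%s\t%s\t%.3f%n",
                        result.getId(), result.getTitle(), result.getRelevanceScore());
            }
        }
    }
}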