com.twentyn.patentSearch.DocumentSearch.java Source code

Introduction

Here is the source code for com.twentyn.patentSearch.DocumentSearch.java.
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentSearch;

import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonView;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.LoggerContext;
import org.apache.logging.log4j.core.config.Configuration;
import org.apache.logging.log4j.core.config.LoggerConfig;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

/**
 * This class implements a naive phrase searcher over a specified Lucene index.  It can also dump the contents of a
 * Lucene index (documents or terms for a given field) for diagnostic purposes.
 *
 * <p>Exactly one mode runs per invocation, chosen in this order of precedence:
 * <ol>
 *   <li>{@code --enumerate}: log every stored document in the index;</li>
 *   <li>{@code --dump FIELD}: log every indexed term of {@code FIELD};</li>
 *   <li>{@code --field FIELD} with {@code --query Q} or {@code --list-file TSV}: run phrase queries against
 *       {@code FIELD} and optionally write the hits as JSON to {@code --output}.</li>
 * </ol>
 */
public class DocumentSearch {
    public static final Logger LOGGER = LogManager.getLogger(DocumentSearch.class);

    /** Program name shown in the usage banner. */
    private static final String PROGRAM_NAME = DocumentSearch.class.getSimpleName();

    /** Maximum number of hits requested from the searcher for each query. */
    private static final int MAX_HITS_PER_QUERY = 100;

    /**
     * Terms added to every query as optional (SHOULD) clauses next to the mandatory phrase.  They do not restrict
     * the result set; documents that contain them merely score higher.
     */
    private static final String[] OPTIONAL_CONTEXT_TERMS = {
            "yeast", "ferment", "fermentation", "fermentive", "saccharomyces",
    };

    /**
     * Command-line entry point: parses the options, validates them, opens the index and runs the selected mode.
     *
     * @param args command-line arguments; see {@link #buildOptions()} for the accepted options
     * @throws Exception if the index, the query list file or the output file cannot be read/written, or if a
     *                   column-index option is not a non-negative integer
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting up...");
        System.out.flush();

        Options opts = buildOptions();
        HelpFormatter helpFormatter = new HelpFormatter();
        CommandLineParser cmdLineParser = new DefaultParser();
        CommandLine cmdLine = null;
        try {
            cmdLine = cmdLineParser.parse(opts, args);
        } catch (ParseException e) {
            System.out.println("Caught exception when parsing command line: " + e.getMessage());
            helpFormatter.printHelp(PROGRAM_NAME, opts);
            System.exit(1);
        }

        if (cmdLine.hasOption("help")) {
            helpFormatter.printHelp(PROGRAM_NAME, opts);
            System.exit(0);
        }

        boolean enumerate = cmdLine.hasOption("enumerate");
        boolean dump = cmdLine.hasOption("dump");
        boolean canSearch = cmdLine.hasOption("field")
                && (cmdLine.hasOption("query") || cmdLine.hasOption("list-file"));
        if (!(enumerate || dump || canSearch)) {
            System.out.println("Must specify one of 'enumerate', 'dump', or 'field' + {'query', 'list-file'}");
            helpFormatter.printHelp(PROGRAM_NAME, opts);
            System.exit(1);
        }

        if (cmdLine.hasOption("verbose")) {
            enableVerboseLogging();
        }

        /* Validate the list-file options before the index is opened: System.exit() does not run the
         * try-with-resources cleanup, so exiting later would leave the index directory/reader unclosed.
         * A single --query takes precedence over --list-file, in which case the column options are not needed. */
        boolean useListFile = !enumerate && !dump && !cmdLine.hasOption("query");
        int inchiColumn = -1;
        int synonymColumn = -1;
        if (useListFile) {
            if (!(cmdLine.hasOption("inchi-field") && cmdLine.hasOption("synonym-field"))) {
                LOGGER.error("Must specify both inchi-field and synonym-field when using list-file.");
                System.exit(1);
            }
            inchiColumn = parseColumnIndex(cmdLine, "inchi-field");
            synonymColumn = parseColumnIndex(cmdLine, "synonym-field");
        }

        LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));

        try (Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());
                IndexReader indexReader = DirectoryReader.open(indexDir)) {
            if (enumerate) {
                enumerateDocuments(indexReader);
            } else if (dump) {
                dumpTerms(indexReader, cmdLine.getOptionValue("dump"));
            } else {
                List<Pair<String, String>> queries;
                if (useListFile) {
                    queries = loadQueries(new File(cmdLine.getOptionValue("list-file")), inchiColumn,
                            synonymColumn);
                } else {
                    queries = Collections.singletonList(Pair.of("", cmdLine.getOptionValue("query")));
                }

                if (queries.isEmpty()) {
                    LOGGER.error("Found no queries to run.");
                    return;
                }

                List<SearchResult> searchResults =
                        runQueries(indexReader, cmdLine.getOptionValue("field"), queries);

                if (cmdLine.hasOption("output")) {
                    writeResults(new File(cmdLine.getOptionValue("output")), searchResults);
                }
            }
        }
    }

    /** Builds the set of command-line options understood by {@link #main(String[])}. */
    private static Options buildOptions() {
        Options opts = new Options();
        opts.addOption(Option.builder("x").longOpt("index").hasArg().required().desc("Path to index file to read")
                .build());
        opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
        opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

        opts.addOption(Option.builder("f").longOpt("field").hasArg().desc("The indexed field to search").build());
        opts.addOption(
                Option.builder("q").longOpt("query").hasArg().desc("The query to use when searching").build());
        opts.addOption(Option.builder("l").longOpt("list-file").hasArg()
                .desc("A file containing a list of queries to run in sequence").build());
        opts.addOption(
                Option.builder("e").longOpt("enumerate").desc("Enumerate the documents in the index").build());
        opts.addOption(Option.builder("d").longOpt("dump").hasArg()
                .desc("Dump terms in the document index for a specified field").build());
        opts.addOption(
                Option.builder("o").longOpt("output").hasArg().desc("Write results JSON to this file.").build());
        opts.addOption(Option.builder("n").longOpt("inchi-field").hasArg()
                .desc("The index of the InChI field if an input TSV is specified.").build());
        opts.addOption(Option.builder("s").longOpt("synonym-field").hasArg()
                .desc("The index of the chemical synonym field if an input TSV is specified.").build());
        return opts;
    }

    /** Raises the root logger to DEBUG for the remainder of the run. */
    private static void enableVerboseLogging() {
        // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
        LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
        Configuration ctxConfig = ctx.getConfiguration();
        LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
        logConfig.setLevel(Level.DEBUG);

        ctx.updateLoggers();
        LOGGER.debug("Verbose logging enabled");
    }

    /**
     * Reads a zero-based TSV column index from the named command-line option.
     *
     * @throws IllegalArgumentException if the option value is not a non-negative integer
     */
    private static int parseColumnIndex(CommandLine cmdLine, String optionName) {
        String rawValue = cmdLine.getOptionValue(optionName);
        int column;
        try {
            column = Integer.parseInt(rawValue);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "Option '" + optionName + "' must be an integer column index but was '" + rawValue + "'", e);
        }
        if (column < 0) {
            throw new IllegalArgumentException(
                    "Option '" + optionName + "' must be a non-negative column index but was " + column);
        }
        return column;
    }

    /**
     * Logs every document slot of the index, from 0 up to {@code maxDoc()}.
     * With help from
     * http://stackoverflow.com/questions/2311845/is-it-possible-to-iterate-through-documents-stored-in-lucene-index
     */
    private static void enumerateDocuments(IndexReader indexReader) throws IOException {
        // NOTE(review): maxDoc() may also count deleted documents; no live-docs filtering is applied -- confirm
        // whether that matters for the indexes this tool is pointed at.
        for (int docId = 0; docId < indexReader.maxDoc(); docId++) {
            Document doc = indexReader.document(docId);
            LOGGER.info("Doc " + docId + ":");
            LOGGER.info(doc);
        }
    }

    /**
     * Logs statistics and every indexed term for {@code field}.
     * With help from http://stackoverflow.com/questions/11148036/find-list-of-terms-indexed-by-lucene
     */
    private static void dumpTerms(IndexReader indexReader, String field) throws IOException {
        Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(field);
        if (terms == null) {
            // terms() returns null when the field does not exist in the index; report it instead of hitting an NPE.
            LOGGER.error("No indexed terms found for field '" + field + "'; does the field exist in this index?");
            return;
        }
        LOGGER.info("Has positions: " + terms.hasPositions());
        LOGGER.info("Has offsets:   " + terms.hasOffsets());
        LOGGER.info("Has freqs:     " + terms.hasFreqs());
        LOGGER.info("Stats:         " + terms.getStats());
        LOGGER.info(terms);
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            LOGGER.info("  " + term.utf8ToString());
        }
    }

    /**
     * Reads (InChI, synonym) pairs from a tab-separated file.
     *
     * <p>Blank lines are ignored.  Lines with too few columns are skipped with a warning.  Each line is split
     * before any trimming so that an empty leading column cannot shift the column indices; the two selected
     * fields are then trimmed individually, and double quotes are removed from the InChI.
     *
     * @param listFile      the TSV file to read, decoded as UTF-8
     * @param inchiColumn   zero-based index of the InChI column
     * @param synonymColumn zero-based index of the synonym column
     * @return the pairs in file order; never {@code null}
     */
    private static List<Pair<String, String>> loadQueries(File listFile, int inchiColumn, int synonymColumn)
            throws IOException {
        // TODO: use a proper TSV reader; plain tab splitting does not understand quoting or escaping.
        List<Pair<String, String>> queries = new ArrayList<>();
        int requiredColumns = Math.max(inchiColumn, synonymColumn) + 1;
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(listFile), StandardCharsets.UTF_8))) {
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.trim().isEmpty()) {
                    continue;
                }
                // Limit -1 keeps trailing empty columns so indices stay aligned with the file's layout.
                String[] fields = line.split("\t", -1);
                if (fields.length < requiredColumns) {
                    LOGGER.warn("Skipping line " + lineNumber + " of " + listFile + ": expected at least "
                            + requiredColumns + " tab-separated fields but found " + fields.length);
                    continue;
                }
                queries.add(Pair.of(fields[inchiColumn].trim().replace("\"", ""), fields[synonymColumn].trim()));
            }
        }
        return queries;
    }

    /**
     * Builds a phrase query over {@code field} from the whitespace-separated tokens of {@code rawQueryString}.
     *
     * <p>The Lucene query parser interprets the kind of structural annotations we see in chemical entities as
     * query directives, which is not what we want at all.  Phrase queries seem to work adequately with the
     * analyzer we're currently using.
     */
    private static PhraseQuery buildPhraseQuery(String field, String rawQueryString) {
        // Locale.ROOT keeps lower-casing independent of the JVM's default locale (e.g. Turkish dotless i).
        String normalized = rawQueryString.trim().toLowerCase(Locale.ROOT);
        PhraseQuery phrase = new PhraseQuery();
        for (String token : normalized.split("\\s+")) {
            phrase.add(new Term(field, token));
        }
        return phrase;
    }

    /**
     * Runs one query per (InChI, synonym) pair and collects up to {@link #MAX_HITS_PER_QUERY} hits for each.
     *
     * @param indexReader the open index to search
     * @param field       the indexed field the queries are run against
     * @param queries     (InChI, synonym) pairs; the synonym is the text that is searched for
     * @return one {@link SearchResult} per input pair, in input order
     */
    private static List<SearchResult> runQueries(IndexReader indexReader, String field,
            List<Pair<String, String>> queries) throws IOException {
        IndexSearcher searcher = new IndexSearcher(indexReader);
        // TODO: reduce memory usage when not writing results to an output file.
        List<SearchResult> searchResults = new ArrayList<>(queries.size());
        for (Pair<String, String> queryPair : queries) {
            String inchi = queryPair.getLeft();
            String rawQueryString = queryPair.getRight();

            PhraseQuery phrase = buildPhraseQuery(field, rawQueryString);
            LOGGER.info("Running query: " + phrase.toString());

            // The phrase must match; the context terms are optional and only influence the score.
            BooleanQuery fullQuery = new BooleanQuery();
            fullQuery.add(phrase, BooleanClause.Occur.MUST);
            for (String contextTerm : OPTIONAL_CONTEXT_TERMS) {
                fullQuery.add(new TermQuery(new Term(field, contextTerm)), BooleanClause.Occur.SHOULD);
            }
            LOGGER.info("  Full query: " + fullQuery.toString());

            ScoreDoc[] hits = searcher.search(fullQuery, MAX_HITS_PER_QUERY).scoreDocs;
            if (hits.length == 0) {
                LOGGER.info("Search returned no results.");
            }
            List<ResultDocument> results = new ArrayList<>(hits.length);
            for (int rank = 0; rank < hits.length; rank++) {
                ScoreDoc hit = hits[rank];
                Document doc = indexReader.document(hit.doc);
                String docId = doc.get("id");
                String title = doc.get("title");
                LOGGER.info("Doc " + rank + ": " + hit.doc + ", score " + hit.score + ": " + docId + ", " + title);
                results.add(new ResultDocument(hit.doc, hit.score, title, docId, null));
            }
            LOGGER.info("----- Done with query " + phrase.toString());
            searchResults.add(new SearchResult(inchi, rawQueryString, fullQuery, results));
        }
        return searchResults;
    }

    /**
     * Serializes the search results as indented JSON into {@code outputFile}.  Writing through the mapper encodes
     * the file as UTF-8 regardless of the platform default charset and avoids building the whole document as a
     * single in-memory string first.
     */
    private static void writeResults(File outputFile, List<SearchResult> searchResults) throws IOException {
        ObjectMapper objectMapper = new ObjectMapper();
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
        objectMapper.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
        objectMapper.writeValue(outputFile, searchResults);
    }

    /**
     * The outcome of one query: the InChI and synonym it was derived from, the textual form of the Lucene query
     * that was executed, and the matching documents.
     */
    public static class SearchResult {
        // NOTE(review): only this field carries a @JsonView; the mapper in this file never selects a view, so the
        // annotation has no visible effect here -- confirm whether another consumer depends on it.
        @JsonView(DocumentSearch.class)
        @JsonProperty("inchi")
        protected String inchi;
        @JsonProperty("synonym")
        protected String synonym;
        @JsonProperty("query")
        protected String queryString;
        @JsonProperty("results")
        protected List<ResultDocument> results;

        /** No-argument constructor; leaves all fields unset. */
        protected SearchResult() {
        }

        /**
         * @param inchi   the InChI associated with the query (empty for ad-hoc command-line queries)
         * @param synonym the raw query text that was searched for
         * @param query   the Lucene query that was executed; only its {@code toString()} form is retained
         * @param results the documents the query matched
         */
        public SearchResult(String inchi, String synonym, Query query, List<ResultDocument> results) {
            this.inchi = inchi;
            this.synonym = synonym;
            this.queryString = query.toString();
            this.results = results;
        }

        /** @return the InChI associated with the query */
        public String getInchi() {
            return inchi;
        }

        /** @return the raw query text that was searched for */
        public String getSynonym() {
            return synonym;
        }

        /** @return the textual form of the Lucene query that was executed */
        public String getQueryString() {
            return queryString;
        }

        /** @return the documents the query matched */
        public List<ResultDocument> getResults() {
            return results;
        }
    }

    /**
     * A single search hit: the Lucene-internal document number and score, the stored {@code title} and
     * {@code id} fields of the document, and an optional classifier score that can be filled in later.
     */
    public static class ResultDocument {
        @JsonProperty("index_id")
        protected Integer index;
        @JsonProperty("score")
        protected Float score;
        @JsonProperty("title")
        protected String title;
        @JsonProperty("doc_id")
        protected String docId;
        @JsonProperty("classifier_score")
        protected Double classifierScore;

        /** No-argument constructor; leaves all fields unset. */
        protected ResultDocument() {
        }

        /**
         * @param indexId         the Lucene-internal document number of the hit
         * @param score           the relevance score Lucene assigned to the hit
         * @param title           the stored {@code title} field of the document
         * @param docId           the stored {@code id} field of the document
         * @param classifierScore an optional classifier score; may be {@code null}
         */
        public ResultDocument(Integer indexId, Float score, String title, String docId, Double classifierScore) {
            this.index = indexId;
            this.score = score;
            this.title = title;
            this.docId = docId;
            this.classifierScore = classifierScore;
        }

        /** @return the Lucene-internal document number of the hit */
        public Integer getIndex() {
            return index;
        }

        /** @return the relevance score Lucene assigned to the hit */
        public Float getScore() {
            return score;
        }

        /** @return the stored title of the document */
        public String getTitle() {
            return title;
        }

        /** @return the stored id of the document */
        public String getDocId() {
            return docId;
        }

        /** @return the classifier score, or {@code null} if none has been set */
        public Double getClassifierScore() {
            return classifierScore;
        }

        /** Sets (or clears, with {@code null}) the classifier score of this hit. */
        public void setClassifierScore(Double classifierScore) {
            this.classifierScore = classifierScore;
        }
    }
}