org.ala.lucene.Autocompleter.java Source code

Introduction

Here is the source code for org.ala.lucene.Autocompleter.java. The class builds a dedicated Lucene index of edge n-grams from a field of an existing index and answers prefix queries with the most frequent matching source words. A short usage sketch follows the listing.

Source

/* *************************************************************************
 *  Copyright (C) 2010 Atlas of Living Australia
 *  All Rights Reserved.
 * 
 *  The contents of this file are subject to the Mozilla Public
 *  License Version 1.1 (the "License"); you may not use this file
 *  except in compliance with the License. You may obtain a copy of
 *  the License at http://www.mozilla.org/MPL/
 * 
 *  Software distributed under the License is distributed on an "AS
 *  IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 *  implied. See the License for the specific language governing
 *  rights and limitations under the License.
 ***************************************************************************/

package org.ala.lucene;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.ala.dao.SolrUtils;
import org.ala.dao.TaxonConceptDao;
import org.ala.util.SpringUtils;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.springframework.context.ApplicationContext;

/**
 * Search-term auto-completer. It works on single terms, so apply it to the
 * last term of the query.
 * <p>
 * Returns more popular terms first.
 * <p>
 * Note: not currently in use. The code was updated from the original for
 * Lucene 3.4.0, and the imports now target the Lucene 4.x analysis API.
 *
 * @author Mat Mannion, M.Mannion@warwick.ac.uk
 * @author mok011
 * @since 1 Nov 2011
 */
public final class Autocompleter {
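
    // Index layout: each word becomes one Lucene document with three fields:
    //   sourceWord -- the original word, stored and un-analysed (what we return)
    //   words      -- the same word analysed into edge n-grams (what we match on)
    //   count      -- the word's document frequency, used to rank suggestions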

    private static final String GRAMMED_WORDS_FIELD = "words";

    private static final String SOURCE_WORD_FIELD = "sourceWord";

    private static final String COUNT_FIELD = "count";

    private static final String[] ENGLISH_STOP_WORDS = { "an", "and", "are", "as", "at", "be", "but", "by", // "a", 
            "for", "if", "in", "into", "is", //  "i",
            "no", "not", "of", "on", "or", "such", //  "s",
            "that", "the", "their", "then", "there", "these", // "t",
            "they", "this", "to", "was", "will", "with" };

    private final static String INDEX_DIR_NAME = "/data/lucene/autocomplete";

    private Directory autoCompleteDirectory;

    private DirectoryReader autoCompleteReader;

    private IndexSearcher autoCompleteSearcher;

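    /**
     * Opens the autocomplete index under the given directory, creating an
     * empty index there first if none exists.
     */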
    public Autocompleter(String autoCompleteDir) throws IOException {
        File file = new File(autoCompleteDir);
        this.autoCompleteDirectory = FSDirectory.open(file, null);

        // Create a dummy index so that we don't get an exception further down
        if (!file.exists()) {
            FileUtils.forceMkdir(file);
            Analyzer analyzer = new StandardAnalyzer(SolrUtils.BIE_LUCENE_VERSION);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(SolrUtils.BIE_LUCENE_VERSION, analyzer);
            Directory dir = FSDirectory.open(file);
            IndexWriter iw = new IndexWriter(dir, indexWriterConfig);
            iw.commit();
            iw.close();
        }

        reOpenReader();
    }

    public Autocompleter() throws IOException {
        this(INDEX_DIR_NAME);
    }

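    /**
     * Returns up to maxHits stored source words whose edge n-grams match the
     * given term, with the most frequent words first.
     */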
    public List<String> suggestTermsFor(String term, Integer maxHits) throws IOException {
        // get the top maxHits terms for the query, ranked by descending count
        Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, ClientUtils.escapeQueryChars(term)));
        SortField sf = new SortField(COUNT_FIELD, SortField.Type.INT, true);
        Sort sort = new Sort(sf);

        TopDocs docs = autoCompleteSearcher.search(query, null, maxHits, sort);
        List<String> suggestions = new ArrayList<String>();
        for (ScoreDoc doc : docs.scoreDocs) {
            suggestions.add(autoCompleteReader.document(doc.doc).get(SOURCE_WORD_FIELD));
        }

        return suggestions;
    }

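    /**
     * Rebuilds (or appends to) the autocomplete index from one field of an
     * existing source index: every word of three or more characters is indexed
     * along with its edge n-grams and its document frequency.
     */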
    @SuppressWarnings("unchecked")
    public void reIndex(Directory sourceDirectory, String fieldToAutocomplete, boolean createNewIndex)
            throws CorruptIndexException, IOException {
        // build a dictionary (from the spell package)
        IndexReader sourceReader = DirectoryReader.open(sourceDirectory);

        LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);

        // code from
        // org.apache.lucene.search.spell.SpellChecker.indexDictionary(
        // Dictionary)
        IndexWriter.unlock(autoCompleteDirectory);

        // use a custom analyzer so we can do EdgeNGramFiltering
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(SolrUtils.BIE_LUCENE_VERSION, new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                // tokenize, normalise, lower-case and stop-filter the input, then
                // expand each token into its leading n-grams (1 to 20 characters)
                final StandardTokenizer src = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
                TokenStream result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, src);
                result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
                result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result,
                        new CharArraySet(SolrUtils.BIE_LUCENE_VERSION,
                                new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)), true));
                result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
                // the filter chain must wrap the same tokenizer that is handed to
                // TokenStreamComponents, otherwise reset() never reaches the
                // stream that is actually consumed
                return new TokenStreamComponents(src, result);
            }
        });
        if (createNewIndex) {
            indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        }
        indexWriterConfig.setMaxBufferedDocs(150);
        IndexWriter writer = new IndexWriter(autoCompleteDirectory, indexWriterConfig);

        // go through every word, storing the original word and the number of
        // documents it occurs in (the analyzer produces the n-grams at index time)
        Map<String, Integer> wordsMap = new HashMap<String, Integer>();

        Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
        while (iter.hasNext()) {
            String word = iter.next();

            int len = word.length();
            if (len < 3) {
                continue; // too short: skip it ("too long" is fine)
            }

            if (wordsMap.containsKey(word)) {
                // the dictionary iterator returns each word exactly once
                throw new IllegalStateException("duplicate word from dictionary iterator: " + word);
            } else {
                // use the number of documents this word appears in
                wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
            }
        }

        for (String word : wordsMap.keySet()) {
            // ok index the word
            Document doc = new Document();
            doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
            doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED)); // grammed
            doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO,
                    Field.Index.NOT_ANALYZED)); // count

            writer.addDocument(doc);
        }

        sourceReader.close();

        // merge down to a single segment, then close the writer
        writer.forceMerge(1);
        writer.close();

        // re-open our reader
        reOpenReader();
    }

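    /** Opens the reader on first use; afterwards refreshes it only if the index has changed. */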
    private void reOpenReader() throws CorruptIndexException, IOException {
        if (autoCompleteReader == null) {
            autoCompleteReader = DirectoryReader.open(autoCompleteDirectory);
        } else {
            DirectoryReader newReader = DirectoryReader.openIfChanged(autoCompleteReader);
            if (newReader != null)
                autoCompleteReader = newReader;
        }

        autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
    }

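    /** Rebuilds the autocomplete index, then runs a couple of sample lookups. */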
    public static void main(String[] args) throws Exception {
        // run this to re-index from the current index, shouldn't need to do
        // this very often
        Autocompleter autocomplete = new Autocompleter();

        if (true) { // set to false to skip re-indexing and only run the sample lookups
            ApplicationContext context = SpringUtils.getContext();
            TaxonConceptDao tcDao = context.getBean(TaxonConceptDao.class);
            System.out.println("Starting re-indexing...");
            System.out.println("creating scientificName index");
            autocomplete.reIndex(FSDirectory.open(new File(INDEX_DIR_NAME), null), "scientificName", true);
            Thread.sleep(2000);
            System.out.println("creating commonName index");
            autocomplete.reIndex(FSDirectory.open(new File(INDEX_DIR_NAME), null), "commonName", false);
            System.out.println("Finished re-indexing...");
        }

        Thread.sleep(2000);
        String term = "rufus";
        System.out.println("autocompleting: " + term + " = " + autocomplete.suggestTermsFor(term, 5));
        term = "frog";
        System.out.println("autocompleting: " + term + " = " + autocomplete.suggestTermsFor(term, 5));
        // (the printed suggestions depend on the indexed data)
    }

}
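
Usage

The class is self-contained: construct it, call reIndex against an existing Lucene index, then query with suggestTermsFor. The driver below is a minimal sketch, not part of the original file; the /tmp paths and the use of the scientificName field are assumptions (the field name is taken from the main method above), so substitute your own index locations.

import java.io.File;
import java.util.List;

import org.ala.lucene.Autocompleter;
import org.apache.lucene.store.FSDirectory;

public class AutocompleterDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical locations -- point these at your own indexes
        Autocompleter autocomplete = new Autocompleter("/tmp/autocomplete-index");

        // build the n-gram index from the "scientificName" field of a source index
        autocomplete.reIndex(FSDirectory.open(new File("/tmp/source-index")), "scientificName", true);

        // ask for up to five completions of a prefix
        List<String> suggestions = autocomplete.suggestTermsFor("aca", 5);
        System.out.println(suggestions);
    }
}

Since reIndex rewrites the whole autocomplete index, it only needs to run when the source index changes; lookups afterwards touch nothing but the small n-gram index.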