org.dspace.search.LuceneIndex.java Source code

Java tutorial

Introduction

Here is the source code for org.dspace.search.LuceneIndex.java

Source

/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.search;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.lang.reflect.Constructor;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.flexible.standard.parser.TokenMgrError;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortField.Type;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.google.common.base.Strings;

import org.dspace.core.ConfigurationManager;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.Email;
import org.dspace.core.I18nUtil;
import org.dspace.core.LogManager;
import org.dspace.handle.HandleManager;
import org.dspace.sort.SortOption;
import org.dspace.sort.OrderFormat;

import static org.dspace.search.DSIndexer.*;

/**
 * LuceneIndex provides indexing and querying services backed by a
 * Lucene index on local disk. This has been factored out of DSIndexer.
 *
 * NB: there are several ways this service could be generalized.
 * It was implemented to offer compatibility with the old search code,
 * which had a number of limitations. For example, the 'schema'
 * (assignment of indexing attributes to fields, etc) is hard-coded
 * here (as it was in the original DSIndexer) - a more flexible
 * approach would make it file configurable as the SOLR service is.
 * Also, the threading model seems defective: even when configured in
 * valid ways, collisions over IndexWriters seem possible.
 * Flagged for further analysis and refactoring; essentially bypassed for now.
 *
 * @author richardrodgers
 */
public class LuceneIndex implements IndexService {
    private static final Logger log = LoggerFactory.getLogger(LuceneIndex.class);

    // Time (ms) to wait for the Lucene write lock before giving up; applied
    // as the default in the static initializer below so indexing can be interrupted.
    private static final long WRITE_LOCK_TIMEOUT = 30000 /* 30 sec */;

    // Background thread that flushes queued tasks after indexFlushDelay ms;
    // managed by startDelayedIndexFlusher()/flushIndexingTaskQueue(IndexWriter).
    private Thread delayedIndexFlusher = null;
    // Delay (ms) before queued tasks are flushed; <= 0 disables delayed flushing.
    private int indexFlushDelay = ConfigurationManager.getIntProperty("search", "flush.delay", -1);

    // Queue size at which addToIndexingTaskQueue() forces an immediate flush.
    private int batchFlushAfterDocuments = ConfigurationManager.getIntProperty("search", "batch.documents", 20);

    // When true, processIndexingTask() queues tasks instead of writing immediately.
    private boolean batchProcessingMode = false;
    // search field schema - hard-coded here, but could easily be made more configurable
    private static final Map<String, FieldConfig> schema = new HashMap<String, FieldConfig>() {
        {
            put(LAST_INDEXED_FIELD,
                    new FieldConfig(LAST_INDEXED_FIELD, "text", Field.Store.YES, Field.Index.NOT_ANALYZED));
            put(DOCUMENT_STATUS_FIELD,
                    new FieldConfig(DOCUMENT_STATUS_FIELD, "text", Field.Store.YES, Field.Index.NOT_ANALYZED));
            put(DOCUMENT_KEY, new FieldConfig(DOCUMENT_KEY, "text", Field.Store.YES, Field.Index.NOT_ANALYZED));
            put("search.resourcetype",
                    new FieldConfig("search.resourcetype", "text", Field.Store.YES, Field.Index.NOT_ANALYZED));
            put("search.resourceid", new FieldConfig("search.resourceid", "text", Field.Store.YES, Field.Index.NO));
            put("name", new FieldConfig("name", "text", Field.Store.NO, Field.Index.ANALYZED));
            put("default", new FieldConfig("default", "text", Field.Store.NO, Field.Index.ANALYZED));
            put("location", new FieldConfig("location", "text", Field.Store.NO, Field.Index.ANALYZED));
            // following are templates
            put("sort_", new FieldConfig("sort_", "text", Field.Store.NO, Field.Index.NOT_ANALYZED));
        }
    };

    // File-system path of the Lucene index directory (set by init()).
    private String indexDirectory;

    // Maximum tokens indexed per field; -1 means unlimited (set by init()).
    private int maxFieldLength = -1;

    // TODO: Support for analyzers per language, or multiple indices
    /** The analyzer for this DSpace instance; lazily created by getAnalyzer(). */
    private volatile Analyzer analyzer = null;

    // cache a Lucene IndexSearcher for more efficient searches
    private static IndexSearcher searcher = null;

    // Index version observed when the cached searcher was opened; compared
    // against the live index version in getSearcher() to detect updates.
    private static long lastModified;

    /** No-arg constructor; configuration happens in init(). */
    public LuceneIndex() {
    }

    static {
        /*
         * Increase the default write lock so that Indexing can be interrupted.
         */
        IndexWriterConfig.setDefaultWriteLockTimeout(WRITE_LOCK_TIMEOUT);
        // Optionally raise Lucene's boolean-clause cap from configuration;
        // a non-positive value leaves the Lucene default in place.
        int maxClauses = ConfigurationManager.getIntProperty("search", "max-clauses", -1);
        if (maxClauses > 0) {
            BooleanQuery.setMaxClauseCount(maxClauses);
        }
    }

    /**
     * Switch batch processing on or off. Turning it off flushes any queued
     * indexing tasks immediately.
     */
    public void setBatchProcessingMode(boolean mode) {
        batchProcessingMode = mode;
        if (!mode) {
            flushIndexingTaskQueue();
        }
    }

    /**
     * Get the Lucene analyzer to use according to current configuration (or
     * default). TODO: Should have multiple analyzers (and maybe indices?) for
     * multi-lingual DSpaces.
     *
     * @return <code>Analyzer</code> to use
     * @throws IllegalStateException
     *             if the configured analyzer can't be instantiated
     */
    Analyzer getAnalyzer() {
        if (analyzer == null) {
            // We need to find the analyzer class from the configuration
            String analyzerClassName = ConfigurationManager.getProperty("search", "analyzer.default");

            if (analyzerClassName == null) {
                // Use default
                analyzerClassName = "org.dspace.search.DSAnalyzer";
            }

            try {
                // Wildcard types instead of raw Class/Constructor.
                Class<?> analyzerClass = Class.forName(analyzerClassName);
                Constructor<?> constructor = analyzerClass.getDeclaredConstructor(Version.class);
                constructor.setAccessible(true);
                // Build (and optionally wrap) locally, then publish once to
                // the volatile field - avoids exposing the unwrapped analyzer
                // to concurrent readers between the two original assignments.
                Analyzer configured = (Analyzer) constructor.newInstance(Version.LUCENE_36);
                if (maxFieldLength > -1) {
                    // Cap the number of tokens indexed per field when configured.
                    configured = new LimitTokenCountAnalyzer(configured, maxFieldLength);
                }
                analyzer = configured;
            } catch (Exception e) {
                log.error(LogManager.getHeader(null, "no_search_analyzer", "search.analyzer=" + analyzerClassName),
                        e);

                // Preserve the original cause rather than flattening it to a String.
                throw new IllegalStateException(e);
            }
        }

        return analyzer;
    }

    /**
     * Handle a single indexing task. In batch mode, or when a flush delay is
     * configured, the task is queued; otherwise it is executed against the
     * index immediately with a short-lived writer.
     */
    void processIndexingTask(IndexingTask task) throws IOException {
        // Queue when batching is on.
        if (batchProcessingMode) {
            addToIndexingTaskQueue(task);
            return;
        }

        // Queue and arm the delayed flusher when a flush delay is in effect.
        if (indexFlushDelay > 0) {
            addToIndexingTaskQueue(task);
            startDelayedIndexFlusher();
            return;
        }

        // Immediate execution: open, run, and always close the writer.
        IndexWriter indexWriter = null;
        try {
            indexWriter = openIndex(false);
            executeIndexingTask(indexWriter, task);
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    log.error("Unable to close IndexWriter", e);
                }
            }
        }
    }

    // Apply a single queued task to the writer. NOTE(review): the entire body
    // is commented out, so this method is currently a no-op - presumably
    // disabled during the refactor away from the old Term/Document-based
    // IndexingTask API (getTerm()/getDocument()/isDelete() no longer appear
    // elsewhere in this class); confirm before re-enabling.
    private static void executeIndexingTask(IndexWriter writer, IndexingTask action) throws IOException {
        if (action != null) {
            /*
             if (action.isDelete())
             {
            if (action.getDocument() != null)
            {
                writer.updateDocument(action.getTerm(), action.getDocument());
            }
            else
            {
                writer.deleteDocuments(action.getTerm());
            }
             }
             else
             {
            writer.updateDocument(action.getTerm(), action.getDocument());
             }
             */
        }
    }

    // Queued indexing tasks awaiting a batch flush. NOTE(review): the put()
    // below is commented out, so the map never grows and the size-triggered
    // flush can never fire - apparently disabled along with
    // executeIndexingTask(); confirm before relying on batching.
    private Map<String, IndexingTask> queuedTaskMap = new HashMap<String, IndexingTask>();

    // Add a task to the batch queue, forcing a flush once the configured
    // batch size (batchFlushAfterDocuments) is reached.
    synchronized void addToIndexingTaskQueue(IndexingTask action) {
        if (action != null) {
            // queuedTaskMap.put(action.getTerm().text(), action);
            if (queuedTaskMap.size() >= batchFlushAfterDocuments) {
                flushIndexingTaskQueue();
            }
        }
    }

    /**
     * Flush any queued indexing tasks to the index, opening a writer for the
     * duration of the flush and always closing it. I/O errors are logged
     * rather than thrown.
     */
    void flushIndexingTaskQueue() {
        if (queuedTaskMap.isEmpty()) {
            return;
        }

        IndexWriter indexWriter = null;
        try {
            indexWriter = openIndex(false);
            flushIndexingTaskQueue(indexWriter);
        } catch (IOException e) {
            log.error("Error flushing", e);
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException ex) {
                    log.error("Error closing writer", ex);
                }
            }
        }
    }

    /**
     * Drain the task queue into the given writer, then cancel any pending
     * delayed-flush thread since the queue is now empty. Individual task
     * failures are logged and do not abort the drain.
     */
    private synchronized void flushIndexingTaskQueue(IndexWriter writer) {
        for (IndexingTask queued : queuedTaskMap.values()) {
            try {
                executeIndexingTask(writer, queued);
            } catch (IOException e) {
                log.error("Error indexing", e);
            }
        }

        queuedTaskMap.clear();

        // The queue is empty, so the delayed flusher (if any) is redundant;
        // interrupting it cancels its pending sleep.
        if (delayedIndexFlusher != null) {
            delayedIndexFlusher.interrupt();
            delayedIndexFlusher = null;
        }
    }

    ////////////////////////////////////
    //      Private
    ////////////////////////////////////

    /**
     * Checks the last-modified time stamp recorded in the index against the
     * supplied one to determine whether the indexed document is stale.
     *
     * @param documentKey the handle identifying the document in the index
     * @param lastModified the object's current last-modified date
     * @return true if the document is missing from the index or was indexed
     *         before <code>lastModified</code>
     * @throws IOException if the index cannot be read
     */
    @Override
    public boolean isDocumentStale(String documentKey, Date lastModified) throws IOException {

        boolean reindexItem = false;
        boolean inIndex = false;

        IndexReader ir = getSearcher().getIndexReader();
        Term t = new Term("handle", documentKey);
        // A DirectoryReader is a composite reader and can never be cast
        // directly to AtomicReader (the original cast always threw
        // ClassCastException); wrap it to get an atomic view of the index.
        AtomicReader ar = SlowCompositeReaderWrapper.wrap(ir);
        DocsEnum docsE = ar.termDocsEnum(t);

        // termDocsEnum returns null when no document matches the term -
        // the object was never indexed, so report it stale (matches the
        // original !inIndex result) instead of hitting an NPE below.
        if (docsE == null) {
            return true;
        }

        int docId;
        while ((docId = docsE.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
            inIndex = true;
            Document doc = ir.document(docId);

            IndexableField lastIndexed = doc.getField(LAST_INDEXED_FIELD);

            // Stale when the index holds no time stamp or an older one.
            if (lastIndexed == null || Long.parseLong(lastIndexed.stringValue()) < lastModified.getTime()) {
                reindexItem = true;
            }
        }

        return reindexItem || !inIndex;
    }

    /**
     * Open an IndexWriter on the configured index directory. When
     * wipeExisting is true the index is recreated from scratch (CREATE);
     * otherwise existing contents are preserved (CREATE_OR_APPEND).
     */
    private IndexWriter openIndex(boolean wipeExisting) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_36, getAnalyzer());
        writerConfig.setOpenMode(wipeExisting ? IndexWriterConfig.OpenMode.CREATE
                : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        return new IndexWriter(dir, writerConfig);
    }

    /**
     * Execute an indexing task against the index: delete by document key,
     * update (build a Lucene Document from the task's fields and streams),
     * transaction begin/end (toggles batch mode), or purge (wipe the index).
     *
     * @param task the task to perform
     * @throws IOException if the index cannot be written
     */
    @Override
    public void doTask(IndexingTask task) throws IOException {

        switch (task.getAction()) {
        case DELETE:
            // Remove the document identified by its document key.
            commit(task.getFieldValue(DOCUMENT_KEY), null, false);
            break;
        case UPDATE:
            Document doc = new Document();
            // add in fields
            for (String key : task.getFieldKeys()) {
                // get config for field
                FieldConfig fc = schema.get(key);
                if (fc != null) {
                    for (String value : task.getFieldValues(key)) {
                        mapValue(value, fc, doc);
                        // all get mapped to 'default' index
                        mapValue(value, schema.get("default"), doc);
                    }
                } else {
                    log.error("Invalid field map - field: '" + key + "' undefined in schema");
                }
            }
            // likewise any streams
            for (String key : task.getStreamKeys()) {
                for (InputStream is : task.getStreamValues(key)) {
                    // Readers attached here are closed by commit() via closeAllReaders().
                    doc.add(new Field("default", new BufferedReader(new InputStreamReader(is))));
                }
            }
            commit(task.getFieldValue(DOCUMENT_KEY), doc, true);
            break;
        case TX_BEGIN:
            setBatchProcessingMode(true);
            break;
        case TX_END:
            // Leaving batch mode flushes the queued tasks.
            setBatchProcessingMode(false);
            break;
        case PURGE:
            // Recreate an empty index in place.
            openIndex(true).close();
            break;
        default:
            break;
        }
    }

    /**
     * Run a search and return a QueryResults populated with the requested
     * page of hit handles, resource ids and types. Parse errors,
     * number-format problems and over-broad queries are reported via the
     * error message on the returned QueryResults rather than thrown.
     *
     * @param args the query string plus paging/sorting arguments
     * @return the populated results (never null)
     * @throws IOException if the index cannot be read
     */
    @Override
    public QueryResults doQuery(QueryArgs args) throws IOException {
        String querystring = args.getQuery();
        QueryResults qr = new QueryResults();
        List<String> hitHandles = new ArrayList<String>();
        List<Integer> hitIds = new ArrayList<Integer>();
        List<Integer> hitTypes = new ArrayList<Integer>();

        // set up the QueryResults object
        qr.setHitHandles(hitHandles);
        qr.setHitIds(hitIds);
        qr.setHitTypes(hitTypes);
        qr.setStart(args.getStart());
        qr.setPageSize(args.getPageSize());
        qr.setEtAl(args.getEtAl());

        // massage the query string a bit
        querystring = DSQuery.checkEmptyQuery(querystring); // change nulls to an empty string
        querystring = DSQuery.stripHandles(querystring); // remove handles from query string
        querystring = DSQuery.stripAsterisk(querystring); // remove asterisk from beginning of string

        try {
            // grab a searcher, and do the search
            IndexSearcher searcher = getSearcher();
            // FIXME
            QueryParser qp = new QueryParser(Version.LUCENE_36, "default", getAnalyzer());
            log.debug("Final query string: " + querystring);

            String operator = DSQuery.getOperator();
            if (operator == null || operator.equals("OR")) {
                qp.setDefaultOperator(QueryParser.OR_OPERATOR);
            } else {
                qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            }

            Query myquery = qp.parse(querystring);
            //Retrieve enough docs to get all the results we need !
            TopDocs hits = performQuery(args, searcher, myquery, args.getPageSize() * (args.getStart() + 1));

            // set total number of hits
            qr.setHitCount(hits.totalHits);

            // We now have a bunch of hits - snip out a 'window'
            // defined in start, count and return the handles
            // from that window
            // first, are there enough hits?
            if (args.getStart() < hits.totalHits) {
                // get as many as we can, up to the window size
                // how many are available after snipping off at offset 'start'?
                int hitsRemaining = hits.totalHits - args.getStart();

                int hitsToProcess = (hitsRemaining < args.getPageSize()) ? hitsRemaining : args.getPageSize();

                for (int i = args.getStart(); i < (args.getStart() + hitsToProcess); i++) {
                    Document d = searcher.doc(hits.scoreDocs[i].doc);

                    String resourceId = d.get("search.resourceid");
                    String resourceType = d.get("search.resourcetype");

                    String handleText = d.get("handle");
                    String handleType = d.get("type");

                    // Prefer the stored resource type, falling back to the
                    // legacy "type" field for older index entries.
                    switch (Integer.parseInt(resourceType != null ? resourceType : handleType)) {
                    case Constants.ITEM:
                        hitTypes.add(Constants.ITEM);
                        break;

                    case Constants.COLLECTION:
                        hitTypes.add(Constants.COLLECTION);
                        break;

                    case Constants.COMMUNITY:
                        hitTypes.add(Constants.COMMUNITY);
                        break;
                    }

                    hitHandles.add(handleText);
                    hitIds.add(resourceId == null ? null : Integer.parseInt(resourceId));
                }
            }
        } catch (NumberFormatException e) {
            log.warn("Number format exception", e);
            qr.setErrorMsg("number-format-exception");
        } catch (ParseException e) {
            // a parse exception - log and return null results
            log.warn("Invalid search string", e);
            qr.setErrorMsg("invalid-search-string");
        } catch (TokenMgrError tme) {
            // Similar to parse exception
            log.warn("Invalid search string", tme);
            qr.setErrorMsg("invalid-search-string");
        } catch (BooleanQuery.TooManyClauses e) {
            // Pass the exception itself so SLF4J logs it; the previous
            // two-argument form passed e.toString() as a format argument
            // with no placeholder, silently dropping it.
            log.warn("Query too broad", e);
            qr.setErrorMsg("query-too-broad");
        }

        return qr;
    }

    /**
     * Execute the query against the searcher, applying the requested sort
     * option (or a type/relevance sort when none was requested), retrieving
     * at most <code>max</code> hits. Falls back to a pure relevance sort if
     * Lucene rejects the sort field.
     */
    private static TopDocs performQuery(QueryArgs args, IndexSearcher searcher, Query myquery, int max)
            throws IOException {
        TopDocs hits;
        try {
            if (args.getSortOption() == null) {
                SortField[] sortFields = new SortField[] { new SortField("search.resourcetype", Type.INT, true),
                        new SortField(null, Type.SCORE, SortOption.ASCENDING.equals(args.getSortOrder())) };
                hits = searcher.search(myquery, max, new Sort(sortFields));
            } else {
                SortField[] sortFields = new SortField[] { new SortField("search.resourcetype", Type.INT, true),
                        new SortField("sort_" + args.getSortOption().getName(), Type.STRING,
                                SortOption.DESCENDING.equals(args.getSortOrder())),
                        SortField.FIELD_SCORE };
                hits = searcher.search(myquery, max, new Sort(sortFields));
            }
        } catch (Exception e) {
            // Lucene can throw an exception if it is unable to determine a sort time from the specified field
            // Provide a fall back that just works on relevancy.
            // (Message typo fixed; exception now logged instead of swallowed.)
            log.error("Unable to use specified sort option: "
                    + (args.getSortOption() == null ? "type/relevance" : args.getSortOption().getName()), e);
            hits = searcher.search(myquery, max, new Sort(SortField.FIELD_SCORE));
        }
        return hits;
    }

    /**
     * Initialize the service: record the index directory and create an empty
     * index there if none exists yet.
     *
     * @param config the file-system path of the Lucene index directory
     * @throws IllegalStateException if the index cannot be created
     */
    @Override
    public void init(String config) {
        indexDirectory = config;
        File indexDir = new File(indexDirectory);
        Directory dir = null;
        try {
            dir = FSDirectory.open(indexDir);
            if (!DirectoryReader.indexExists(dir)) {
                indexDir.mkdirs();
                openIndex(true).close();
            }
        } catch (IOException e) {
            throw new IllegalStateException("Could not create search index: " + e.getMessage(), e);
        } finally {
            // Close the probe Directory (previously leaked); openIndex()
            // opens its own instance.
            if (dir != null) {
                try {
                    dir.close();
                } catch (IOException e) {
                    log.error("Unable to close index directory", e);
                }
            }
        }
        // set maxfieldlength
        maxFieldLength = ConfigurationManager.getIntProperty("search", "maxfieldlength", -1);
    }

    /**
     * Write a single document change to the index: an update (replace by
     * document key) when update is true, otherwise a delete. Any Readers
     * attached to the document and the writer itself are always released.
     */
    private void commit(String documentKey, Document doc, boolean update) throws IOException {
        Term keyTerm = new Term(DOCUMENT_KEY, documentKey);
        IndexWriter indexWriter = null;
        try {
            indexWriter = openIndex(false);
            if (update) {
                indexWriter.updateDocument(keyTerm, doc);
            } else {
                indexWriter.deleteDocuments(keyTerm);
            }
        } finally {
            // Release bitstream readers attached to the document, then the writer.
            if (doc != null) {
                closeAllReaders(doc);
            }
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    log.error("Unable to close IndexWriter", e);
                }
            }
        }
    }

    /**
     * Map a field value into the Lucene document according to its schema
     * configuration. "timestamp" and "date" typed fields are normalized
     * (second vs day resolution) and also get a companion "&lt;field&gt;.year"
     * field; all other values are added verbatim.
     */
    private void mapValue(String value, FieldConfig fc, Document doc) {
        if ("timestamp".equals(fc.fieldType)) {
            addDateFields(value, fc, doc, DateTools.Resolution.SECOND);
        } else if ("date".equals(fc.fieldType)) {
            addDateFields(value, fc, doc, DateTools.Resolution.DAY);
        } else {
            // all other cases - just add one field with untransformed value
            doc.add(new Field(fc.fieldName, value, fc.store, fc.index));
        }
    }

    /**
     * Parse the value as a date and, if successful, add it at the given
     * resolution plus a year-resolution companion field. Unparseable values
     * are skipped silently, matching the previous duplicated branches.
     */
    private void addDateFields(String value, FieldConfig fc, Document doc, DateTools.Resolution resolution) {
        Date date = toDate(value);
        if (date != null) {
            doc.add(new Field(fc.fieldName, DateTools.dateToString(date, resolution), fc.store, fc.index));
            doc.add(new Field(fc.fieldName + ".year", DateTools.dateToString(date, DateTools.Resolution.YEAR),
                    fc.store, fc.index));
        }
    }

    /**
     * Close any Reader values attached to the document's fields (these wrap
     * bitstream content streams added in doTask) so the underlying streams
     * are released.
     */
    private static void closeAllReaders(Document doc) {
        if (doc != null) {
            int count = 0;
            // Iterate as IndexableField (readerValue() is declared on the
            // interface) instead of the previous raw List + unchecked cast.
            List<IndexableField> fields = doc.getFields();
            if (fields != null) {
                for (IndexableField field : fields) {
                    Reader r = field.readerValue();
                    if (r != null) {
                        try {
                            r.close();
                            count++;
                        } catch (IOException e) {
                            log.error("Unable to close reader", e);
                        }
                    }
                }
            }

            if (count > 0) {
                log.debug("closed " + count + " readers");
            }
        }
    }

    /**
     * Get an IndexSearcher, preferably the cached one (gives much better
     * performance). Intended to detect index updates and refresh the cached
     * searcher, but see the review notes below - the refresh path is
     * currently disabled.
     */
    protected synchronized IndexSearcher getSearcher() throws IOException {

        // If we have already opened a searcher, check to see if the index has been updated
        // If it has, we need to close the existing searcher - we will open a new one later
        // NOTE(review): idxReader is opened on every call and never closed -
        // this looks like a file-handle leak; confirm and close when done.
        Directory searchDir = FSDirectory.open(new File(indexDirectory));
        DirectoryReader idxReader = DirectoryReader.open(searchDir);
        if (searcher != null && lastModified != idxReader.getVersion()) { /*
                                                                            NOTE(review): this refresh logic is
                                                                            commented out, so a stale cached searcher
                                                                            is never discarded - searches may not see
                                                                            index updates made after the first search.
                                                                            Original (disabled) body:

                                                                            try
                                                                            {
                                                                          // Close the cached IndexSearcher
                                                                          // RLR FIXME
                                                                          //searcher.close();
                                                                            }
                                                                            catch (IOException ioe)
                                                                            {
                                                                          // Index is probably corrupt. Log the error, but continue to either:
                                                                          // 1) Return existing searcher (may yet throw exception, no worse than throwing here)
                                                                          log.warn("DSQuery: Unable to check for updated index", ioe);
                                                                            }
                                                                            finally
                                                                            {
                                                                               searcher = null;
                                                                            }
                                                                            */
        }

        // There is no existing searcher - either this is the first execution,
        // or the index has been updated and we closed the old index.
        if (searcher == null) {
            // So, open a new searcher and remember the index version it saw.
            lastModified = idxReader.getVersion();
            String osName = System.getProperty("os.name");
            // RLR TODO - check Read only restriction here
            IndexReader reader = IndexReader.open(searchDir);
            if (osName != null && osName.toLowerCase().contains("windows")) {
                searcher = new IndexSearcher(reader) {
                    /*
                     * TODO: Has Lucene fixed this bug yet?
                     * Lucene doesn't release read locks in
                     * windows properly on finalize. Our hack
                     * extend IndexSearcher to force close().
                     */
                    @Override
                    protected void finalize() throws Throwable {
                        //RLR FIXME
                        //this.close();
                        super.finalize();
                    }
                };
            } else {
                searcher = new IndexSearcher(reader);
            }
        }

        return searcher;
    }

    /**
     * Best-effort conversion of a metadata value to a Date, guessing the
     * likely format(s) from the string's length.
     *
     * @param t the raw date string
     * @return the parsed Date, or null if no candidate format matched
     */
    private static Date toDate(String t) {
        // Candidate joda-time patterns, chosen by input length.
        List<String> patterns = new ArrayList<String>();
        switch (t.length()) {
        case 4:
            patterns.add("yyyy");
            break;
        case 6:
            patterns.add("yyyyMM");
            break;
        case 7:
            patterns.add("yyyy-MM");
            break;
        case 8:
            patterns.add("yyyyMMdd");
            patterns.add("yyyy MMM");
            break;
        case 10:
            patterns.add("yyyy-MM-dd");
            break;
        case 11:
            patterns.add("yyyy MM dd");
            break;
        case 20:
            patterns.add("yyyy-MM-dd'T'HH:mm:ss'Z'");
            break;
        default:
            patterns.add("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
            break;
        }

        // Try each candidate in order until one parses.
        for (String pattern : patterns) {
            try {
                DateTimeFormatter parser = DateTimeFormat.forPattern(pattern);
                return parser.parseDateTime(t).toDate();
            } catch (IllegalArgumentException pe) {
                log.error("Unable to parse date format", pe);
            }
        }
        return null;
    }

    /**
     * Ensure a delayed-flush thread is running whenever tasks are queued.
     * A reference to a thread that has already finished is discarded first;
     * a new thread is started only if none is active and the queue is
     * non-empty.
     */
    private synchronized void startDelayedIndexFlusher() {
        // Drop the reference to a flusher that has already terminated.
        if (delayedIndexFlusher != null && !delayedIndexFlusher.isAlive()) {
            delayedIndexFlusher = null;
        }

        if (delayedIndexFlusher == null && !queuedTaskMap.isEmpty()) {
            delayedIndexFlusher = new Thread(new DelayedIndexFlushThread());
            delayedIndexFlusher.start();
        }
    }

    /**
     * Runnable that waits for the configured flush delay and then flushes
     * the queued indexing tasks. Interrupting the thread cancels the
     * delayed flush (done when an explicit flush has already emptied the
     * queue).
     */
    private class DelayedIndexFlushThread implements Runnable {
        @Override
        public void run() {
            try {
                Thread.sleep(indexFlushDelay);
                flushIndexingTaskQueue();
            } catch (InterruptedException e) {
                // Cancellation is expected here; restore the interrupt flag
                // per convention so any observer of this thread sees it.
                Thread.currentThread().interrupt();
                log.debug("Delayed flush", e);
            }
        }
    }

    /**
     * Immutable description of how a schema field is indexed: its name, a
     * logical type ("text", "date", "timestamp" per mapValue), and the
     * Lucene store/index attributes to apply. Fields are final - instances
     * live in the shared static schema map and must not change.
     */
    private static class FieldConfig {
        final String fieldName;
        final String fieldType;
        final Field.Store store;
        final Field.Index index;

        public FieldConfig(String fieldName, String fieldType, Field.Store store, Field.Index index) {
            this.fieldName = fieldName;
            this.fieldType = fieldType;
            this.store = store;
            this.index = index;
        }
    }
}