org.niord.core.message.MessageLuceneIndex.java Source code

Introduction

Here is the source code for org.niord.core.message.MessageLuceneIndex.java
Source

/*
 * Copyright 2016 Danish Maritime Authority.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.niord.core.message;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.niord.core.NiordApp;
import org.niord.core.area.Area;
import org.niord.core.area.AreaDesc;
import org.niord.core.category.Category;
import org.niord.core.category.CategoryDesc;
import org.niord.core.service.BaseService;
import org.niord.core.settings.annotation.Setting;
import org.niord.core.util.LuceneUtils;
import org.niord.core.util.TextUtils;
import org.niord.model.message.Status;
import org.slf4j.Logger;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import javax.annotation.Resource;
import javax.ejb.AsyncResult;
import javax.ejb.Asynchronous;
import javax.ejb.Lock;
import javax.ejb.LockType;
import javax.ejb.Schedule;
import javax.ejb.Singleton;
import javax.ejb.Startup;
import javax.ejb.Timeout;
import javax.ejb.TimerConfig;
import javax.ejb.TimerService;
import javax.inject.Inject;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Future;
import java.util.concurrent.locks.ReentrantLock;

import static org.niord.core.settings.Setting.Type.Boolean;

/**
 * A Lucene index used for free-text searching all messages.
 * <p>
 * The index will initially index all messages, and subsequently check every minute
 * for changed messages to add or update in the index.
 * <p>
 * Note to self: Using "Hibernate Search" for messages (as is done for AtoNs) was ruled out because
 * it would be too complex to index all related entities by language.
 */
@Singleton
@Lock(LockType.READ)
@Startup
@SuppressWarnings("unused")
public class MessageLuceneIndex extends BaseService {

    /** Name of the stored document field holding the message id. */
    final static String LUCENE_ID_FIELD = "id";
    /** Prefix of the per-language search field names (see {@code searchField()}). */
    final static String LUCENE_SEARCH_FIELD = "message";
    /** Key of the commit user-data entry holding the last-updated time stamp (epoch millis as string). */
    final static String LUCENE_LAST_UPDATE = "lastUpdate";
    /** Max number of messages indexed per scheduled update run. */
    final static int LUCENE_MAX_INDEX_COUNT = 5000;
    /** Number of indexed messages after which a forced merge of the index is triggered. */
    final static int LUCENE_OPTIMIZE_INDEX_COUNT = 5000;
    /** Max number of segments passed to the forced merge. */
    final static int LUCENE_MAX_NUM_SEGMENTS = 4;

    /** Directory holding the Lucene index files. */
    @Inject
    @Setting(value = "messageIndexPath", defaultValue = "${niord.home}/message-index", description = "The message Lucene index directory")
    Path indexFolder;

    /** If set, the index is emptied when this service is initialized. */
    @Inject
    @Setting(value = "messageIndexDeleteOnStartup", defaultValue = "true", type = Boolean, description = "Whether the message lucene index is re-created for each restart or not")
    Boolean deleteOnStartup;

    /** If set, messages with status DELETED are also added to the index. */
    @Inject
    @Setting(value = "messageIndexIncludeDeletedMessages", defaultValue = "false", type = Boolean, description = "Whether the message lucene index should include deleted messages or not")
    Boolean includeDeletedMessages;

    @Inject
    Logger log;

    /** Used for scheduling the initial, delayed index update. */
    @Resource
    TimerService timerService;

    /** Source of the messages that are added to the index. */
    @Inject
    MessageService messageService;

    /** Provides the list of supported languages. */
    @Inject
    NiordApp app;

    /** Cached, shared index reader; null until first opened or after being closed. */
    DirectoryReader reader;
    /** Number of messages indexed since the last forced merge. */
    int optimizeIndexCount = 0;
    /** Set the first time an update run finds fewer messages than the requested maximum. */
    boolean allMessagesIndexed;
    /** Serializes the scheduled index update and the complete re-indexing. */
    private final ReentrantLock lock = new ReentrantLock();

    /**
     * Prepares the index when the service has been constructed.
     * <p>
     * Ensures that the index directory exists, optionally empties an existing index,
     * and schedules the first index update to run after a short delay.
     * Failures are logged and do not prevent the service from starting.
     */
    @PostConstruct
    private void init() {
        // Make sure the lucene index directory is present
        boolean folderExists = Files.exists(indexFolder);
        if (!folderExists) {
            try {
                Files.createDirectories(indexFolder);
            } catch (IOException ioe) {
                log.error("Error creating index dir " + indexFolder, ioe);
            }
        }

        // Start from an empty index if so configured
        if (deleteOnStartup) {
            try {
                deleteIndex();
            } catch (IOException ioe) {
                log.error("Failed re-creating the index on startup", ioe);
            }
        }

        // Postpone the first indexing run by 5 seconds
        long initialDelayMs = 5000;
        TimerConfig timerConfig = new TimerConfig();
        timerService.createSingleActionTimer(initialDelayMs, timerConfig);
    }

    /**
     * Releases the cached index reader when the service is destroyed.
     */
    @PreDestroy
    private void closeIndex() {
        this.closeReader();
    }

    /**
     * Timer callback that brings the Lucene index up to date. It is triggered by the
     * single-action start-up timer and subsequently once every minute by the schedule.
     * <p>
     * The update is guarded by the internal re-entrant lock rather than by @Lock(WRITE):
     * a container write lock would also block searches while the index is being updated,
     * which is not wanted.
     *
     * @return the number of messages indexed in this run
     */
    @Timeout
    @Schedule(persistent = false, second = "38", minute = "*/1", hour = "*")
    private int updateLuceneIndex() {
        lock.lock();
        try {
            int updateCount = updateLuceneIndex(LUCENE_MAX_INDEX_COUNT);
            return updateCount;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Creates the analyzer used for both indexing and query parsing.
     * <p>
     * A ClassicAnalyzer is used on purpose: with earlier Lucene versions (e.g. 4.6) the
     * StandardAnalyzer handled quoted phrase searches containing stop words (e.g. "the"),
     * but this no longer seems to work properly in the Lucene version in use.
     *
     * @return a new analyzer instance
     */
    private Analyzer getAnalyzer() {
        Analyzer analyzer = new ClassicAnalyzer();
        return analyzer;
    }

    /**
     * Tells whether the initial indexing of all messages has completed.
     *
     * @return true once an update run has found fewer messages than it asked for
     */
    public boolean allMessagesIndexed() {
        return this.allMessagesIndexed;
    }

    /**
     * Returns the name of the language specific search field, i.e. the common
     * search field prefix followed by "_" and the language as resolved by the app.
     *
     * @param language the language
     * @return the name of the language specific search field
     */
    private String searchField(String language) {
        String lang = app.getLanguage(language);
        return LUCENE_SEARCH_FIELD + "_" + lang;
    }

    /**
     * Looks up the messages that have been updated since the given date.
     * <p>
     * As a side effect, the indexing is flagged as complete the first time
     * fewer than {@code maxCount} messages are returned.
     *
     * @param fromDate the date after which to look for changed messages
     * @param maxCount the max number of messages to return
     * @return the updated messages
     */
    private List<Message> findUpdatedMessages(Date fromDate, int maxCount) {
        List<Message> updated = messageService.findUpdatedMessages(fromDate, maxCount);

        // A batch that is not full means that there is nothing more to index for now
        boolean batchNotFull = updated.size() < maxCount;
        if (batchNotFull) {
            allMessagesIndexed = true;
        }
        return updated;
    }

    /**
     * Populates the document with the searchable text of the message.
     * <p>
     * For every supported language, a language specific search field receives the
     * language-neutral values (status, UID, short ID, number, reference IDs, chart numbers,
     * horizontal datum) along with the texts localized to that language (reference, area,
     * category, message, message part and attachment descriptions).
     *
     * @param doc the document to add the message to
     * @param message the message to add
     */
    private void addMessageToDocument(Document doc, Message message) {
        for (String lang : app.getLanguages()) {
            String field = searchField(lang);

            // Status
            addPhraseSearchField(doc, field, message.getStatus());

            // UID
            addPhraseSearchField(doc, field, message.getUid());

            // Message series identifier, e.g. "DK-074-14", and message number
            addPhraseSearchField(doc, field, message.getShortId());
            if (message.getNumber() != null) {
                addPhraseSearchField(doc, field, String.valueOf(message.getNumber()));
            }

            // References
            message.getReferences().forEach(reference -> {
                addPhraseSearchField(doc, field, reference.getMessageId());
                ReferenceDesc refDesc = reference.getDesc(lang);
                if (refDesc != null) {
                    addPhraseSearchField(doc, field, refDesc.getDescription());
                }
            });

            // Areas - include the names of the entire parent lineage
            message.getAreas().forEach(leaf -> {
                Area current = leaf;
                while (current != null) {
                    AreaDesc areaDesc = current.getDesc(lang);
                    if (areaDesc != null) {
                        addPhraseSearchField(doc, field, areaDesc.getName());
                    }
                    current = current.getParent();
                }
            });

            // Categories - include the names of the entire parent lineage
            message.getCategories().forEach(leaf -> {
                Category current = leaf;
                while (current != null) {
                    CategoryDesc catDesc = current.getDesc(lang);
                    if (catDesc != null) {
                        addPhraseSearchField(doc, field, catDesc.getName());
                    }
                    current = current.getParent();
                }
            });

            // Charts
            message.getCharts().forEach(c -> {
                addPhraseSearchField(doc, field, c.getChartNumber());
                addPhraseSearchField(doc, field, c.getInternationalNumber());
            });

            // Horizontal datum
            addPhraseSearchField(doc, field, message.getHorizontalDatum());

            // Localized message fields
            MessageDesc messageDesc = message.getDesc(lang);
            if (messageDesc != null) {
                addPhraseSearchField(doc, field, messageDesc.getTitle());
                addPhraseSearchField(doc, field, messageDesc.getVicinity());
                addPhraseSearchField(doc, field, TextUtils.html2txt(messageDesc.getPublication()));
                addPhraseSearchField(doc, field, messageDesc.getSource());
            }

            // Localized message part descriptions
            message.getParts().forEach(part ->
                    part.getDescs().stream()
                            .filter(partDesc -> lang.equals(partDesc.getLang()))
                            .forEach(partDesc -> {
                                addPhraseSearchField(doc, field, partDesc.getSubject());
                                addPhraseSearchField(doc, field, TextUtils.html2txt(partDesc.getDetails()));
                            }));

            // Attachments
            message.getAttachments().forEach(attachment -> {
                AttachmentDesc attDesc = attachment.getDesc(lang);
                if (attDesc != null) {
                    addPhraseSearchField(doc, field, attDesc.getCaption());
                }
            });

            // TODO
            // Add geometry
            //if (message.getGeometry() != null) {
            //}
        }
    }

    /**
     * Creates and returns a new Lucene index writer for the index folder.
     * <p>
     * The writer appends to an existing index, or creates the index if none exists.
     * The caller is responsible for closing the returned writer.
     *
     * @return a new index writer
     * @throws IOException if the index directory or the writer cannot be opened
     */
    private IndexWriter getNewWriter() throws IOException {

        IndexWriterConfig iwc = new IndexWriterConfig(getAnalyzer());
        // Add new documents to an existing index:
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

        Directory dir = null;
        try {
            dir = FSDirectory.open(indexFolder);
            return new IndexWriter(dir, iwc);
        } catch (IOException ex) {
            log.error("Failed to create message Lucene Index in folder " + indexFolder, ex);
            // The writer never took ownership of the directory, so release it here
            if (dir != null) {
                try {
                    dir.close();
                } catch (IOException closeEx) {
                    ex.addSuppressed(closeEx);
                }
            }
            throw ex;
        }
    }

    /**
     * Returns the cached index reader, or opens one on the index folder if none is defined.
     *
     * @return the shared index reader
     * @throws IOException if the index cannot be opened
     */
    private DirectoryReader getIndexReader() throws IOException {
        if (reader == null) {
            Directory dir = null;
            try {
                dir = FSDirectory.open(indexFolder);
                reader = DirectoryReader.open(dir);
            } catch (IOException ex) {
                // Include the cause, in line with the error handling of getNewWriter()
                log.error("Failed to open Lucene Index in folder " + indexFolder, ex);
                // No reader was created on top of the directory, so release it here
                if (dir != null) {
                    try {
                        dir.close();
                    } catch (IOException closeEx) {
                        ex.addSuppressed(closeEx);
                    }
                }
                throw ex;
            }
        }
        return reader;
    }

    /**
     * Closes the given writer. A failure to close is logged, including its cause,
     * but is not propagated to the caller.
     *
     * @param writer the writer to close; may be null, in which case nothing happens
     */
    private void closeWriter(IndexWriter writer) {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                log.warn("Error closing writer", e);
            }
        }
    }

    /**
     * Closes the cached reader, if any, and clears the cache. A failure to close
     * is logged, including its cause, but is not propagated to the caller.
     */
    private void closeReader() {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                log.warn("Error closing reader", e);
            }
            // Always drop the reference, even if closing failed
            reader = null;
        }
    }

    /**
     * Replaces the cached reader with a new reader opened from the given writer.
     * <p>
     * The new reader is opened and published before the previous reader is closed. Hence,
     * the cached reader is never undefined during a refresh, and if opening the new reader
     * fails, the previous reader is left in place.
     *
     * @param writer the index writer
     * @throws IOException if the new reader cannot be opened
     */
    private void refreshReader(IndexWriter writer) throws IOException {
        DirectoryReader newReader = DirectoryReader.open(writer, true);

        DirectoryReader oldReader = reader;
        reader = newReader;

        // Release the previous reader; a failure to do so must not fail the refresh
        if (oldReader != null) {
            try {
                oldReader.close();
            } catch (IOException e) {
                log.warn("Error closing reader", e);
            }
        }
    }

    /**
     * Asynchronous variant of {@code recreateIndex()}: re-indexes the message index completely.
     *
     * @return a future holding the number of indexed messages
     * @throws IOException if the old index cannot be deleted
     */
    @Asynchronous
    public Future<Integer> recreateIndexAsync() throws IOException {
        return new AsyncResult<>(recreateIndex());
    }

    /**
     * Call this to re-index the message index completely.
     * <p>
     * The operation holds the internal update lock, so it does not run concurrently
     * with the scheduled index update.
     *
     * @return the number of indexed messages
     * @throws IOException if the old index cannot be deleted
     */
    public int recreateIndex() throws IOException {
        lock.lock();
        try {
            // The index is about to be emptied, so it no longer covers all messages.
            // The flag is set again by the update below once all messages have been fetched.
            allMessagesIndexed = false;

            // delete the old index
            deleteIndex();

            // Update all messages
            return updateLuceneIndex(Integer.MAX_VALUE);

        } finally {
            lock.unlock();
        }
    }

    /**
     * Deletes all documents of the current index and resets the commit data,
     * including the last-updated time stamp.
     * <p>
     * The cached reader is discarded as well. It is a snapshot of the index from before
     * the deletion, so it would otherwise keep reporting the old last-updated time stamp,
     * and a subsequent re-indexing would skip all messages older than that time stamp.
     *
     * @throws IOException if the index cannot be opened or the deletion cannot be committed
     */
    private void deleteIndex() throws IOException {
        // Delete the index
        IndexWriter writer = null;
        try {
            writer = getNewWriter();
            writer.deleteAll();
            writer.setCommitData(new HashMap<>());
            writer.commit();

            // Force the next access to open a reader on the emptied index
            closeReader();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Returns the last updated time, as recorded in the commit data of the index.
     *
     * @return the last updated time, or the epoch (time 0) if no time stamp is recorded
     *         or the index cannot be read
     */
    private Date getLastUpdated() {
        try {
            Map<String, String> userData = getIndexReader().getIndexCommit().getUserData();
            String lastUpdate = userData.get(LUCENE_LAST_UPDATE);
            if (lastUpdate != null) {
                return new Date(Long.parseLong(lastUpdate));
            }
        } catch (Exception e) {
            // Deliberately best-effort: fall back to indexing from the beginning of time
            log.debug("Could not get last-updated flag from index reader", e);
        }
        return new Date(0);
    }

    /**
     * Records the last updated time as commit data on the writer. The value is
     * stored as epoch milliseconds and takes effect with the next commit of the writer.
     *
     * @param date the last updated time
     * @param writer the index writer to set the commit data on
     */
    private void setLastUpdated(Date date, IndexWriter writer) {
        String timestamp = String.valueOf(date.getTime());
        Map<String, String> commitData = new HashMap<>();
        commitData.put(LUCENE_LAST_UPDATE, timestamp);
        writer.setCommitData(commitData);
    }

    /**
     * Indexes the messages that have changed since the last-updated time stamp of the index.
     * <p>
     * The changed messages are (re-)indexed, the newest update time is recorded as the new
     * last-updated time stamp, the changes are committed and the shared reader is refreshed.
     * When enough messages have been indexed since the last merge, the index segments are merged.
     * Any error is logged and results in a return value of 0.
     *
     * @param maxIndexCount max number of messages to index at a time
     * @return the number of updates
     */
    private int updateLuceneIndex(int maxIndexCount) {

        Date lastUpdated = getLastUpdated();

        long startTime = System.currentTimeMillis();
        log.debug(String.format("Indexing at most %d changed messages since %s", maxIndexCount, lastUpdated));

        IndexWriter writer = null;
        try {
            // Fetch the batch of messages changed since the last-updated time stamp
            List<Message> batch = findUpdatedMessages(lastUpdated, maxIndexCount);
            if (batch.isEmpty()) {
                return 0;
            }

            writer = getNewWriter();

            // Index the batch and track the newest update time seen
            Date newest = lastUpdated;
            for (Message message : batch) {
                indexMessage(writer, message);
                Date updated = message.getUpdated();
                if (updated.after(newest)) {
                    newest = updated;
                }
            }

            // Persist the new last-updated time stamp along with the changes
            setLastUpdated(newest, writer);
            writer.commit();

            // Let searches see the changes
            refreshReader(writer);

            // Merge segments once enough messages have been indexed
            int batchSize = batch.size();
            optimizeIndexCount += batchSize;
            if (optimizeIndexCount > LUCENE_OPTIMIZE_INDEX_COUNT) {
                writer.forceMerge(LUCENE_MAX_NUM_SEGMENTS);
                optimizeIndexCount = 0;
            }

            long elapsed = System.currentTimeMillis() - startTime;
            log.info("Indexed " + batchSize + " messages in " + elapsed + " ms");

            return batchSize;
        } catch (Exception ex) {
            log.error("Error updating Lucene index: " + ex.getMessage(), ex);
            return 0;
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * (Re-)indexes the given message: any existing document for the message is removed,
     * and a new document is added if the message is eligible for the index.
     *
     * @param writer the index writer
     * @param message the message to index
     */
    private void indexMessage(IndexWriter writer, Message message) {
        // Remove any previous version of the message
        deleteMessageFromIndex(writer, message);

        // Only eligible messages are added back
        boolean eligible = shouldAddMessage(message);
        if (!eligible) {
            return;
        }
        addMessageToIndex(writer, message);
    }

    /**
     * Decides whether the message belongs in the index. All messages are added,
     * except that deleted messages are only added if so configured.
     *
     * @param message the message to check
     * @return whether to add the message to the index
     */
    private boolean shouldAddMessage(Message message) {
        if (includeDeletedMessages) {
            return true;
        }
        return message.getStatus() != Status.DELETED;
    }

    /**
     * Deletes the document of the given message, identified by the message id, from the index.
     * <p>
     * A failure is logged as an error, including its cause, but is not propagated.
     * It is not merely a debug event, since a message that could not be deleted
     * may end up being represented by more than one document in the index.
     *
     * @param writer the index writer
     * @param message the message to delete
     */
    private void deleteMessageFromIndex(IndexWriter writer, Message message) {
        try {
            Term idTerm = new Term(LUCENE_ID_FIELD, message.getId().toString());
            writer.deleteDocuments(idTerm);
        } catch (IOException e) {
            log.error("Error deleting message " + message.getId(), e);
        }
    }

    /**
     * Builds a document for the given message and adds it to the index.
     * The document holds the stored message id plus the searchable message fields.
     * A failure to add the document is logged, but not propagated.
     *
     * @param writer the index writer
     * @param message the message to add
     */
    private void addMessageToIndex(IndexWriter writer, Message message) {
        Document document = new Document();

        // The id is stored, so that it can be read back from search hits
        String id = message.getId().toString();
        document.add(new StringField(LUCENE_ID_FIELD, id, Field.Store.YES));

        // Searchable content
        addMessageToDocument(document, message);

        try {
            writer.addDocument(document);
        } catch (IOException ioe) {
            log.error("Error adding message " + message.getId() + " to the Lucene index: " + ioe.getMessage(), ioe);
        }
    }

    /**
     * Adds the string form of the given value to the document as a phrase-searchable field.
     * Null values and values with a blank string form are ignored.
     *
     * @param doc the document to add the field value to
     * @param field the name of the field
     * @param obj the value to add
     */
    private void addPhraseSearchField(Document doc, String field, Object obj) {
        if (obj == null) {
            return;
        }
        String text = obj.toString();
        if (StringUtils.isBlank(text)) {
            return;
        }
        doc.add(new PhraseSearchLuceneField(field, text));
    }

    /**
     * Adds the string form of the given value to the document as a StringField.
     * Null values and values with a blank string form are ignored.
     *
     * @param doc the document to add the field value to
     * @param field the name of the field
     * @param obj the value to add
     * @param store the store value of the field
     */
    private void addStringSearchField(Document doc, String field, Object obj, Field.Store store) {
        if (obj == null) {
            return;
        }
        String text = obj.toString();
        if (StringUtils.isBlank(text)) {
            return;
        }
        doc.add(new StringField(field, text, store));
    }

    /**
     * Searches the index and returns the ids of the matching messages.
     * <p>
     * A blank search string matches all documents. Otherwise, the normalized search string
     * is parsed as a query against the search field of the given language, with "or" as the
     * default operator and leading wildcards allowed.
     *
     * @param freeTextSearch the search string
     * @param language the language to search
     * @param maxHits the max number of hits to return
     * @return the matching ids
     * @throws IOException if the index cannot be read
     * @throws ParseException if the search string cannot be parsed
     */
    public List<Long> searchIndex(String freeTextSearch, String language, int maxHits)
            throws IOException, ParseException {

        Query query;
        if (StringUtils.isBlank(freeTextSearch)) {
            query = new MatchAllDocsQuery();
        } else {
            // Normalize the query text the same way as the indexed text
            String normalized = LuceneUtils.normalizeQuery(freeTextSearch);
            freeTextSearch = normalized;

            // Parse with "or" as the default operator
            QueryParser queryParser = new ComplexPhraseQueryParser(searchField(language), getAnalyzer());
            queryParser.setDefaultOperator(QueryParser.OR_OPERATOR);
            queryParser.setAllowLeadingWildcard(true); // NB: Expensive!
            query = queryParser.parse(normalized);
        }

        // Run the search against the shared reader
        IndexSearcher indexSearcher = new IndexSearcher(getIndexReader());
        ScoreDoc[] hits = indexSearcher.search(query, maxHits).scoreDocs;

        // Read the stored message id of each hit
        List<Long> messageIds = new ArrayList<>();
        for (int i = 0; i < hits.length; i++) {
            Document hitDoc = indexSearcher.doc(hits[i].doc);
            String id = hitDoc.get(LUCENE_ID_FIELD);
            messageIds.add(Long.valueOf(id));
        }
        return messageIds;
    }

    /**
     * A tokenized Lucene field indexed with positional information,
     * which is what phrase searches (quoted search terms) require.
     *
     * The text value is passed through LuceneUtils.normalize() before being
     * indexed; as per the original note, this replaces accented chars with
     * their non-accented versions.
     */
    private static class PhraseSearchLuceneField extends Field {

        /* Indexed, tokenized, not stored. */
        public static final FieldType TYPE_NOT_STORED = positionalType(false);

        /* Indexed, tokenized, stored. */
        public static final FieldType TYPE_STORED = positionalType(true);

        /**
         * Builds a frozen, tokenized field type that indexes docs, freqs and positions,
         * and stores term vectors with positions.
         *
         * @param stored whether the field value is stored as well
         * @return the frozen field type
         */
        private static FieldType positionalType(boolean stored) {
            FieldType type = new FieldType();
            type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
            type.setTokenized(true);
            if (stored) {
                type.setStored(true);
            }
            type.setStoreTermVectors(true);
            type.setStoreTermVectorPositions(true);
            type.freeze();
            return type;
        }

        /** Creates a new, non-stored phrase search field with the normalized String value. */
        public PhraseSearchLuceneField(String field, String value) {
            super(field, LuceneUtils.normalize(value), TYPE_NOT_STORED);
        }
    }
}