net.riezebos.thoth.content.search.Indexer.java Source code

Java tutorial

Introduction

Here is the source code for net.riezebos.thoth.content.search.Indexer.java

Source

/* Copyright (c) 2016 W.T.J. Riezebos
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.riezebos.thoth.content.search;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import net.riezebos.thoth.beans.BookmarkUsage;
import net.riezebos.thoth.beans.ContentNode;
import net.riezebos.thoth.beans.MarkDownDocument;
import net.riezebos.thoth.configuration.CacheManager;
import net.riezebos.thoth.configuration.Configuration;
import net.riezebos.thoth.content.ContentManager;
import net.riezebos.thoth.exceptions.CachemanagerException;
import net.riezebos.thoth.exceptions.ContentManagerException;
import net.riezebos.thoth.exceptions.ContextNotFoundException;
import net.riezebos.thoth.markdown.critics.CriticProcessingMode;
import net.riezebos.thoth.markdown.filehandle.FileHandle;
import net.riezebos.thoth.markdown.util.DocumentNode;
import net.riezebos.thoth.markdown.util.ProcessorError;
import net.riezebos.thoth.util.ThothUtil;

/**
 * Builds the Lucene index for the library of the context served by a {@link ContentManager}.
 * <p>
 * Besides writing the Lucene index, an index run collects the direct and indirect reverse indexes (which document
 * is used by which other documents) and the processing errors of all documents, and hands them to the
 * {@link CacheManager}.
 * <p>
 * At most one index operation per context name runs at a time; a second call to {@link #index()} for a context
 * that is still being indexed returns immediately.
 */
public class Indexer {
    public static final String INDEX_CONTENTS = "contents";
    public static final String INDEX_TYPE = "type";
    public static final String INDEX_TITLE = "title";
    public static final String INDEX_PATH = "path";
    public static final String INDEX_USED = "used";
    public static final String INDEX_EXTENSION = "ext";
    public static final String INDEX_MODIFIED = "modified";

    public static final String TYPE_DOCUMENT = "document";
    public static final String TYPE_FRAGMENT = "fragment";
    public static final String TYPE_OTHER = "other";

    private static final Logger LOG = LoggerFactory.getLogger(Indexer.class);

    private String indexFolder;
    private FileHandle libraryFolder;
    // Never changed in this class, so index() always opens the writer in CREATE mode
    private boolean recreate = true;
    private ContentManager contentManager;
    // Lower-cased extensions (without the dot) as registered through setIndexExtensions
    private Set<String> extensions = new HashSet<>();
    // Names of the contexts with an index operation in progress. Shared by all Indexer instances and
    // always accessed while synchronized on the set itself.
    private static final Set<String> activeIndexers = new HashSet<>();

    /**
     * Creates an indexer for the context of the given content manager. The index folder, the library root and
     * the index extensions are read from the content manager and its configuration.
     *
     * @param contentManager the content manager whose library will be indexed
     * @throws ContextNotFoundException
     * @throws ContentManagerException
     */
    public Indexer(ContentManager contentManager) throws ContextNotFoundException, ContentManagerException {
        this.contentManager = contentManager;
        indexFolder = contentManager.getIndexFolder();
        libraryFolder = contentManager.getFileHandle(contentManager.getLibraryRoot());
        setIndexExtensions(getConfiguration().getIndexExtensions());
    }

    /**
     * Indexes the entire library folder, caches the reverse indexes and errors that were collected along the way
     * and finally flags the unused fragments in the index.
     * <p>
     * Returns without doing anything (apart from logging a warning) when an index operation for the same context
     * name is already in progress.
     *
     * @throws ContentManagerException when indexing fails; an {@link IOException} is wrapped
     */
    public void index() throws ContentManagerException {

        String contextName = contentManager.getContextName();
        synchronized (activeIndexers) {
            // Set.add returns false when the name was already registered
            if (!activeIndexers.add(contextName)) {
                LOG.warn("Indexer for context {} is already (still?) active. Not starting a new index operation",
                        contextName);
                return;
            }
        }

        try {
            long start = System.currentTimeMillis();
            LOG.info("Indexing {} to directory '{}'...", contextName, indexFolder);

            IndexingContext indexingContext = new IndexingContext();
            IndexWriter writer = getWriter(recreate);
            boolean writerClosed = false;
            try {
                indexDirectory(writer, libraryFolder, indexingContext);

                sortIndexLists(indexingContext.getIndirectReverseIndex());
                sortIndexLists(indexingContext.getDirectReverseIndex());
                cacheResults(indexingContext);

                // Merge the index down to a single segment to maximize search performance. This can be a
                // costly operation; it is done here because no more documents are added by this writer.
                writer.forceMerge(1);

                writer.close();
                writerClosed = true;
            } finally {
                // Without this a failure would leave the writer open and the index directory locked,
                // which makes every later index operation fail on the write lock.
                if (!writerClosed) {
                    rollbackQuietly(writer);
                }
            }

            // Opens its own writer, so this must run after the writer above has been closed
            markUnusedDocuments(indexingContext.getDirectReverseIndex());

            LOG.info("Indexing context {} took {} milliseconds", contextName, System.currentTimeMillis() - start);
        } catch (IOException e) {
            throw new ContentManagerException(e);
        } finally {
            synchronized (activeIndexers) {
                activeIndexers.remove(contextName);
            }
        }
    }

    /**
     * Closes the writer without committing its pending changes. A failure to do so is only logged, so that it
     * cannot mask the exception that made the rollback necessary.
     *
     * @param writer the writer to roll back; may be null
     */
    private void rollbackQuietly(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.rollback();
        } catch (IOException | RuntimeException e) {
            LOG.warn("Could not roll back the index writer for context " + contentManager.getContextName(), e);
        }
    }

    /**
     * Flags every unused fragment reported by the content manager by adding the field {@link #INDEX_USED} with
     * value "false" to the indexed document(s) with the same path.
     * <p>
     * NOTE(review): the replacement document is built from the document returned by the searcher, which carries
     * stored fields only. Fields added with Store.NO in {@link #addToIndex} (contents, extension, meta tags)
     * therefore do not seem to survive this update - confirm whether that is intended.
     *
     * @param directReverseIndex not used by this implementation
     * @throws IOException
     * @throws ContentManagerException
     */
    protected void markUnusedDocuments(Map<String, List<String>> directReverseIndex)
            throws IOException, ContentManagerException {

        String folder = contentManager.getIndexFolder();

        try (IndexWriter writer = getWriter(false); IndexReader reader = getIndexReader(folder)) {
            IndexSearcher searcher = getIndexSearcher(reader);
            for (ContentNode node : contentManager.getUnusedFragments()) {
                Term pathTerm = new Term(INDEX_PATH, node.getPath());

                TopDocs results = searcher.search(new TermQuery(pathTerm), 10, Sort.RELEVANCE);

                for (ScoreDoc hit : results.scoreDocs) {
                    Document document = searcher.doc(hit.doc);
                    document.add(new TextField(INDEX_USED, "false", Store.YES));
                    writer.updateDocument(pathTerm, document);
                }
            }
        }
    }

    /**
     * Persists the indexing context and caches both reverse indexes and the (sorted) list of errors.
     *
     * @param indexingContext the results collected during the index run
     * @throws CachemanagerException
     */
    protected void cacheResults(IndexingContext indexingContext) throws CachemanagerException {
        persistCaches(indexingContext);

        CacheManager cacheManager = getCacheManager();
        cacheManager.cacheReverseIndex(true, indexingContext.getIndirectReverseIndex());
        cacheManager.cacheReverseIndex(false, indexingContext.getDirectReverseIndex());
        List<ProcessorError> errors = new ArrayList<>(indexingContext.getErrors());
        Collections.sort(errors);
        cacheManager.cacheErrors(errors);
    }

    /**
     * Hands the indexing context to the cache manager for persisting.
     *
     * @param indexingContext
     * @throws CachemanagerException
     */
    protected void persistCaches(IndexingContext indexingContext) throws CachemanagerException {
        getCacheManager().persistIndexingContext(indexingContext);
    }

    /**
     * @return the cache manager of the content manager
     */
    protected CacheManager getCacheManager() {
        return contentManager.getCacheManager();
    }

    /**
     * @return the configuration of the content manager
     */
    protected Configuration getConfiguration() {
        return contentManager.getConfiguration();
    }

    /**
     * Sorts every list in the given map in place (natural String order).
     *
     * @param map
     */
    protected void sortIndexLists(Map<String, List<String>> map) {
        for (List<String> paths : map.values()) {
            Collections.sort(paths);
        }
    }

    /**
     * Indexes the given file, or recursively all files below it when it is a directory.
     *
     * @param writer
     * @param fileHandle file or directory to index
     * @param context collects the results of the index run
     * @throws IOException
     * @throws ContextNotFoundException
     */
    protected void indexDirectory(IndexWriter writer, FileHandle fileHandle, IndexingContext context)
            throws IOException, ContextNotFoundException {

        if (!fileHandle.isDirectory()) {
            indexFile(writer, fileHandle, context);
            return;
        }

        for (FileHandle child : fileHandle.listFiles()) {
            if (child.isFile()) {
                indexFile(writer, child, context);
            } else {
                indexDirectory(writer, child, context);
            }
        }
    }

    /**
     * Indexes a single file, unless {@link #ignore(String)} rejects its path.
     * <p>
     * The file is loaded as a markdown document; its errors are collected, its external bookmark references are
     * validated, the referenced resources that are not documents themselves are indexed as {@link #TYPE_OTHER},
     * and both reverse indexes are updated. Any exception raised while doing so is logged and not rethrown, so a
     * single failing file does not abort the index run.
     *
     * @param writer
     * @param fileHandle the file to index
     * @param indexingContext collects the results of the index run
     * @throws IOException
     * @throws ContextNotFoundException
     */
    protected void indexFile(IndexWriter writer, FileHandle fileHandle, IndexingContext indexingContext)
            throws IOException, ContextNotFoundException {

        String resourcePath = fileHandle.getAbsolutePath();
        if (ignore(resourcePath)) {
            return;
        }

        try {
            MarkDownDocument markDownDocument = contentManager.getMarkDownDocument(resourcePath, true,
                    CriticProcessingMode.DO_NOTHING);

            indexingContext.getErrors().addAll(markDownDocument.getErrors());

            // Validate any bookmarks that reference external files
            validateExternalBookmarks(fileHandle.getParentFile(), markDownDocument.getExternalBookmarkUsages(),
                    indexingContext.getErrors());

            // Also index non-documents if referenced and stored locally. Every such resource is indexed once
            // per index run; the indexing context remembers the ones that were already handled.
            for (DocumentNode node : markDownDocument.getDocumentStructure().flatten(true)) {
                String path = node.getPath();
                if (ignore(path) && !indexingContext.getReferencedLocalResources().contains(path)) {
                    indexingContext.getReferencedLocalResources().add(path);
                    String body = toSearchableText(node.getDescription());
                    addToIndex(writer, path, TYPE_OTHER, node.getFileName(), body, new HashMap<>());
                }
            }

            updateReverseIndex(indexingContext.getIndirectReverseIndex(), true, markDownDocument);
            updateReverseIndex(indexingContext.getDirectReverseIndex(), false, markDownDocument);

            boolean isBook = getConfiguration().isBook(resourcePath);
            boolean isFragment = getConfiguration().isFragment(resourcePath);

            String resourceType;
            if (isBook) {
                resourceType = TYPE_DOCUMENT;
            } else if (isFragment) {
                resourceType = TYPE_FRAGMENT;
            } else {
                resourceType = TYPE_OTHER;
            }

            // To avoid finding fragments that do not really contain the search criteria, do not index included
            // content. The only exception is the book. That one we fully index
            String body;
            if (isBook) {
                body = markDownDocument.getMarkdown();
            } else {
                body = ThothUtil.readInputStream(fileHandle.getInputStream());
            }
            addToIndex(writer, resourcePath, resourceType, markDownDocument.getTitle(), body,
                    markDownDocument.getMetatags());
        } catch (Exception e) {
            // Deliberately best effort: log the problem and continue with the next file
            LOG.error(e.getMessage(), e);
        }
    }

    /**
     * Returns the trimmed description, followed by a copy in which every non-word character is replaced by a
     * space (runs of spaces collapsed to one) when that copy differs from the trimmed description.
     *
     * @param description the description of a referenced resource
     * @return the text to index for the resource
     */
    private static String toSearchableText(String description) {
        String body = description.trim();
        // Collapse to a single space; removing the spaces altogether would glue separate words together
        String tokenized = body.replaceAll("\\W", " ").replaceAll(" {2,}", " ");
        return body.equals(tokenized) ? body : body + " " + tokenized;
    }

    /**
     * Checks that every bookmark usage that points into another file refers to a bookmark that is defined by
     * that file. An error is added for every bookmark that is not defined; a bookmark id matches with or without
     * its numeric prefix.
     *
     * @param documentFolder the folder of the referencing document; references are resolved against it
     * @param bookmarkUsages the external bookmark references of the document
     * @param errors receives an error for every undefined bookmark
     * @throws IOException
     * @throws ContextNotFoundException
     */
    protected void validateExternalBookmarks(FileHandle documentFolder, List<BookmarkUsage> bookmarkUsages,
            Set<ProcessorError> errors) throws IOException, ContextNotFoundException {
        for (BookmarkUsage bookmarkUsage : bookmarkUsages) {
            String rootFolder = documentFolder.getAbsolutePath();
            String externalRefSpec = ThothUtil.suffix(rootFolder, "/")
                    + ThothUtil.stripPrefix(bookmarkUsage.getBookmark(), "/");
            String externalFile = ThothUtil.getPartBeforeFirst(externalRefSpec, "#");
            String externalBookmark = ThothUtil.getPartAfterFirst(externalRefSpec, "#");

            MarkDownDocument referencedDocument = contentManager.getMarkDownDocument(externalFile, true,
                    CriticProcessingMode.DO_NOTHING);
            Set<String> validBookmarks = new HashSet<>();
            referencedDocument.getBookmarks().stream().forEach(bm -> {
                validBookmarks.add(bm.getId());
                validBookmarks.add(ThothUtil.stripNumericPrefix(bm.getId()));
            });

            if (!validBookmarks.contains(externalBookmark)) {
                errors.add(new ProcessorError(bookmarkUsage.getCurrentLineInfo(),
                        "Bookmark #" + externalBookmark + " is not defined in file " + externalFile));
            }
        }
    }

    /**
     * Adds a resource to the index. Path, type and title are stored; contents, extension, the "used" flag
     * (always "true" here) and the meta tags are indexed but not stored. Meta tag names are lower-cased.
     * <p>
     * When the writer was opened in CREATE mode the document is simply added, otherwise it replaces any document
     * that was indexed with the same path.
     *
     * @param writer
     * @param resourcePath path of the resource; also the key of the document in the index
     * @param resourceType one of the TYPE_ constants
     * @param title title of the resource; null is indexed as an empty string
     * @param contents text to index; null is indexed as an empty string
     * @param metaTags additional fields to index
     * @throws IOException
     */
    protected void addToIndex(IndexWriter writer, String resourcePath, String resourceType, String title,
            String contents, Map<String, String> metaTags) throws IOException {
        String extension = ThothUtil.getExtension(resourcePath);
        extension = extension == null ? "" : extension.toLowerCase();

        // Lucene rejects null field values. Since indexFile swallows exceptions that would silently drop
        // the entire resource from the index, so fall back to an empty value instead.
        String safeTitle = title == null ? "" : title;
        String safeContents = contents == null ? "" : contents;

        Document document = new Document();
        document.add(new StringField(INDEX_PATH, resourcePath, Field.Store.YES));
        document.add(new TextField(INDEX_TYPE, resourceType, Store.YES));
        document.add(new TextField(INDEX_TITLE, safeTitle, Store.YES));
        document.add(new TextField(INDEX_CONTENTS, safeContents, Store.NO));
        document.add(new TextField(INDEX_USED, "true", Store.NO));
        document.add(new TextField(INDEX_EXTENSION, extension, Store.NO));

        for (Map.Entry<String, String> metaTag : metaTags.entrySet()) {
            document.add(
                    new TextField(metaTag.getKey().toLowerCase(), String.valueOf(metaTag.getValue()), Store.NO));
        }

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            LOG.debug("Indexer for context {} added {}", contentManager.getContextName(), resourcePath);
            writer.addDocument(document);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            LOG.debug("Indexer for context {} updated {}", contentManager.getContextName(), resourcePath);
            writer.updateDocument(new Term(INDEX_PATH, resourcePath), document);
        }
    }

    /**
     * Registers the usages found in the structure of the given document in the given reverse index.
     *
     * @param index the reverse index to update: path of a node -> paths of the nodes that use it
     * @param indirect true to register all ancestors of a node, false to register its direct parent only
     * @param markDownDocument
     */
    protected void updateReverseIndex(Map<String, List<String>> index, boolean indirect,
            MarkDownDocument markDownDocument) {
        DocumentNode root = markDownDocument.getDocumentStructure();
        visit(root, index, indirect, new Stack<DocumentNode>());
    }

    /**
     * Returns the usage list that is registered for the path of the given node; an empty list is registered
     * first when there is none yet.
     *
     * @param node
     * @param reverseIndex
     * @return the (modifiable) list of paths that use the node
     */
    protected List<String> getUsageList(DocumentNode node, Map<String, List<String>> reverseIndex) {
        return reverseIndex.computeIfAbsent(node.getPath(), path -> new ArrayList<>());
    }

    /**
     * Registers the usages of the given node and then recursively visits its children.
     *
     * @param root the node being visited
     * @param reverseIndex the reverse index to update
     * @param indirect true to register every node on the stack as a user of the node, false to register the top
     *            of the stack (the direct parent) only
     * @param stack the ancestors of the node, the direct parent on top
     */
    protected void visit(DocumentNode root, Map<String, List<String>> reverseIndex, boolean indirect,
            Stack<DocumentNode> stack) {

        // Only look up the usage list when there is something to register, so that nodes without a parent
        // do not get an (empty) entry in the reverse index
        if (!stack.isEmpty()) {
            if (indirect) {
                for (DocumentNode ancestor : stack) {
                    addPath(getUsageList(root, reverseIndex), ancestor.getPath());
                }
            } else {
                addPath(getUsageList(root, reverseIndex), stack.peek().getPath());
            }
        }

        stack.push(root);
        for (DocumentNode child : root.getChildren()) {
            visit(child, reverseIndex, indirect, stack);
        }
        stack.pop();
    }

    /**
     * Adds the path to the list unless the list already contains it.
     *
     * @param usageList
     * @param path
     */
    protected void addPath(List<String> usageList, String path) {
        if (!usageList.contains(path)) {
            usageList.add(path);
        }
    }

    /**
     * Registers the extensions of the files that are indexed as documents. The extensions are trimmed and
     * lower-cased, and are added to the ones that were registered before.
     *
     * @param extentions comma separated list of extensions
     */
    public void setIndexExtensions(String extentions) {
        for (String ext : extentions.split("\\,")) {
            extensions.add(ext.trim().toLowerCase());
        }
    }

    /**
     * Determines whether the given path should be skipped as a document.
     * <p>
     * A path that starts with a dot is always ignored and a path without any dot is never ignored. Otherwise the
     * text after the last dot of the entire path (lower-cased) is taken as the extension, and the path is
     * ignored when that extension is not one of the registered index extensions. Note that the last dot is
     * searched for in the complete path, not just in the file name.
     *
     * @param pathName
     * @return true when the path should not be indexed as a document
     */
    protected boolean ignore(String pathName) {
        if (pathName.startsWith(".")) {
            return true;
        }

        int lastDot = pathName.lastIndexOf('.');
        if (lastDot == -1) {
            return false;
        }

        String extension = pathName.substring(lastDot + 1).toLowerCase();
        return !extensions.contains(extension);
    }

    /**
     * Get the actual implementation (Lucene) of the IndexSearcher
     *
     * @param reader the reader that the searcher will search
     * @return a new IndexSearcher on top of the reader
     */
    protected IndexSearcher getIndexSearcher(IndexReader reader) {
        return new IndexSearcher(reader);
    }

    /**
     * Get the actual implementation of the DirectoryReader
     *
     * @param indexFolder the folder on the file system that contains the index
     * @return a reader that is opened on the index in the given folder
     * @throws IOException
     */
    protected IndexReader getIndexReader(String indexFolder) throws IOException {
        return DirectoryReader.open(FSDirectory.open(Paths.get(indexFolder)));
    }

    /**
     * Get the actual implementation of the indexWriter. The writer is opened on the index folder of this
     * indexer and uses a StandardAnalyzer.
     *
     * @param wipeIndex true to open the writer in CREATE mode, false for CREATE_OR_APPEND
     * @return a new writer; the caller is responsible for closing it
     * @throws IOException
     */
    protected IndexWriter getWriter(boolean wipeIndex) throws IOException {
        Directory dir = FSDirectory.open(Paths.get(indexFolder));
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(wipeIndex ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
        return new IndexWriter(dir, iwc);
    }
}