net.riezebos.thoth.content.search.Searcher.java Source code

Introduction

Here is the source code for net.riezebos.thoth.content.search.Searcher.java
Source

/* Copyright (c) 2016 W.T.J. Riezebos
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.riezebos.thoth.content.search;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import net.riezebos.thoth.beans.MarkDownDocument;
import net.riezebos.thoth.configuration.Configuration;
import net.riezebos.thoth.content.AccessManager;
import net.riezebos.thoth.content.ContentManager;
import net.riezebos.thoth.exceptions.ContentManagerException;
import net.riezebos.thoth.exceptions.ContextNotFoundException;
import net.riezebos.thoth.exceptions.SearchException;
import net.riezebos.thoth.markdown.critics.CriticProcessingMode;
import net.riezebos.thoth.markdown.util.DocumentNode;
import net.riezebos.thoth.user.Identity;
import net.riezebos.thoth.user.Permission;
import net.riezebos.thoth.util.PagedList;
import net.riezebos.thoth.util.ThothCoreUtil;
import net.riezebos.thoth.util.ThothUtil;

public class Searcher {
    private static final Logger LOG = LoggerFactory.getLogger(Searcher.class);

    private ContentManager contentManager;

    public Searcher(ContentManager contentManager) {
        this.contentManager = contentManager;
    }

    public PagedList<SearchResult> search(Identity identity, String queryExpression, int pageNumber, int pageSize)
            throws SearchException {
        try {
            IndexReader reader = getIndexReader(contentManager);
            IndexSearcher searcher = getIndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer();

            // We might need to restrict the results to books of the user does not have access to fragments:
            AccessManager accessManager = contentManager.getAccessManager();
            boolean booksOnly = !accessManager.hasPermission(identity, "", Permission.READ_FRAGMENTS);
            if (booksOnly) {
                queryExpression = Indexer.INDEX_TYPE + ":" + Indexer.TYPE_DOCUMENT + " AND (" + queryExpression
                        + ")";
            }

            QueryParser parser = new QueryParser(Indexer.INDEX_CONTENTS, analyzer);
            Query query = parser.parse(queryExpression);

            // We add 1 to determine if there is more to be found after the current page
            int maxResults = pageSize * pageNumber + 1;
            TopDocs results = searcher.search(query, maxResults, Sort.RELEVANCE);
            ScoreDoc[] hits = results.scoreDocs;

            boolean hadMore = (hits.length == maxResults);

            List<SearchResult> searchResults = new ArrayList<>();
            int idx = 0;
            for (ScoreDoc scoreDoc : hits) {
                if (searchResults.size() == pageSize)
                    break;
                idx++;
                if (idx >= (pageNumber - 1) * pageSize) {
                    Document document = searcher.doc(scoreDoc.doc);
                    IndexableField field = document.getField(Indexer.INDEX_PATH);
                    String documentPath = field.stringValue();
                    SearchResult searchResult = new SearchResult();
                    searchResult.setIndexNumber((pageNumber - 1) * pageSize + idx);
                    searchResult.setDocument(documentPath);

                    String type = document.get(Indexer.INDEX_TYPE);
                    if (Indexer.TYPE_DOCUMENT.equals(type) || Indexer.TYPE_FRAGMENT.equals(type)) {
                        searchResult.setResource(false);

                        try {
                            MarkDownDocument markDownDocument = contentManager.getMarkDownDocument(documentPath,
                                    true, CriticProcessingMode.DO_NOTHING);
                            String contents = markDownDocument.getMarkdown();

                            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
                            Highlighter highlighter = new Highlighter(htmlFormatter,
                                    new QueryScorer(query, Indexer.INDEX_CONTENTS));
                            highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

                            TokenStream tokenStream = analyzer.tokenStream(Indexer.INDEX_CONTENTS, contents);

                            TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, contents, false,
                                    99999);
                            for (TextFragment frag : frags) {
                                if ((frag != null) && (frag.getScore() > 0)) {
                                    String fragmentText = frag.toString();
                                    searchResult.addFragment(
                                            new Fragment(ThothCoreUtil.escapeHtmlExcept("B", fragmentText)));
                                }
                            }
                        } catch (FileNotFoundException e) {
                            LOG.warn(
                                    "Index contains an invalid file reference); probably need to reindex to get rid of this. File: "
                                            + e.getMessage());
                        }
                    } else {
                        searchResult.setResource(true);
                        String extension = ThothUtil.getExtension(documentPath);
                        searchResult.setImage(getConfiguration().isImageExtension(extension));

                        searchResult.addFragment(new Fragment(document.get(Indexer.INDEX_TITLE)));
                    }
                    searchResults.add(searchResult);
                }
            }
            reader.close();
            linkBooks(searchResults);
            PagedList<SearchResult> pagedList = new PagedList<>(searchResults, hadMore);
            return pagedList;
        } catch (Exception e) {
            throw new SearchException(e);
        }
    }

    protected IndexSearcher getIndexSearcher(IndexReader reader) {
        IndexSearcher searcher = new IndexSearcher(reader);
        return searcher;
    }

    protected IndexReader getIndexReader(ContentManager contentManager)
            throws ContextNotFoundException, IOException {
        String indexFolder = contentManager.getIndexFolder();
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFolder)));
        return reader;
    }

    protected void linkBooks(List<SearchResult> searchResults) throws ContentManagerException {

        Map<String, List<String>> reverseIndexIndirect = contentManager.getReverseIndex(true);

        List<String> bookExtensions = getConfiguration().getBookExtensions();
        for (SearchResult searchResult : searchResults) {
            String document = searchResult.getDocument();
            List<String> list = reverseIndexIndirect.get(ThothUtil.prefix(document, "/"));
            if (list != null) {
                for (String name : list) {
                    int idx = name.lastIndexOf('.');
                    if (idx != -1) {
                        String ext = name.substring(idx + 1);
                        if (bookExtensions.contains(ext)) {
                            searchResult.addBookReference(new DocumentNode(name, name, 0, 0));
                        }
                    }
                }
            }
        }
    }

    protected Configuration getConfiguration() {
        return contentManager.getConfiguration();
    }

}