Java tutorial
/**
 * Copyright 2008 Matthew Hillsdon
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.hillsdon.reviki.search.impl;

import static net.hillsdon.reviki.text.WikiWordUtils.lastComponentOfPath;
import static net.hillsdon.reviki.text.WikiWordUtils.pathToTitle;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import net.hillsdon.reviki.search.QuerySyntaxException;
import net.hillsdon.reviki.search.SearchEngine;
import net.hillsdon.reviki.search.SearchMatch;
import net.hillsdon.reviki.vc.PageInfo;
import net.hillsdon.reviki.vc.PageStoreException;
import net.hillsdon.reviki.wiki.MarkupRenderer;
import net.hillsdon.reviki.wiki.renderer.creole.ast.ASTNode;
import net.hillsdon.reviki.wiki.renderer.creole.ast.ASTVisitor;
import net.hillsdon.reviki.wiki.renderer.creole.ast.Link;

import org.apache.commons.httpclient.util.URIUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.QueryParser.Operator;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.LockObtainFailedException;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;

/**
 * Uses lucene to provide search capabilities.
 *
 * @author mth
 */
public class LuceneSearcher implements SearchEngine {

  public static class NoQueryPerformedException extends RuntimeException {
    private static final long serialVersionUID = 1L;
    public NoQueryPerformedException(final QuerySyntaxException cause) {
      super("No query was performed yet got query error", cause);
    }
  }

  private static final String FIELD_UID = "uid";
  private static final String FIELD_WIKI = "wiki";
  private static final String FIELD_PATH = "path";
  private static final String FIELD_PATH_LOWER = "path-lower";
  private static final String FIELD_CONTENT = "content";
  private static final String FIELD_ATTRIBUTES = "attributes";
  /**
   * We tokenize the wiki word to allow e.g. 'another' to find 'AnotherNewPage'.
   */
  private static final String FIELD_TITLE_TOKENIZED = "title";
  private static final String FIELD_OUTGOING_LINKS = "outgoing-links";
  private static final String FIELD_PROPERTY_KEY = "property";
  private static final String FIELD_PROPERTY_VALUE = "property-value";

  private static final String PROPERTY_LAST_INDEXED_REVISION = "last-indexed-revision";
  private static final String PROPERTY_BUILT = "index-was-built";

  private static final String[] ALL_SEARCH_FIELDS = new String[] { FIELD_PATH, FIELD_PATH_LOWER, FIELD_TITLE_TOKENIZED, FIELD_CONTENT, FIELD_ATTRIBUTES };

  private final String _wikiName;
  private final File _dir;
  private final List<File> _otherDirs;
  private final MarkupRenderer _renderer;

  public static String uidFor(final String wiki, final String path) {
    return (wiki == null ? "" : wiki) + "::" + (path == null ? "" : path);
  }

  /**
   * @param dir The search index lives here.
   *            If null is passed the search will behave as a null implementation.
   */
  public LuceneSearcher(final String wikiName, final File dir, final List<File> otherSearchDirs, final MarkupRenderer renderer) {
    _wikiName = wikiName;
    _dir = dir;
    _otherDirs = otherSearchDirs;
    _renderer = renderer;
  }

  private void createIndexIfNecessary() throws IOException {
    if (_dir != null && !IndexReader.indexExists(_dir)) {
      new IndexWriter(_dir, createAnalyzer(), true).close();
    }
  }

  private Analyzer createAnalyzer() {
    final Analyzer text = new StandardAnalyzer() {
      @Override
      public TokenStream tokenStream(final String fieldName, final Reader reader) {
        return new PorterStemFilter(super.tokenStream(fieldName, reader));
      }
    };
    final Analyzer id = new KeywordAnalyzer();
    final PerFieldAnalyzerWrapper perField = new PerFieldAnalyzerWrapper(new Analyzer() {
      @Override
      public TokenStream tokenStream(final String fieldName, final Reader reader) {
        throw new UnsupportedOperationException("Need to define analyser for: " + fieldName);
      }
    });
    perField.addAnalyzer(FIELD_UID, id);
    perField.addAnalyzer(FIELD_WIKI, id);
    perField.addAnalyzer(FIELD_PATH, id);
    perField.addAnalyzer(FIELD_PATH_LOWER, id);
    perField.addAnalyzer(FIELD_TITLE_TOKENIZED, text);
    perField.addAnalyzer(FIELD_PROPERTY_KEY, id);
    perField.addAnalyzer(FIELD_PROPERTY_VALUE, id);
    perField.addAnalyzer(FIELD_OUTGOING_LINKS, text);
    perField.addAnalyzer(FIELD_CONTENT, text);
    perField.addAnalyzer(FIELD_ATTRIBUTES, id);
    return perField;
  }

  private Document createWikiPageDocument(final PageInfo page) throws IOException, PageStoreException {
    ASTNode pageAST = _renderer.parse(page);
    final String path = page.getPath();
    final String wiki = page.getWiki();
    final String content = page.getContent();
    final List<String> attributes = attributesToStringList(page.getAttributes());
    Document document = new Document();
    final String title = pathToTitle(path);
    final String pathLower = lastComponentOfPath(path).toLowerCase();
    document.add(new Field(FIELD_UID, uidFor(wiki, path), Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.add(new Field(FIELD_WIKI, wiki, Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.add(new Field(FIELD_PATH_LOWER, pathLower, Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.add(new Field(FIELD_TITLE_TOKENIZED, title, Field.Store.YES, Field.Index.TOKENIZED));
    document.add(new Field(FIELD_OUTGOING_LINKS, Joiner.on(" ").join(findOutgoingWikiLinks(pageAST)), Field.Store.YES, Field.Index.TOKENIZED));
    // We store the content in order to show matching extracts.
    document.add(new Field(FIELD_CONTENT, content, Field.Store.YES, Field.Index.TOKENIZED));
    // Store the attributes like this, so that we only get matches which are exact
    for (String attribute : attributes) {
      document.add(new Field(FIELD_ATTRIBUTES, attribute, Field.Store.YES, Field.Index.UN_TOKENIZED));
    }
    return document;
  }

  private List<String> findOutgoingWikiLinks(ASTNode ast) {
    return (new Visitor()).visit(ast);
  }

  private final class Visitor extends ASTVisitor<List<String>> {
    private static final String NEW_PAGE_CLASS = "new-page";
    private static final String EXIST_PAGE_CLASS = "existing-page";

    @Override
    public List<String> visitASTNode(ASTNode node) {
      List<String> outgoing = new ArrayList<String>();
      for (ASTNode child : node.getChildren()) {
        outgoing.addAll(visit(child));
      }
      return outgoing;
    }

    @Override
    public List<String> visitLink(Link node) {
      try {
        String style = node.getParts().getStyleClass(node.getContext());
        String href = node.getParts().getURL(node.getContext());
        if (style.equals(NEW_PAGE_CLASS) || style.equals(EXIST_PAGE_CLASS)) {
          return ImmutableList.of(href.substring(href.lastIndexOf('/') + 1));
        }
      } catch (Exception e) {
        // Ignore the bad link, we only care about links to pages on this wiki.
      }
      return ImmutableList.of();
    }
  }

  private Document createPropertyDocument(final String property, final String value) {
    Document document = new Document();
    document.add(new Field(FIELD_PROPERTY_KEY, property, Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.add(new Field(FIELD_PROPERTY_VALUE, value, Field.Store.YES, Field.Index.UN_TOKENIZED));
    return document;
  }

  private void deleteWikiDocument(final String wiki, final String path) throws IOException {
    deleteDocument(FIELD_UID, uidFor(wiki, path));
  }

  private void deleteDocument(final String keyField, final String value) throws IOException {
    IndexWriter writer = new IndexWriter(_dir, createAnalyzer());
    try {
      writer.deleteDocuments(new Term(keyField, value));
    } finally {
      writer.close();
    }
  }

  private void replaceProperty(final Document propertyDocument) throws CorruptIndexException, LockObtainFailedException, IOException {
    replaceDocument(FIELD_PROPERTY_KEY, propertyDocument);
  }

  private void replaceWikiDocument(final Document wikiDocument) throws CorruptIndexException, LockObtainFailedException, IOException {
    replaceDocument(FIELD_UID, wikiDocument);
  }

  private void replaceDocument(final String keyField, final Document document) throws CorruptIndexException, LockObtainFailedException, IOException {
    IndexWriter writer = new IndexWriter(_dir, createAnalyzer());
    try {
      writer.deleteDocuments(new Term(keyField, document.get(keyField)));
      writer.addDocument(document);
      writer.optimize();
    } finally {
      writer.close();
    }
  }

  // Lucene allows multiple non-deleting readers and at most one writer at a time.
  // It maintains a lock file but we never want it to fail to take the lock, so serialize writes.
  public synchronized void index(final PageInfo page, final boolean buildingIndex) throws IOException, PageStoreException {
    if (_dir == null) {
      return;
    }
    if (!isIndexBeingBuilt() || buildingIndex) {
      createIndexIfNecessary();
      replaceWikiDocument(createWikiPageDocument(page));
    }
  }

  private List<String> attributesToStringList(Map<String, String> attributes) {
    List<String> attrs = new ArrayList<String>();
    for (Map.Entry<String, String> entry : attributes.entrySet()) {
      attrs.add("\"" + entry.getKey() + "\":\"" + entry.getValue() + "\"");
    }
    return attrs;
  }

  // See comment on index.
  public synchronized void delete(final String wiki, final String path, boolean buildingIndex) throws IOException {
    if (!isIndexBeingBuilt() || buildingIndex) {
      createIndexIfNecessary();
      deleteWikiDocument(wiki, path);
    }
  }

  public Set<String> incomingLinks(final String page) throws IOException, PageStoreException {
    if (_dir == null) {
      return Collections.emptySet();
    }
    try {
      return doReadOperation(new ReadOperation<Set<String>>() {
        public Set<String> execute(final IndexReader reader, final Searcher searcher, final Analyzer analyzer) throws IOException, ParseException {
          final String pageEscaped = escape(URIUtil.encodeWithinPath(page));
          Set<String> results = Sets.newLinkedHashSet(Iterables.transform(query(reader, createAnalyzer(), searcher, FIELD_OUTGOING_LINKS, pageEscaped, false), SearchMatch.TO_PAGE_NAME));
          results.remove(page);
          return results;
        }
      }, false);
    } catch (QuerySyntaxException ex) {
      throw new NoQueryPerformedException(ex);
    }
  }

  public Set<String> outgoingLinks(final String page) throws IOException, PageStoreException {
    if (_dir == null) {
      return Collections.emptySet();
    }
    try {
      return doReadOperation(new ReadOperation<Set<String>>() {
        public Set<String> execute(final IndexReader reader, final Searcher searcher, final Analyzer analyzer) throws IOException, ParseException {
          Hits hits = searcher.search(new TermQuery(new Term(FIELD_PATH, page)));
          Iterator<?> iterator = hits.iterator();
          if (iterator.hasNext()) {
            Hit hit = (Hit) iterator.next();
            String outgoingLinks = hit.getDocument().get(FIELD_OUTGOING_LINKS);
            Set<String> results = Sets.newHashSet(outgoingLinks.split("\\s"));
            results.remove(page);
            return ImmutableSet.copyOf(results);
          }
          return Collections.emptySet();
        }
      }, false);
    } catch (QuerySyntaxException ex) {
      throw new NoQueryPerformedException(ex);
    }
  }

  /**
   * Reusable template that cleans up properly.
   *
   * @param <T> Result type.
   * @param operation Operation to perform.
   * @param allIndices If true, search all indices (other wikis) not just our own.
   * @return Result from operation.
   * @throws IOException On index read error,
   * @throws QuerySyntaxException If we can't parse a query.
   */
  private <T> T doReadOperation(final ReadOperation<T> operation, final boolean allIndices) throws IOException, QuerySyntaxException {
    createIndexIfNecessary();
    List<Searcher> searchers = new ArrayList<Searcher>();
    List<IndexReader> readers = new ArrayList<IndexReader>();
    /* First add our reader/searcher. If this fails, it's an error but clean up. */
    IndexReader reader = IndexReader.open(_dir);
    Searcher searcher = null;
    try {
      searcher = new IndexSearcher(reader);
      searchers.add(searcher);
      readers.add(reader);
    } finally {
      if (searcher == null) {
        reader.close();
      }
    }
    if (allIndices) {
      for (File dir : _otherDirs) {
        searcher = null;
        reader = null;
        try {
          reader = IndexReader.open(dir);
          searcher = new IndexSearcher(reader);
          searchers.add(searcher);
          readers.add(reader);
        } catch (Exception e) {
          // The index may not exist, but other wikis' indices aren't that important anyway, so
          // just don't search them.
          if (searcher != null) {
            searcher.close();
          }
          if (reader != null) {
            reader.close();
          }
        }
      }
    }
    try {
      /* Don't bother using a multi searcher if we only have one */
      if (searchers.size() > 1) {
        searcher = new MultiSearcher(searchers.toArray(new Searcher[] {}));
        /* Add to list of searchers so it gets closed */
        searchers.add(searcher);
      }
      else {
        searcher = searchers.get(0);
      }
      try {
        Analyzer analyzer = createAnalyzer();
        return operation.execute(readers.get(0), searcher, analyzer);
      } catch (ParseException ex) {
        throw new QuerySyntaxException(ex.getMessage(), ex);
      }
    } finally {
      for (Searcher s : searchers) {
        try {
          s.close();
        } catch (Exception e) { }
      }
      for (IndexReader r : readers) {
        try {
          r.close();
        } catch (Exception e) { }
      }
    }
  }

  public Set<SearchMatch> search(final String queryString, final boolean provideExtracts, final boolean singleWiki) throws IOException, QuerySyntaxException {
    if (_dir == null || queryString == null || queryString.trim().length() == 0) {
      return Collections.emptySet();
    }
    return doReadOperation(new ReadOperation<Set<SearchMatch>>() {
      public Set<SearchMatch> execute(final IndexReader reader, final Searcher searcher, final Analyzer analyzer) throws IOException, ParseException {
        LinkedHashSet<SearchMatch> results = new LinkedHashSet<SearchMatch>();
        // Prefer path, then title then content matches (match equality is on page name)
        for (String field : ALL_SEARCH_FIELDS) {
          if (field.equals(FIELD_PATH_LOWER)) {
            final Query query = new WildcardQuery(new Term(FIELD_PATH_LOWER, "*" + queryString.toLowerCase() + "*"));
            results.addAll(doQuery(reader, analyzer, searcher, field, provideExtracts, query));
          }
          else {
            results.addAll(query(reader, analyzer, searcher, field, queryString, provideExtracts));
          }
        }
        return orderResults(results);
      }
    }, !singleWiki);
  }

  private Set<SearchMatch> orderResults(final Set<SearchMatch> results) {
    /* Split single set of results into per-wiki sets, maintaining order (within each set) */
    Map<String, Set<SearchMatch>> byWiki = new LinkedHashMap<String, Set<SearchMatch>>();
    for (SearchMatch match : results) {
      Set<SearchMatch> matchesForWiki = byWiki.get(match.getWiki());
      if (matchesForWiki == null) {
        byWiki.put(match.getWiki(), matchesForWiki = new LinkedHashSet<SearchMatch>());
      }
      matchesForWiki.add(match);
    }
    Set<SearchMatch> sortedSet = new LinkedHashSet<SearchMatch>();
    /* Find the set for this wiki, and force it to be first */
    Set<SearchMatch> sameWikiMatches = byWiki.get(_wikiName);
    if (sameWikiMatches != null) {
      sortedSet.addAll(sameWikiMatches);
      byWiki.remove(_wikiName);
    }
    /* Flatten remaining per-wiki sets into single set, maintaining order */
    for (Set<SearchMatch> matches : byWiki.values()) {
      sortedSet.addAll(matches);
    }
    return sortedSet;
  }

  private LinkedHashSet<SearchMatch> query(final IndexReader reader, final Analyzer analyzer, final Searcher searcher, final String field, final String queryString, final boolean provideExtracts) throws IOException, ParseException {
    QueryParser parser = new QueryParser(field, analyzer);
    parser.setLowercaseExpandedTerms(!FIELD_PATH.equals(field));
    parser.setDefaultOperator(Operator.AND);
    Query query = parser.parse(queryString);
    return doQuery(reader, analyzer, searcher, field, provideExtracts, query);
  }

  private LinkedHashSet<SearchMatch> doQuery(final IndexReader reader, final Analyzer analyzer, final Searcher searcher, final String field, final boolean provideExtracts, final Query query) throws IOException, CorruptIndexException {
    Highlighter highlighter = null;
    if (provideExtracts) {
      highlighter = new Highlighter(new SimpleHTMLFormatter("<strong>", "</strong>"), new SimpleHTMLEncoder(), new QueryScorer(query));
    }
    Hits hits = searcher.search(query);
    LinkedHashSet<SearchMatch> results = new LinkedHashSet<SearchMatch>();
    @SuppressWarnings("unchecked")
    Iterator<Hit> iter = hits.iterator();
    while (iter.hasNext()) {
      Hit hit = iter.next();
      String text = hit.get(field);
      String extract = null;
      // The text is not stored for all fields, just provide a null extract.
      if (highlighter != null && text != null) {
        TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text));
        // Get 3 best fragments and separate with a "..."
        extract = highlighter.getBestFragments(tokenStream, text, 3, "...");
      }
      results.add(new SearchMatch(_wikiName.equals(hit.get(FIELD_WIKI)), hit.get(FIELD_WIKI), hit.get(FIELD_PATH), extract));
    }
    return results;
  }

  public long getHighestIndexedRevision() throws IOException {
    createIndexIfNecessary();
    String property = getProperty(PROPERTY_LAST_INDEXED_REVISION);
    try {
      if (property != null) {
        return Long.valueOf(property);
      }
    } catch (NumberFormatException ex) {
      // Fallthrough to default.
    }
    return 0;
  }

  private String getProperty(final String propertyName) throws IOException {
    if (_dir == null) {
      return null;
    }
    try {
      return doReadOperation(new ReadOperation<String>() {
        public String execute(final IndexReader reader, final Searcher searcher, final Analyzer analyzer) throws IOException, ParseException {
          Hits hits = searcher.search(new TermQuery(new Term(FIELD_PROPERTY_KEY, propertyName)));
          Iterator<?> iterator = hits.iterator();
          if (iterator.hasNext()) {
            return ((Hit) iterator.next()).get(FIELD_PROPERTY_VALUE);
          }
          return null;
        }
      }, false);
    } catch (QuerySyntaxException ex) {
      throw new NoQueryPerformedException(ex);
    }
  }

  public void rememberHighestIndexedRevision(final long revision) throws CorruptIndexException, LockObtainFailedException, IOException {
    replaceProperty(createPropertyDocument(PROPERTY_LAST_INDEXED_REVISION, String.valueOf(revision)));
  }

  public boolean isIndexBeingBuilt() throws IOException {
    createIndexIfNecessary();
    String property = getProperty(PROPERTY_BUILT);
    if (property != null) {
      return Boolean.valueOf(property);
    }
    return false;
  }

  public void setIndexBeingBuilt(boolean buildingIndex) throws IOException {
    createIndexIfNecessary();
    replaceProperty(createPropertyDocument(PROPERTY_BUILT, String.valueOf(buildingIndex)));
  }

  public String escape(final String in) {
    return QueryParser.escape(in);
  }
}
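A minimal usage sketch follows, using only the constructor and methods shown above. The class name LuceneSearcherExample, the wiki name "MyWiki", the index path, and the null MarkupRenderer are placeholders invented for illustration; a real renderer from the surrounding wiki setup would be needed before index() could be called, so only search() is exercised here.

import java.io.File;
import java.util.Collections;
import java.util.Set;

import com.google.common.collect.Iterables;

import net.hillsdon.reviki.search.SearchMatch;
import net.hillsdon.reviki.search.impl.LuceneSearcher;

public class LuceneSearcherExample {
  public static void main(String[] args) throws Exception {
    // Placeholder index location and wiki name; no other wikis' indices are passed.
    File indexDir = new File("/tmp/reviki-index");
    // A null renderer is only acceptable here because index() is never called in this sketch.
    LuceneSearcher searcher = new LuceneSearcher("MyWiki", indexDir, Collections.<File>emptyList(), null);

    // Search this wiki only (singleWiki = true), asking for highlighted extracts.
    Set<SearchMatch> matches = searcher.search("wiki", true, true);

    // SearchMatch.TO_PAGE_NAME is the same transform the class uses in incomingLinks().
    for (String pageName : Iterables.transform(matches, SearchMatch.TO_PAGE_NAME)) {
      System.out.println(pageName);
    }
  }
}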