org.eclipse.skalli.core.search.LuceneIndex.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.skalli.core.search.LuceneIndex.java

Source

/*******************************************************************************
 * Copyright (c) 2010-2014 SAP AG and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     SAP AG - initial API and implementation
 *******************************************************************************/
package org.eclipse.skalli.core.search;

import java.io.Closeable;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.UUID;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.text.StrBuilder;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.eclipse.skalli.model.EntityBase;
import org.eclipse.skalli.model.ExtensibleEntityBase;
import org.eclipse.skalli.services.entity.EntityService;
import org.eclipse.skalli.services.extension.ExtensionService;
import org.eclipse.skalli.services.extension.ExtensionServices;
import org.eclipse.skalli.services.extension.Indexer;
import org.eclipse.skalli.services.search.FacetedSearchResult;
import org.eclipse.skalli.services.search.IndexEntry;
import org.eclipse.skalli.services.search.PagingInfo;
import org.eclipse.skalli.services.search.QueryParseException;
import org.eclipse.skalli.services.search.SearchHit;
import org.eclipse.skalli.services.search.SearchResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LuceneIndex<T extends EntityBase> {

    /** Lucene compatibility version handed to the analyzer, the index writer configuration and the query parsers. */
    private static final Version LUCENE_VERSION = Version.LUCENE_30;

    private static final Logger LOG = LoggerFactory.getLogger(LuceneIndex.class);

    /** Formatter handed to the highlighter; constructed with the tags {@code <em>} and {@code </em>}. */
    private static final SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>"); //$NON-NLS-1$//$NON-NLS-2$
    /** Name of the stored, not analyzed document field that holds the UUID of the indexed entity. */
    private static final String FIELD_UUID = "_uuid"; //$NON-NLS-1$
    /** Maximum number of fragments requested from the highlighter per field value. */
    private static final int NUMBER_BEST_FRAGMENTS = 3; //TODO this is a candidate for configuration

    /** In-memory index; replaced by a fresh directory whenever the index is rebuilt. */
    private Directory directory = new RAMDirectory();
    /** Analyzer used for indexing, query parsing and highlighting; the token limit is Integer.MAX_VALUE. */
    private Analyzer analyzer = new LimitTokenCountAnalyzer(new StandardAnalyzer(LUCENE_VERSION),
            Integer.MAX_VALUE);
    /** Set to true by reindexAll()/reindex(); index-backed operations check it before touching the index. */
    private boolean initialized;

    /** Source of the entities to index and resolver of entities by UUID and of the total entity count. */
    private final EntityService<T> entityService;

    /**
     * Creates an index for the entities of the given entity service.
     * The index stays uninitialized until {@link #reindexAll()} or
     * {@link #reindex(Collection)} has been called.
     *
     * @param entityService  the service providing the entities to index.
     */
    public LuceneIndex(EntityService<T> entityService) {
        this.entityService = entityService;
    }

    /**
     * Replaces the current index with a fresh in-memory index that is
     * populated from all entities returned by the entity service, and
     * marks the index as initialized.
     */
    public synchronized void reindexAll() {
        directory = new RAMDirectory();
        Collection<T> allEntities = entityService.getAll();
        addEntitiesToIndex(allEntities);
        initialized = true;
    }

    /**
     * Replaces the current index with a fresh in-memory index that contains
     * only the given entities, and marks the index as initialized.
     *
     * @param entities  the entities to index; entities flagged as deleted are skipped
     * when the documents are added.
     */
    public synchronized void reindex(Collection<T> entities) {
        directory = new RAMDirectory();
        addEntitiesToIndex(entities);
        initialized = true;
    }

    /**
     * Collects the index entries for an entity and, transitively, for all of its extensions.
     * Every extension service whose extension class equals the class of the entity
     * currently visited contributes entries through its indexer, if it has one.
     *
     * @param entity  the entity to start from.
     * @return the collected index entries, in the order the indexers produced them.
     */
    private List<IndexEntry> indexEntity(T entity) {
        List<IndexEntry> entries = new LinkedList<IndexEntry>();

        // breadth-first walk: the entity itself first, then the extensions it carries
        Queue<EntityBase> pending = new LinkedList<EntityBase>();
        pending.add(entity);

        while (!pending.isEmpty()) {
            EntityBase current = pending.poll();

            for (ExtensionService<?> service : ExtensionServices.getAll()) {
                if (!current.getClass().equals(service.getExtensionClass())) {
                    continue;
                }
                Indexer<?> indexer = service.getIndexer();
                if (indexer != null) {
                    indexer.indexEntity(entries, current);
                }
            }

            if (current instanceof ExtensibleEntityBase) {
                ExtensibleEntityBase extensible = (ExtensibleEntityBase) current;
                pending.addAll(extensible.getAllExtensions());
            }
        }
        return entries;
    }

    /**
     * Converts an entity to a Lucene document and adds it with the given writer.
     * Besides the fields contributed by the indexers, the document carries the
     * UUID of the entity as a stored, not analyzed field.
     *
     * @param writer  the writer to add the document with.
     * @param entity  the entity to index.
     * @throws IOException  if the writer fails to add the document.
     */
    private void addEntityToIndex(IndexWriter writer, T entity) throws IOException {
        Document document = LuceneUtil.fieldsToDocument(indexEntity(entity));
        String uuid = entity.getUuid().toString();
        document.add(new Field(FIELD_UUID, uuid, Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(document);
    }

    /**
     * Converts entities to search hits without consulting the index.
     *
     * @param entities  the entities to convert.
     * @return one hit per entity, in iteration order; the entry for
     * a <code>null</code> entity is <code>null</code>.
     */
    List<SearchHit<T>> entitiesToHit(Collection<T> entities) {
        List<SearchHit<T>> hits = new LinkedList<SearchHit<T>>();
        for (T candidate : entities) {
            SearchHit<T> hit = entityToHit(candidate);
            hits.add(hit);
        }
        return hits;
    }

    /**
     * Converts an entity to a search hit without consulting the index.
     * The values are taken directly from the index entries of the entity,
     * grouped by field name; the same map serves as stored and as
     * highlighted values.
     *
     * @param entity  the entity to convert, or <code>null</code>.
     * @return the search hit, or <code>null</code> if no entity was given.
     */
    SearchHit<T> entityToHit(T entity) {
        if (entity == null) {
            return null;
        }
        Map<String, List<String>> valuesByField = new HashMap<String, List<String>>();
        for (IndexEntry entry : indexEntity(entity)) {
            String fieldName = entry.getFieldName();
            List<String> values = valuesByField.get(fieldName);
            if (values == null) {
                values = new LinkedList<String>();
                valuesByField.put(fieldName, values);
            }
            values.add(entry.getValue());
        }
        return new SearchHit<T>(entity, valuesByField, valuesByField);
    }

    /**
     * Adds documents for the given entities to the current index directory.
     * Entities flagged as deleted are skipped. Failures are logged and not
     * propagated; the writer is closed in any case.
     *
     * @param entities  the entities to add.
     */
    private void addEntitiesToIndex(Collection<T> entities) {
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(directory, new IndexWriterConfig(LUCENE_VERSION, analyzer));
            for (T entity : entities) {
                if (entity.isDeleted()) {
                    continue;
                }
                addEntityToIndex(indexWriter, entity);
            }
        } catch (LockObtainFailedException e) {
            LOG.error("Failed to add index entries due to Lucene lock", e);
        } catch (Exception e) {
            LOG.error("Failed to add index entries", e);
        } finally {
            closeQuietly(indexWriter);
        }
    }

    /**
     * Highlights the query matches in a field value.
     *
     * @param highlighter  the highlighter to use.
     * @param fields  the names of the fields for which highlighting is requested.
     * @param fieldName  the name of the field the value belongs to.
     * @param fieldContents  the value to highlight, or <code>null</code>.
     * @return the best fragments of the value joined by {@link LuceneUtil#withEllipsis},
     * or the unchanged value if the field is not to be highlighted, no fragment
     * was found, or highlighting failed (the failure is logged).
     */
    private String doHighlight(final Highlighter highlighter, final List<String> fields, final String fieldName,
            String fieldContents) throws IOException {
        if (fieldContents == null || !fields.contains(fieldName)) {
            return fieldContents;
        }
        try {
            String[] bestFragments = highlighter.getBestFragments(analyzer, fieldName, fieldContents,
                    NUMBER_BEST_FRAGMENTS);
            if (bestFragments != null && bestFragments.length > 0) {
                return LuceneUtil.withEllipsis(bestFragments, fieldContents);
            }
        } catch (Exception e) {
            LOG.error(MessageFormat.format("Failed to highlight search result ''{0}''", fieldContents), e);
        }
        return fieldContents;
    }

    /**
     * Looks up the document that was indexed for the entity with the given UUID.
     * <p>
     * The UUID field is indexed without analysis, so the lookup uses an exact
     * {@link TermQuery} on the string form of the UUID. Running the UUID through
     * the query parser and analyzer instead could split it into several tokens,
     * which would never match the single unanalyzed term.
     *
     * @param searcher  the searcher to use for the lookup.
     * @param uuid  the UUID of the entity.
     * @return the matching document, or <code>null</code> if there is no document
     * or more than one document with that UUID (the latter is logged as error).
     * @throws IOException  if the search fails.
     */
    private ScoreDoc getDocByUUID(IndexSearcher searcher, UUID uuid) throws IOException {
        // must be the very same string that addEntityToIndex() stores in FIELD_UUID
        Query query = new TermQuery(new Term(FIELD_UUID, uuid.toString()));
        // two slots suffice to tell "exactly one" from "more than one"
        TopScoreDocCollector collector = TopScoreDocCollector.create(2, false);
        searcher.search(query, collector);
        int totalHits = collector.getTotalHits();
        if (totalHits < 1) {
            return null;
        }
        if (totalHits > 1) {
            LOG.error(MessageFormat.format("Too many documents found with UUID {0}", uuid.toString()));
            return null;
        }
        return collector.topDocs().scoreDocs[0];
    }

    /**
     * Removes the documents of the given entities from the index.
     * Does nothing if the index has not been initialized yet. Entities
     * without a (unique) document in the index are ignored. Failures are
     * logged and not propagated.
     *
     * @param entities  the entities to remove from the index.
     */
    public synchronized void remove(final Collection<T> entities) {
        if (!initialized) {
            return;
        }
        IndexReader indexReader = null;
        IndexSearcher indexSearcher = null;
        try {
            // second argument false: open the reader writable so that it can delete documents
            indexReader = IndexReader.open(directory, false);
            indexSearcher = new IndexSearcher(indexReader);
            for (EntityBase entity : entities) {
                ScoreDoc match = getDocByUUID(indexSearcher, entity.getUuid());
                if (match == null) {
                    continue;
                }
                indexSearcher.getIndexReader().deleteDocument(match.doc);
            }
        } catch (LockObtainFailedException e) {
            LOG.error("Failed to remove index entries due to Lucene lock", e);
        } catch (Exception e) {
            LOG.error("Failed to remove index entries", e);
        } finally {
            closeQuietly(indexSearcher);
            closeQuietly(indexReader);
        }
    }

    /**
     * Updates the index for the given entities by first removing their
     * documents and then adding them again. Does nothing if the index
     * has not been initialized yet.
     *
     * @param entities  the entities to update in the index.
     */
    public synchronized void update(final Collection<T> entities) {
        if (initialized) {
            remove(entities);
            addEntitiesToIndex(entities);
        }
    }

    /**
     * Resolves the entity a document was indexed for.
     *
     * @param doc  a document carrying the UUID field.
     * @return whatever the entity service returns for the UUID stored in the document.
     */
    private T getEntity(Document doc) {
        UUID uuid = UUID.fromString(doc.get(FIELD_UUID));
        return entityService.getByUUID(uuid);
    }

    /**
     * Converts a document found by a search to a search hit.
     * For every stored field the hit carries all values of the field, once as
     * stored and once with highlighting applied (for fields listed in
     * <code>fields</code>).
     *
     * @param doc  the document found.
     * @param fields  the names of the fields to highlight.
     * @param score  the score of the document.
     * @param highlighter  the highlighter to use.
     * @return the search hit for the entity the document was indexed for.
     * @throws IOException  declared by the highlighting helper.
     */
    private SearchHit<T> getSearchHit(final Document doc, final List<String> fields, float score,
            final Highlighter highlighter) throws IOException {
        T entity = getEntity(doc);
        Map<String, List<String>> storedValues = new HashMap<String, List<String>>();
        Map<String, List<String>> highlightedValues = new HashMap<String, List<String>>();

        for (Fieldable f : doc.getFields()) {
            String fieldName = f.name();
            // A multi-valued field shows up once per value in getFields(), but
            // getValues() already returns all of its values: handle each name once
            // instead of re-reading and re-highlighting the same values repeatedly.
            if (!f.isStored() || storedValues.containsKey(fieldName)) {
                continue;
            }
            String[] values = doc.getValues(fieldName);
            List<String> fieldContents = Arrays.asList(values);
            // separate array so that highlighting does not write through to the stored values
            List<String> highlightedFieldContents = Arrays.asList(values.clone());
            if (fields.contains(fieldName)) {
                for (int i = 0; i < highlightedFieldContents.size(); i++) {
                    highlightedFieldContents.set(i,
                            doHighlight(highlighter, fields, fieldName, highlightedFieldContents.get(i)));
                }
            }
            storedValues.put(fieldName, fieldContents);
            highlightedValues.put(fieldName, highlightedFieldContents);
        }

        return new SearchHit<T>(entity, storedValues, score, highlightedValues);
    }

    /**
     * Searches for entities that are similar to the given entity.
     * <p>
     * The document indexed for the entity serves as base for a Lucene
     * {@link MoreLikeThis} query on the given fields. The base document itself
     * is never part of the result. If the index is not initialized, or the entity
     * has no (unique) document in the index, or the search fails (the failure is
     * logged), the result is empty.
     *
     * @param entity  the entity to find similar entities for.
     * @param fields  the names of the fields to compare and to highlight.
     * @param count  the maximum number of hits to return.
     * @return the search result with at most <code>count</code> hits; the result count
     * is the total number of matching documents minus the base document and
     * never negative.
     */
    public synchronized SearchResult<T> moreLikeThis(T entity, String[] fields, int count) {
        long start = System.nanoTime();
        SearchResult<T> moreLikeThis = new SearchResult<T>();
        List<SearchHit<T>> searchHits = new LinkedList<SearchHit<T>>();
        PagingInfo pagingInfo = new PagingInfo(0, 0);
        int totalHitCount = 0;
        if (initialized) {
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try {
                reader = IndexReader.open(directory);
                searcher = new IndexSearcher(reader);
                ScoreDoc baseDoc = getDocByUUID(searcher, entity.getUuid());
                if (baseDoc != null) {
                    MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
                    mlt.setFieldNames(fields);
                    mlt.setMinWordLen(2);
                    mlt.setBoost(true);
                    mlt.setMinDocFreq(0);
                    mlt.setMinTermFreq(0);
                    mlt.setAnalyzer(analyzer);
                    Query query = mlt.like(baseDoc.doc);

                    // count + 1: baseDoc will usually be one of the hits; avoid int overflow
                    int requested = count < Integer.MAX_VALUE ? count + 1 : count;
                    int numHits = Math.min(requested, entityService.size());
                    // same guard as in search(): do not create a collector for zero hits
                    if (numHits > 0) {
                        TopScoreDocCollector collector = TopScoreDocCollector.create(numHits, false);
                        searcher.search(query, collector);

                        List<String> fieldList = Arrays.asList(fields);
                        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                        for (ScoreDoc hit : collector.topDocs().scoreDocs) {
                            // baseDoc might be missing from the top hits: never exceed count
                            if (searchHits.size() >= count) {
                                break;
                            }
                            if (hit.doc != baseDoc.doc) {
                                Document doc = searcher.doc(hit.doc);
                                searchHits.add(getSearchHit(doc, fieldList, hit.score, highlighter));
                            }
                        }
                        // minus baseDoc; clamped since baseDoc need not match the query
                        totalHitCount = Math.max(0, collector.getTotalHits() - 1);
                    }
                    pagingInfo = new PagingInfo(0, count);
                }
            } catch (Exception e) {
                LOG.error(
                        MessageFormat.format("Searching for entities similiar to ''{0}'' failed", entity.getUuid()),
                        e);
            } finally {
                closeQuietly(searcher);
                closeQuietly(reader);
            }
        }

        long nanoDuration = System.nanoTime() - start;
        long milliDuration = Math.round(nanoDuration / 1000000d);
        moreLikeThis.setPagingInfo(pagingInfo);
        moreLikeThis.setResultCount(totalHitCount);
        moreLikeThis.setResult(searchHits);
        moreLikeThis.setDuration(milliDuration);
        return moreLikeThis;
    }

    /**
     * Searches the given fields for the given query string, without facets.
     *
     * @param fields  the names of the fields to search in.
     * @param queryString  the query; <code>"*"</code>, an empty string or <code>null</code>
     * yield a page of all entities.
     * @param pagingInfo  the page to return, or <code>null</code> for the first ten hits.
     * @return the search result.
     * @throws QueryParseException  declared for queries that cannot be parsed.
     */
    public synchronized SearchResult<T> search(String[] fields, String queryString, PagingInfo pagingInfo)
            throws QueryParseException {
        return search(fields, null, queryString, pagingInfo, new SearchResult<T>());
    }

    /**
     * Searches the given fields for the given query string as a phrase,
     * i.e. the query string is enclosed in double quotes before it is searched.
     * <p>
     * A <code>null</code> or empty query string is passed on unquoted, so it is
     * treated exactly as in {@link #search(String[], String, PagingInfo)}
     * instead of being turned into a search for the literal phrase
     * <code>"null"</code> or for an empty phrase.
     *
     * @param fields  the names of the fields to search in.
     * @param queryString  the phrase to search for.
     * @param pagingInfo  the page to return, or <code>null</code> for the first ten hits.
     * @return the search result.
     * @throws QueryParseException  declared for queries that cannot be parsed.
     */
    public synchronized SearchResult<T> searchPhrase(String[] fields, String queryString, PagingInfo pagingInfo)
            throws QueryParseException {
        if (StringUtils.isEmpty(queryString)) {
            return search(fields, queryString, pagingInfo);
        }
        return search(fields, "\"" + queryString + "\"", pagingInfo); //$NON-NLS-1$ //$NON-NLS-2$
    }

    /**
     * Searches the given fields for the given query string and collects
     * facet information for the given facet fields.
     *
     * @param fields  the names of the fields to search in.
     * @param facetFields  the names of the fields to collect facets for.
     * @param queryString  the query; <code>"*"</code>, an empty string or <code>null</code>
     * yield a page of all entities.
     * @param pagingInfo  the page to return, or <code>null</code> for the first ten hits.
     * @return the search result.
     * @throws QueryParseException  declared for queries that cannot be parsed.
     */
    public synchronized FacetedSearchResult<T> facetedSearch(String[] fields, String[] facetFields,
            String queryString, PagingInfo pagingInfo) throws QueryParseException {
        return search(fields, facetFields, queryString, pagingInfo, new FacetedSearchResult<T>());
    }

    /**
     * Performs a search and fills the given result object.
     * <p>
     * A query of <code>"*"</code>, an empty string or <code>null</code> bypasses the
     * index and yields the requested page of all entities of the entity service.
     * Any other query is parsed and run against the index, provided the index
     * has been initialized. Failures of the index search are logged and yield
     * an empty result; a query that cannot be parsed is reported to the caller.
     *
     * @param fields  the names of the fields to search in and to highlight.
     * @param facetFields  the names of the fields to collect facets for,
     * or <code>null</code> for a search without facets.
     * @param queryString  the query.
     * @param pagingInfo  the page to return, or <code>null</code> for the first ten hits.
     * @param ret  the result object to fill.
     * @return the result object passed in as <code>ret</code>.
     * @throws QueryParseException  if the query string cannot be parsed.
     */
    private <R extends SearchResult<T>> R search(final String[] fields, String[] facetFields,
            final String queryString, PagingInfo pagingInfo, R ret) throws QueryParseException {
        long start = System.nanoTime();
        List<SearchHit<T>> resultList = new LinkedList<SearchHit<T>>();
        int totalHitCount = 0;
        if (pagingInfo == null) {
            pagingInfo = new PagingInfo(0, 10);
        }
        if (StringUtils.equals("*", queryString) || StringUtils.isEmpty(queryString)) { //$NON-NLS-1$
            List<T> allEntities = entityService.getAll();
            int size = allEntities.size();
            // clamp the requested page to [0, size]; long arithmetic avoids overflow of start + count
            int pageStart = Math.max(0, pagingInfo.getStart());
            int pageCount = Math.max(0, pagingInfo.getCount());
            int from = Math.min(pageStart, size);
            int to = (int) Math.min((long) pageStart + pageCount, (long) size);
            resultList.addAll(entitiesToHit(allEntities.subList(from, to)));
            totalHitCount = size;
        } else if (initialized) {
            List<String> fieldList = Arrays.asList(fields);
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try {
                reader = IndexReader.open(directory);
                searcher = new IndexSearcher(reader);
                QueryParser parser = new MultiFieldQueryParser(LUCENE_VERSION, fields, analyzer);
                Query query = getQuery(parser, queryString);

                // it is not possible that we have more hits than projects!
                int maxHits = entityService.size();
                int numHits = pagingInfo.getStart() + pagingInfo.getCount();
                if (numHits < 0 || numHits > maxHits) {
                    numHits = maxHits;
                }
                if (numHits > 0) {
                    TopDocsCollector<ScoreDoc> collector;
                    if (facetFields == null) {
                        collector = TopScoreDocCollector.create(numHits, false);
                    } else {
                        collector = new FacetedCollector(facetFields, searcher.getIndexReader(), numHits);
                    }

                    searcher.search(query, collector);
                    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                    TopDocs topDocs = collector.topDocs(pagingInfo.getStart(), pagingInfo.getCount());
                    for (ScoreDoc hit : topDocs.scoreDocs) {
                        Document doc = searcher.doc(hit.doc);
                        SearchHit<T> searchHit = getSearchHit(doc, fieldList, hit.score, highlighter);
                        resultList.add(searchHit);
                    }

                    totalHitCount = collector.getTotalHits();
                    if (collector instanceof FacetedCollector && ret instanceof FacetedSearchResult) {
                        ((FacetedSearchResult<T>) ret).setFacetInfo(((FacetedCollector) collector).getFacetsMap());
                    }
                }
            } catch (QueryParseException e) {
                // an unparsable query is the caller's problem: do not swallow it below
                throw e;
            } catch (Exception e) {
                LOG.error(MessageFormat.format("Searching with query ''{0}'' failed", queryString), e);
            } finally {
                closeQuietly(searcher);
                closeQuietly(reader);
            }
        }

        long nanoDuration = System.nanoTime() - start;
        long milliDuration = Math.round(nanoDuration / 1000000d);
        ret.setPagingInfo(pagingInfo);
        ret.setQueryString(queryString);
        ret.setResultCount(totalHitCount);
        ret.setResult(resultList);
        ret.setDuration(milliDuration);
        return ret;
    }

    /**
     * Parses a query string in two attempts: first the extended form of the
     * query string (see {@link #getExtendedQuery(String)}), and if that cannot
     * be parsed, the original query string with all query syntax escaped.
     *
     * @param parser  the parser to use.
     * @param queryString  the query string to parse.
     * @return the parsed query.
     * @throws QueryParseException  if both attempts fail; the cause is the
     * failure of the second attempt.
     */
    private Query getQuery(QueryParser parser, String queryString) throws QueryParseException {
        try {
            return parser.parse(getExtendedQuery(queryString));
        } catch (ParseException extendedQueryFailure) {
            // if the parsing fails escape the query string and try again
            try {
                return parser.parse(QueryParser.escape(queryString));
            } catch (ParseException escapedQueryFailure) {
                // if that fails, too, give up
                throw new QueryParseException(escapedQueryFailure);
            }
        }
    }

    /**
     * Rewrites a query string term by term.
     * <p>
     * The query string is split at whitespace that is neither inside double quotes
     * nor inside brackets. A term counts as simple if it contains no quote, no bracket
     * and none of the characters <code>* ? ~ + - ! : ^ | &amp; \</code> (outside quotes
     * and brackets). Each term is handed to <code>addTerm</code>, which expands simple
     * terms and copies all others unchanged.
     *
     * @param queryString  the query string to rewrite.
     * @return the rewritten query string, or an empty string if the
     * query string is blank or <code>null</code>.
     */
    static String getExtendedQuery(String queryString) {
        StrBuilder extendedQuery = new StrBuilder();
        if (StringUtils.isBlank(queryString)) {
            return extendedQuery.toString();
        }

        StrBuilder term = new StrBuilder();
        boolean simple = true;
        boolean quoted = false;
        // the bracket that opened the current bracket expression; '\0' means "not inside brackets"
        char pendingBracket = '\0';

        for (int pos = 0, len = queryString.length(); pos < len; pos++) {
            char c = queryString.charAt(pos);
            if (c == '"') {
                simple = false;
                quoted = !quoted;
                term.append(c);
            } else if (isOpeningBracket(c)) {
                simple = false;
                pendingBracket = c;
                term.append(c);
            } else if (isClosingBracket(c)) {
                simple = false;
                // only the matching counterpart closes the bracket expression
                if (isMatchingPair(pendingBracket, c)) {
                    pendingBracket = '\0';
                }
                term.append(c);
            } else if (quoted || pendingBracket != '\0') {
                // inside quotes or brackets everything, including whitespace, belongs to the term
                term.append(c);
            } else if (isQuerySyntaxChar(c)) {
                simple = false;
                term.append(c);
            } else if (Character.isWhitespace(c)) {
                // term boundary; quote and bracket state are necessarily clear at this point
                addTerm(extendedQuery, term, simple);
                simple = true;
                term.setLength(0);
            } else {
                term.append(c);
            }
        }
        addTerm(extendedQuery, term, simple);
        return extendedQuery.toString();
    }

    /** Checks whether the character is one of <code>( [ {</code>. */
    private static boolean isOpeningBracket(char c) {
        return c == '(' || c == '[' || c == '{';
    }

    /** Checks whether the character is one of <code>) ] }</code>. */
    private static boolean isClosingBracket(char c) {
        return c == ')' || c == ']' || c == '}';
    }

    /** Checks whether <code>close</code> is the closing counterpart of the bracket <code>open</code>. */
    private static boolean isMatchingPair(char open, char close) {
        return open == '(' && close == ')'
                || open == '[' && close == ']'
                || open == '{' && close == '}';
    }

    /** Checks whether the character is one of <code>* ? ~ + - ! : ^ | &amp; \</code>. */
    private static boolean isQuerySyntaxChar(char c) {
        return "*?~+-!:^|&\\".indexOf(c) >= 0; //$NON-NLS-1$
    }

    // Terms that addTerm() never treats as simple terms and therefore copies
    // to the query unchanged. The comparison is done with StrBuilder#equals,
    // hence the constants are StrBuilders, too.
    private static final StrBuilder AND = new StrBuilder("AND"); //$NON-NLS-1$
    private static final StrBuilder OR = new StrBuilder("OR"); //$NON-NLS-1$
    private static final StrBuilder NOT = new StrBuilder("NOT"); //$NON-NLS-1$
    private static final StrBuilder TO = new StrBuilder("TO"); //$NON-NLS-1$

    /**
     * Appends a term to a query, separated by a blank from the terms appended before.
     * The term is trimmed first (in place); an empty term is ignored.
     * A simple term <code>t</code> that is not one of <code>AND</code>, <code>OR</code>,
     * <code>NOT</code>, <code>TO</code> is appended as <code>("t" t* t~)</code>;
     * any other term is appended unchanged.
     *
     * @param query  the query to append to.
     * @param term  the term to append.
     * @param isSimpleTerm  <code>true</code> if the term contains no query syntax.
     */
    private static void addTerm(StrBuilder query, StrBuilder term, boolean isSimpleTerm) {
        term.trim();
        if (term.length() == 0) {
            return;
        }
        if (query.length() > 0) {
            query.append(' ');
        }
        boolean reserved = term.equals(AND) || term.equals(OR) || term.equals(NOT) || term.equals(TO);
        if (isSimpleTerm && !reserved) {
            query.append('(')
                    .append('"').append(term).append('"')
                    .append(' ').append(term).append('*')
                    .append(' ').append(term).append('~')
                    .append(')');
        } else {
            query.append(term);
        }
    }

    /**
     * Closes a resource and logs, but does not propagate, an I/O failure on close.
     *
     * @param closable  the resource to close, or <code>null</code>.
     */
    private void closeQuietly(Closeable closable) {
        if (closable == null) {
            return;
        }
        try {
            closable.close();
        } catch (IOException e) {
            LOG.error(MessageFormat.format("Failed to close {0}", closable.getClass().getName()), e);
        }
    }
}