com.tripod.lucene.service.AbstractLuceneService.java Source code

Introduction

Here is the source code for com.tripod.lucene.service.AbstractLuceneService.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.tripod.lucene.service;

import com.tripod.api.Field;
import com.tripod.api.TransformException;
import com.tripod.api.query.SortOrder;
import com.tripod.api.query.result.FacetCount;
import com.tripod.api.query.result.FacetResult;
import com.tripod.api.query.result.Highlight;
import com.tripod.api.query.result.QueryResult;
import com.tripod.api.query.service.QueryException;
import com.tripod.lucene.LuceneField;
import com.tripod.lucene.query.LuceneQuery;
import com.tripod.lucene.query.LuceneQueryResults;
import com.tripod.lucene.query.LuceneQueryTransformer;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.Validate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Base class for all Lucene services.
 *
 * NOTE: Clients should use SearcherManagerRefresher to periodically open new searchers and see new data.
 *
 * @author bbende
 */
public class AbstractLuceneService<Q extends LuceneQuery, QR extends QueryResult> {

    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractLuceneService.class);

    protected final Analyzer analyzer;
    protected final SearcherManager searcherManager;
    protected final LuceneQueryTransformer<Q> queryTransformer;
    protected final LuceneDocumentTransformer<QR> documentTransformer;

    public AbstractLuceneService(final SearcherManager searcherManager, final Analyzer analyzer,
            final LuceneQueryTransformer<Q> queryTransformer,
            final LuceneDocumentTransformer<QR> documentTransformer) {
        this.searcherManager = searcherManager;
        this.analyzer = analyzer;
        this.queryTransformer = queryTransformer;
        this.documentTransformer = documentTransformer;
        Validate.notNull(this.searcherManager);
        Validate.notNull(this.queryTransformer);
        Validate.notNull(this.documentTransformer);
    }

    /**
     * Common logic for sub-classes to perform searches.
     *
     * @param query the query
     * @return the QueryResults
     * @throws QueryException if an error occurred performing the search
     */
    protected LuceneQueryResults<QR> performSearch(Q query) throws QueryException {
        IndexSearcher searcher = null;
        try {
            // Acquire an IndexSearcher
            searcher = searcherManager.acquire();

            // Start the results builder with the page size (rows) from the query
            final LuceneQueryResults.Builder<QR> resultsBuilder = new LuceneQueryResults.Builder<QR>()
                    .pageSize(query.getRows());

            // Transform the tripod query into a Lucene query
            final Query luceneQuery = queryTransformer.transform(query);

            // Get the return fields
            final Set<String> fieldsToLoad = new HashSet<>();
            if (query.getReturnFields() != null) {
                query.getReturnFields().forEach(f -> fieldsToLoad.add(f.getName()));
            }

            // Get the facet fields
            final Set<String> facetFields = new HashSet<>();
            if (query.getFacetFields() != null) {
                query.getFacetFields().forEach(f -> facetFields.add(f.getName()));
            }

            final Sort sort = getSort(query.getSorts());
            final Highlighter highlighter = getHighlighter(query, luceneQuery);

            // Collector to use when faceting
            final FacetsCollector facetsCollector = new FacetsCollector();

            // Collector for sorted/paged results
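            // The three boolean arguments below correspond to fillFields, trackDocScores,
            // and trackMaxScore in this Lucene version's TopFieldCollector.create(...)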
            final TopFieldCollector topFieldCollector = TopFieldCollector.create(sort, query.getRows(),
                    (FieldDoc) query.getAfterDoc(), true, false, false);

            // Wrap the collectors, including the FacetsCollector only when facet fields were requested
            final Collector collector = facetFields.isEmpty() ? MultiCollector.wrap(topFieldCollector)
                    : MultiCollector.wrap(topFieldCollector, facetsCollector);

            // Perform the Lucene query
            final long startTime = System.currentTimeMillis();
            FacetsCollector.searchAfter(searcher, query.getAfterDoc(), luceneQuery, query.getRows(), sort,
                    collector);
            LOGGER.debug("Query executed in " + (System.currentTimeMillis() - startTime) + " ms");

            // Transform each Lucene Document to a QueryResult
            ScoreDoc afterDoc = null;
            for (ScoreDoc scoreDoc : topFieldCollector.topDocs().scoreDocs) {
                final Document doc = getDoc(searcher, scoreDoc.doc, fieldsToLoad);
                final QR result = documentTransformer.transform(doc);
                performHighlighting(searcher, query, scoreDoc, doc, highlighter, result);

                resultsBuilder.addResult(result);
                afterDoc = scoreDoc;
            }

            // Get faceting results
            processFacetResults(searcher, facetsCollector, facetFields, resultsBuilder);

            // Store the last ScoreDoc so it can be passed back for the next page
            resultsBuilder.afterDoc(afterDoc);
            resultsBuilder.totalResults(topFieldCollector.getTotalHits());
            return resultsBuilder.build();

        } catch (TransformException e) {
            throw new QueryException("A transform error occurred", e);
        } catch (IOException | InvalidTokenOffsetsException e) {
            throw new QueryException("Unexpected error occurred performing query", e);
        } finally {
            if (searcher != null) {
                try {
                    searcherManager.release(searcher);
                } catch (IOException e) {
                    LOGGER.warn("Error releasing IndexSearcher: " + e.getMessage(), e);
                }
                searcher = null;
            }
        }
    }

    /**
     * @param query the tripod query being performed
     * @param luceneQuery the Lucene query being performed
     * @return the highlighter to use if the tripod query has one or more highlight fields, or null otherwise
     */
    private Highlighter getHighlighter(final Q query, final Query luceneQuery) {
        Highlighter highlighter = null;
        if (query.getHighlightFields() != null && query.getHighlightFields().size() > 0) {
            SimpleHTMLEncoder simpleHTMLEncoder = new SimpleHTMLEncoder();
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(query.getHighlightPreTag(),
                    query.getHighlightPostTag());
            highlighter = new Highlighter(simpleHTMLFormatter, simpleHTMLEncoder, new QueryScorer(luceneQuery));
        }
        return highlighter;
    }

    /**
     * @param searcher the IndexSearcher
     * @param doc the doc to load
     * @param fieldsToLoad the fields of the doc to load
     * @return the Document with the given fields loaded
     * @throws IOException if an error occurs loading the Document
     */
    protected Document getDoc(final IndexSearcher searcher, final int doc, final Set<String> fieldsToLoad)
            throws IOException {
        if (fieldsToLoad == null || fieldsToLoad.size() == 0
                || (fieldsToLoad.size() == 1 && fieldsToLoad.contains(Field.ALL_FIELDS.getName()))) {
            return searcher.doc(doc);
        } else {
            return searcher.doc(doc, fieldsToLoad);
        }
    }

    /**
     * Converts the Tripod Sort clauses to a Lucene Sort instance.
     *
     * @param sorts the Tripod Sorts
     * @return the Lucene Sort matching the given Tripod Sorts,
     *              or the Lucene Sort for relevance order if no sorts are specified
     */
    protected Sort getSort(List<com.tripod.api.query.Sort<LuceneField>> sorts) {
        if (sorts == null || sorts.isEmpty()) {
            return Sort.RELEVANCE;
        } else {
            List<SortField> luceneSorts = new ArrayList<>();
            for (com.tripod.api.query.Sort<LuceneField> sort : sorts) {
                boolean reverse = (sort.getSortOrder() == SortOrder.DESC);
                luceneSorts.add(new SortField(sort.getField().getName(), sort.getField().getSortType(), reverse));
            }
            return new Sort(luceneSorts.toArray(new SortField[luceneSorts.size()]));
        }
    }

    /**
     * Performs highlighting for a given query and a given document.
     *
     * @param indexSearcher the IndexSearcher performing the query
     * @param query the Tripod LuceneQuery
     * @param scoreDoc the Lucene ScoreDoc
     * @param doc the Lucene Document
     * @param highlighter the Highlighter to use
     * @param result the QueryResult to add the highlights to
     * @throws IOException if an error occurs performing the highlighting
     * @throws InvalidTokenOffsetsException if an error occurs performing the highlighting
     */
    protected void performHighlighting(final IndexSearcher indexSearcher, final Q query, final ScoreDoc scoreDoc,
            final Document doc, final Highlighter highlighter, final QR result)
            throws IOException, InvalidTokenOffsetsException {

        if (query.getHighlightFields() == null || query.getHighlightFields().isEmpty()) {
            return;
        }

        final List<Highlight> highlights = new ArrayList<>();
        final List<String> hlFieldNames = getHighlightFieldNames(query, doc);

        // process each field to highlight on
        for (String hlField : hlFieldNames) {
            final String text = doc.get(hlField);
            if (StringUtils.isEmpty(text)) {
                continue;
            }

            final List<String> snippets = new ArrayList<>();
            final Fields tvFields = indexSearcher.getIndexReader().getTermVectors(scoreDoc.doc);
            final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;

            // get the snippets for the given field
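            // TokenSources uses the stored term vectors when they are available for the field,
            // otherwise it re-analyzes the stored text with the configured analyzer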
            final TokenStream tokenStream = TokenSources.getTokenStream(hlField, tvFields, text, analyzer,
                    maxStartOffset);
            final TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            for (TextFragment textFragment : textFragments) {
                if (textFragment != null && textFragment.getScore() > 0) {
                    snippets.add(textFragment.toString());
                }
            }

            // if we have snippets then add a highlight result to the QueryResult
            if (snippets.size() > 0) {
                highlights.add(new Highlight(hlField, snippets));
            }
        }

        result.setHighlights(highlights);
    }

    /**
     * @param query the query being performed
     * @param doc the doc being highlighted
     * @return the field names to highlight on, taken from the query, or all of the field names
     *              from the document if the query requests highlighting on all fields
     */
    private List<String> getHighlightFieldNames(LuceneQuery query, Document doc) {
        final List<String> hlFieldNames = new ArrayList<>();
        if (query.getHighlightFields().size() == 1
                && query.getHighlightFields().get(0).getName().equals(LuceneField.ALL_LUCENE_FIELDS.getName())) {
            hlFieldNames.addAll(doc.getFields().stream().map(IndexableField::name).collect(Collectors.toList()));
        } else {
            hlFieldNames.addAll(
                    query.getHighlightFields().stream().map(LuceneField::getName).collect(Collectors.toList()));
        }
        return hlFieldNames;
    }

    /**
     * Processes the faceting results and adds them to the QueryResults builder.
     *
     * @param indexSearcher the IndexSearcher performing the query
     * @param facetsCollector the FacetsCollector that was used for the search
     * @param facetFields the fields to Facet on
     * @param resultBuilder the QueryResults.Builder
     * @throws IOException if an error occurs performing faceting
     */
    protected void processFacetResults(final IndexSearcher indexSearcher, final FacetsCollector facetsCollector,
            final Set<String> facetFields, final LuceneQueryResults.Builder<QR> resultBuilder) throws IOException {
        if (facetFields == null) {
            return;
        }

        for (String facetField : facetFields) {
            final List<FacetCount> facetResultCounts = new ArrayList<>();
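            // A reader state is built per facet field; the second constructor argument names the
            // index field that the sorted set doc values facets were written to at indexing time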
            final SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(
                    indexSearcher.getIndexReader(), facetField);
            final Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector);

            org.apache.lucene.facet.FacetResult result = facets.getTopChildren(10, facetField);
            for (int i = 0; i < result.childCount; i++) {
                LabelAndValue lv = result.labelValues[i];
                facetResultCounts.add(new FacetCount(lv.label, lv.value.longValue()));
            }

            resultBuilder.addFacetResult(new FacetResult(facetField, facetResultCounts));
        }
    }

}
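
Usage

The sketch below is not part of the tripod source; it is a minimal illustration of how a concrete service might extend AbstractLuceneService and expose its protected performSearch() method. ExampleLuceneService is a hypothetical name, and the sketch assumes LuceneDocumentTransformer lives in com.tripod.lucene.service, since the listing above uses it without an import. As the class javadoc notes, clients would also pair this with a SearcherManagerRefresher so that new data becomes visible.

package com.tripod.lucene.example;

import com.tripod.api.query.result.QueryResult;
import com.tripod.api.query.service.QueryException;
import com.tripod.lucene.query.LuceneQuery;
import com.tripod.lucene.query.LuceneQueryResults;
import com.tripod.lucene.query.LuceneQueryTransformer;
import com.tripod.lucene.service.AbstractLuceneService;
import com.tripod.lucene.service.LuceneDocumentTransformer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.SearcherManager;

/**
 * Hypothetical example service that simply exposes the protected search logic
 * of AbstractLuceneService as a public method.
 */
public class ExampleLuceneService<Q extends LuceneQuery, QR extends QueryResult>
        extends AbstractLuceneService<Q, QR> {

    public ExampleLuceneService(final SearcherManager searcherManager, final Analyzer analyzer,
            final LuceneQueryTransformer<Q> queryTransformer,
            final LuceneDocumentTransformer<QR> documentTransformer) {
        super(searcherManager, analyzer, queryTransformer, documentTransformer);
    }

    /**
     * Runs the given query through the base class and returns the results.
     */
    public LuceneQueryResults<QR> search(final Q query) throws QueryException {
        return performSearch(query);
    }
}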