de.spartusch.nasfvi.server.NSearcher.java Source code

Introduction

Here is the source code for de.spartusch.nasfvi.server.NSearcher.java
Source

/*
 * Copyright 2011 Stefan Partusch
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.spartusch.nasfvi.server;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.similar.MoreLikeThis;

/**
 * Searches the index with a {@link NQuery}.
 * @author Stefan Partusch
 *
 */
public class NSearcher {
    private static final Logger LOGGER = Logger.getLogger(NSearcher.class.getName());

    /** Names of fields to perform similarity searches on. */
    private static final String[] SIMILARITY_FIELDS = new String[] { "titel", "beschreibung" };
    /** IndexSearcher that forms the basis of this searcher. */
    private final IndexSearcher searcher;

    /**
     * Creates a new NSearcher based on a Lucene IndexSearcher.
     * @param searcher IndexSearcher to form the basis of the new NSearcher
     */
    public NSearcher(final IndexSearcher searcher) {
        this.searcher = searcher;
    }

    /**
     * Gets the values of fields required for creating an answer in
     * natural language.
     * @param nquery NQuery used to retrieve <code>result</code>
     * @param result Documents matching <code>nquery</code>
     * @param offset Index of the first document to process
     * @return A mapping from the names of fields to sets of extracted values
     * @throws IOException if there is an IOException when accessing the index
     */
    public final Map<String, Set<String>> getAnswerValues(final NQuery nquery, final TopDocs result, int offset)
            throws IOException {
        Map<String, Set<String>> values = new HashMap<String, Set<String>>();
        Set<String> answerFields = nquery.getFieldsToAnswer();

        if (result.totalHits == 0 || answerFields.size() == 0) {
            return values;
        }

        if (offset >= result.scoreDocs.length) {
            offset = result.scoreDocs.length - 1;
        }

        String firstField = answerFields.iterator().next();

        if (answerFields.size() == 1 && !NQuery.isFieldToCollapse(firstField)) {
            // Aggregate all ScoreDocs (1 field, n documents)
            Set<String> hs = new HashSet<String>(result.scoreDocs.length);
            for (ScoreDoc sd : result.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                hs.addAll(extractValues(nquery, doc, firstField));
            }
            values.put(firstField, hs);
        } else {
            // Process first ScoreDoc only (n fields, 1 document)
            Document doc = searcher.doc(result.scoreDocs[offset].doc);
            for (String field : answerFields) {
                values.put(field, extractValues(nquery, doc, field));
            }
        }

        return values;
    }

    /**
     * Extracts a field's values from a document. This method is aware of
     * <i>collapsed</i> or <i>merged</i> fields and handles them properly. 
     * @param nquery NQuery used for searching
     * @param doc Document to extract the field's values from
     * @param field Name of the field to extract values for
     * @return Set of extracted values
     */
    private Set<String> extractValues(final NQuery nquery, final Document doc, final String field) {
        Set<String> values = new HashSet<String>();

        if (NQuery.isFieldToCollapse(field)) {
            // process merged field
            String mfield = NQuery.getMergedField();
            QueryScorer scorer = new QueryScorer(nquery.getQuery(), mfield);
            Highlighter highlighter = new Highlighter(scorer);
            highlighter.setTextFragmenter(new NullFragmenter());

            try {
                Set<String> buffer = new HashSet<String>();

                for (Fieldable f : doc.getFieldables(mfield)) {
                    String content = f.stringValue();
                    String value = normalizeValue(NQuery.extractValue(field, content));

                    // Test if the field was matched by the query
                    TokenStream ts = TokenSources.getTokenStream(mfield, content, nquery.getAnalyzer());
                    if (highlighter.getBestFragment(ts, content) != null) {
                        values.add(value);
                    } else {
                        // Buffer the value - in case no field matches
                        buffer.add(value);
                    }
                }

                if (values.isEmpty()) {
                    // No field was matched by the query
                    values.addAll(buffer);
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            } catch (InvalidTokenOffsetsException e) {
                throw new RuntimeException(e);
            }
        } else {
            for (String v : doc.getValues(field)) {
                values.add(normalizeValue(v));
            }
        }

        return values;
    }

    /**
     * Normalizes strings. This implementation normalizes whitespace characters.
     * @param value String to normalize
     * @return Normalized string
     */
    private String normalizeValue(final String value) {
        return value.replaceAll("\\p{Z}+", " ").trim();
    }

    /**
     * Searches the index using a Lucene query.
     * @param query Query to search for
     * @param maxHits Maximum number of documents to search for
     * @return Matching documents
     * @throws IOException if there is an IOException when accessing the index
     */
    private TopDocs search(final Query query, final int maxHits) throws IOException {
        TopDocs result = searcher.search(query, maxHits);
        LOGGER.info("Search: " + query + ";\t(Hits: " + result.totalHits + ")");
        return result;
    }

    /**
     * Searches the index using a {@link NQuery}.
     * @param nquery Query to search for
     * @param offset Offset to use for the search
     * @return Matching documents
     * @throws IOException if there is an IOException when accessing the index
     */
    public final TopDocs search(final NQuery nquery, final int offset) throws IOException {
        Query q = nquery.getQuery();

        if (nquery.hasSimilarityQuery()) {
            Query similQuery = nquery.getSimilarityQuery();
            TopDocs similDocs = search(similQuery, 1);

            if (similDocs.totalHits == 0) {
                return new TopDocs(0, new ScoreDoc[0], 0f);
            }

            int similDocNum = similDocs.scoreDocs[0].doc;
            String similId = searcher.doc(similDocNum).get("id");
            Query exclude = new TermQuery(new Term("id", similId));
            // exclude the document compared to

            MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
            mlt.setFieldNames(SIMILARITY_FIELDS);
            Query moreLikeQuery = mlt.like(similDocNum);

            BooleanQuery booleanQuery = new BooleanQuery();
            booleanQuery.add(q, BooleanClause.Occur.MUST);
            booleanQuery.add(moreLikeQuery, BooleanClause.Occur.MUST);
            booleanQuery.add(exclude, BooleanClause.Occur.MUST_NOT);

            q = booleanQuery;
        }

        return search(q, offset + 5); // return top 5 results
    }

    /**
     * Creates a JSON representation of a {@link NQuery} and the results
     * of a search.
     * @param nquery NQuery to include in the JSON representation
     * @param result Search results to include
     * @param offset Offset to include
     * @return JSON representation including <code>nquery</code>,
     * <code>result</code> and <code>offset</code>
     */
    public final String toJson(final NQuery nquery, final TopDocs result, final int offset) {
        StringBuilder sb = new StringBuilder();
        // {
        // "NQuery": nquery,
        // "Offset": offset,
        // "Hits": totalHits
        // }

        sb.append("{\n\"NQuery\": ").append(nquery.toString()).append(",\n");
        sb.append("\"Offset\": ").append(offset).append(",\n");
        sb.append("\"Hits\": ").append(result.totalHits).append("\n}");

        return sb.toString();
    }
}