org.sindice.siren.search.node.NodePhraseQuery.java Source code

Java tutorial

Introduction

Here is the source code for org.sindice.siren.search.node.NodePhraseQuery.java

Source

/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.sindice.siren.search.node;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SloppySimScorer;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ToStringUtils;
import org.sindice.siren.index.DocsNodesAndPositionsEnum;

/**
 * A {@link NodePrimitiveQuery} that matches nodes containing a particular
 * sequence of terms. A {@link NodePhraseQuery} is built for input like
 * <code>"new york"</code>.
 * <p>
 * This query may be combined with other queries with a {@link NodeBooleanQuery}
 * or a {@link TwigQuery}.
 * <p>
 * Code taken from {@link PhraseQuery} and adapted for the Siren use case.
 */
public class NodePhraseQuery extends NodePrimitiveQuery {

    private String field;

    private final ArrayList<Term> terms = new ArrayList<Term>(4);

    private final ArrayList<Integer> positions = new ArrayList<Integer>(4);

    private int maxPosition = 0;

    /**
     * Constructs an empty phrase query. Terms are supplied afterwards through
     * {@link #add(Term)} or {@link #add(Term, int)}.
     */
    public NodePhraseQuery() {
    }

    /**
     * Appends a term to the phrase, placing it one position after the most
     * recently added term. The very first term is placed at position 0.
     *
     * @param term the term to append
     */
    public void add(final Term term) {
        final int count = positions.size();
        // No previous term: start at 0; otherwise continue right after the last one.
        final int nextPosition = (count == 0) ? 0 : positions.get(count - 1).intValue() + 1;
        this.add(term, nextPosition);
    }

    /**
     * Adds a term to the end of the query phrase. The relative position of the
     * term within the phrase is specified explicitly. This allows e.g. phrases
     * with more than one term at the same position or phrases with gaps (e.g. in
     * connection with stopwords).
     * <p>
     * The field of the first term added becomes the field of the phrase; every
     * subsequent term must belong to that same field. The position itself is
     * not validated by this method.
     *
     * @param term the term to add
     * @param position the relative position of the term within the phrase
     * @throws IllegalArgumentException if the term's field differs from the
     *         field of the terms already added
     */
    public void add(final Term term, final int position) {
        if (terms.isEmpty()) {
            field = term.field();
        } else if (!term.field().equals(field)) {
            // Compare field names by value: two equal names are not guaranteed
            // to be the same String instance, so a reference comparison could
            // wrongly reject a term of the same field.
            throw new IllegalArgumentException("All phrase terms must be in the same field: " + term);
        }

        terms.add(term);
        positions.add(Integer.valueOf(position));
        // Track the largest position seen so far.
        if (position > maxPosition) {
            maxPosition = position;
        }
    }

    /**
     * Returns the terms of this phrase, in the order they were added, as a
     * newly allocated array.
     */
    public Term[] getTerms() {
        final Term[] result = new Term[terms.size()];
        return terms.toArray(result);
    }

    /**
     * Returns the relative positions of the terms in this phrase, in the order
     * the terms were added, as a newly allocated array.
     */
    public int[] getPositions() {
        final int[] relativePositions = new int[positions.size()];
        int idx = 0;
        for (final Integer p : positions) {
            relativePositions[idx++] = p.intValue();
        }
        return relativePositions;
    }

    /**
     * Simplifies degenerate phrases. A phrase without terms is rewritten to an
     * empty {@link NodeBooleanQuery} and a phrase with a single term to a
     * {@link NodeTermQuery} on that term; in both cases only the boost of this
     * query is copied onto the replacement. Phrases with two or more terms are
     * delegated to the superclass.
     */
    @Override
    public Query rewrite(final IndexReader reader) throws IOException {
        final int termCount = terms.size();

        if (termCount > 1) {
            return super.rewrite(reader);
        }

        if (termCount == 1) {
            final NodeTermQuery single = new NodeTermQuery(terms.get(0));
            single.setBoost(this.getBoost());
            return single;
        }

        final NodeBooleanQuery empty = new NodeBooleanQuery();
        empty.setBoost(this.getBoost());
        return empty;
    }

    /**
     * {@link Weight} of a {@link NodePhraseQuery}. It gathers the per-term
     * contexts and statistics once at construction time and uses them to
     * build a {@link NodeExactPhraseScorer} for each index segment.
     */
    private class NodePhraseWeight extends Weight {

        private final Similarity similarity;
        private final Similarity.SimWeight stats;
        private transient TermContext[] states;

        /**
         * Builds the term contexts and term statistics for every term of the
         * enclosing phrase and computes the similarity weight from them.
         *
         * @param searcher provides the similarity, the top-level reader
         *        context and the term/collection statistics
         * @throws IOException if building a term context or reading
         *         statistics fails
         */
        public NodePhraseWeight(final IndexSearcher searcher) throws IOException {
            this.similarity = searcher.getSimilarity();
            final IndexReaderContext topContext = searcher.getTopReaderContext();

            final int termCount = terms.size();
            this.states = new TermContext[termCount];
            final TermStatistics[] perTermStats = new TermStatistics[termCount];

            for (int idx = 0; idx < termCount; idx++) {
                final Term current = terms.get(idx);
                final TermContext termContext = TermContext.build(topContext, current, true);
                states[idx] = termContext;
                perTermStats[idx] = searcher.termStatistics(current, termContext);
            }

            this.stats = similarity.computeWeight(NodePhraseQuery.this.getBoost(),
                    searcher.collectionStatistics(field), perTermStats);
        }

        @Override
        public String toString() {
            return "weight(" + NodePhraseQuery.this + ")";
        }

        /** Returns the enclosing {@link NodePhraseQuery}. */
        @Override
        public Query getQuery() {
            return NodePhraseQuery.this;
        }

        /** Delegates to the similarity weight computed at construction. */
        @Override
        public float getValueForNormalization() {
            return stats.getValueForNormalization();
        }

        /** Delegates to the similarity weight computed at construction. */
        @Override
        public void normalize(final float queryNorm, final float topLevelBoost) {
            stats.normalize(queryNorm, topLevelBoost);
        }

        /**
         * Creates the phrase scorer for one segment.
         *
         * @return a {@link NodeExactPhraseScorer}, or {@code null} when the
         *         field or one of the phrase terms is absent from the segment
         * @throws IllegalStateException if no positions enum can be obtained
         *         for a term that exists in the segment
         */
        @Override
        public Scorer scorer(final AtomicReaderContext context, final boolean scoreDocsInOrder,
                final boolean topScorer, final Bits acceptDocs) throws IOException {
            assert !terms.isEmpty();
            final AtomicReader segmentReader = context.reader();

            final Terms fieldTerms = segmentReader.terms(field);
            if (fieldTerms == null) {
                // The field does not exist in this segment: nothing can match.
                return null;
            }

            // A single TermsEnum is repositioned for each term of the phrase.
            final TermsEnum termsEnum = fieldTerms.iterator(null);

            final int termCount = terms.size();
            final PostingsAndPosition[] postings = new PostingsAndPosition[termCount];

            for (int idx = 0; idx < termCount; idx++) {
                final Term current = terms.get(idx);
                final TermState termState = states[idx].get(context.ord);
                if (termState == null) {
                    // The term does not exist in this segment.
                    assert this.termNotInReader(segmentReader, current) : "no termstate found but term exists in reader";
                    return null;
                }
                termsEnum.seekExact(current.bytes(), termState);

                final DocsNodesAndPositionsEnum nodePostings = NodePhraseQuery.this
                        .getDocsNodesAndPositionsEnum(termsEnum.docsAndPositions(acceptDocs, null));

                if (nodePostings == null) {
                    // Phrase query on a field that did not index positions
                    // (maybe not a siren field): the term exists, but has no positions.
                    assert termsEnum.seekExact(current.bytes(), false) : "termstate found but no term exists in reader";
                    throw new IllegalStateException("field \"" + current.field()
                            + "\" was indexed without position data; cannot run NodePhraseQuery (term="
                            + current.text() + ")");
                }

                postings[idx] = new PostingsAndPosition(nodePostings, positions.get(idx).intValue());
            }

            return new NodeExactPhraseScorer(this, postings, similarity.sloppySimScorer(stats, context),
                    similarity.exactSimScorer(stats, context));
        }

        /**
         * Explains the score of a document by listing, for every matching
         * node, the explanation produced by the sloppy similarity scorer for
         * the phrase frequency in that node.
         */
        // TODO: Review this explanation for node match
        @Override
        public Explanation explain(final AtomicReaderContext context, final int doc) throws IOException {
            final NodeScorer nodeScorer = (NodeScorer) this.scorer(context, true, false,
                    context.reader().getLiveDocs());

            // No scorer, or the scorer cannot be positioned on the requested document.
            if (nodeScorer == null || !nodeScorer.skipToCandidate(doc) || nodeScorer.doc() != doc) {
                return new ComplexExplanation(false, 0.0f, "no matching term");
            }

            final SloppySimScorer simScorer = similarity.sloppySimScorer(stats, context);
            // NOTE(review): the value of this top-level explanation is never set.
            final ComplexExplanation explanation = new ComplexExplanation();
            explanation.setDescription("weight(" + this.getQuery() + " in " + doc + ") ["
                    + similarity.getClass().getSimpleName() + "], result of:");

            while (nodeScorer.nextNode()) {
                final float phraseFreq = nodeScorer.freqInNode();
                final Explanation freqExplanation = simScorer.explain(doc,
                        new Explanation(phraseFreq, "phraseFreq=" + phraseFreq));

                final ComplexExplanation perNode = new ComplexExplanation();
                // NOTE(review): the closing parenthesis in this description has no opening counterpart.
                perNode.setDescription("in " + nodeScorer.node() + "), result of:");
                perNode.setValue(freqExplanation.getValue());
                perNode.setMatch(true);
                perNode.addDetail(freqExplanation);

                explanation.addDetail(perNode);
            }

            explanation.setMatch(true);
            return explanation;
        }

        /**
         * Tells whether the reader contains no document for the term.
         * Only called from an assert.
         */
        private boolean termNotInReader(final AtomicReader reader, final Term term) throws IOException {
            final int docFreq = reader.docFreq(term);
            return docFreq == 0;
        }

    }

    /**
     * Creates the {@link Weight} of this query: a new {@code NodePhraseWeight}
     * built from the given searcher.
     *
     * @param searcher the searcher handed to the weight's constructor
     * @throws IOException if constructing the weight fails
     */
    @Override
    public Weight createWeight(final IndexSearcher searcher) throws IOException {
        return new NodePhraseWeight(searcher);
    }

    /**
     * Adds every term of this phrase to the supplied set.
     *
     * @param queryTerms the set receiving the terms of this phrase
     * @see org.apache.lucene.search.Query#extractTerms(java.util.Set)
     */
    @Override
    public void extractTerms(final Set<Term> queryTerms) {
        queryTerms.addAll(terms);
    }

    /**
     * Renders the phrase between double quotes, followed by the boost and
     * wrapped with the datatype. Positions are separated by a space, terms
     * sharing a position are joined with {@code |}, and a position without
     * any term is shown as {@code ?}.
     */
    @Override
    public String toString(final String f) {
        // Collect the text to print for each relative position.
        final String[] slots = new String[maxPosition + 1];
        for (int i = 0; i < terms.size(); i++) {
            final int slot = positions.get(i).intValue();
            final String text = terms.get(i).text();
            slots[slot] = (slots[slot] == null) ? text : slots[slot] + "|" + text;
        }

        final StringBuffer out = new StringBuffer();
        out.append("\"");
        for (int i = 0; i < slots.length; i++) {
            if (i > 0) {
                out.append(' ');
            }
            if (slots[i] == null) {
                // Gap in the phrase: no term at this position.
                out.append('?');
            } else {
                this.escapeDoubleQuote(out, slots[i]);
            }
        }
        out.append("\"");
        out.append(ToStringUtils.boost(this.getBoost()));

        return this.wrapToStringWithDatatype(out).toString();
    }

    /**
     * Appends {@code s} to {@code buffer}, inserting a backslash before every
     * double quote unless the character appended just before it is already a
     * backslash.
     *
     * @param buffer the buffer to append to
     * @param s the text to append
     */
    private void escapeDoubleQuote(final StringBuffer buffer, final String s) {
        int start = 0;
        for (int quote = s.indexOf('"'); quote != -1; quote = s.indexOf('"', start)) {
            // Copy everything up to (but excluding) the quote.
            buffer.append(s, start, quote);
            final boolean alreadyEscaped = buffer.charAt(buffer.length() - 1) == '\\';
            if (!alreadyEscaped) {
                buffer.append('\\');
            }
            buffer.append('"');
            start = quote + 1;
        }
        // Copy whatever follows the last quote.
        buffer.append(s, start, s.length());
    }

    /**
     * Two phrase queries are equal when they have the same boost, terms,
     * positions, level constraint, lower and upper bounds, and datatype.
     */
    @Override
    public boolean equals(final Object o) {
        if (!(o instanceof NodePhraseQuery)) {
            return false;
        }
        final NodePhraseQuery that = (NodePhraseQuery) o;

        if (this.getBoost() != that.getBoost()) {
            return false;
        }
        if (!this.terms.equals(that.terms) || !this.positions.equals(that.positions)) {
            return false;
        }
        if (this.levelConstraint != that.levelConstraint) {
            return false;
        }
        if (this.lowerBound != that.lowerBound || this.upperBound != that.upperBound) {
            return false;
        }
        return StringUtils.equals(this.datatype, that.datatype);
    }

    /**
     * Combines, by exclusive-or, the boost bits, the hash codes of the terms
     * and positions, the level constraint and both bounds.
     */
    @Override
    public int hashCode() {
        int hash = Float.floatToIntBits(this.getBoost());
        hash ^= terms.hashCode();
        hash ^= positions.hashCode();
        hash ^= levelConstraint;
        hash ^= upperBound;
        hash ^= lowerBound;
        return hash;
    }

    /**
     * Immutable holder pairing the postings enum of one phrase term with the
     * relative position of that term within the phrase.
     */
    static class PostingsAndPosition {

        /** Postings enum of the term. */
        final DocsNodesAndPositionsEnum postings;
        /** Relative position of the term within the phrase. */
        final int position;

        /**
         * @param postings the postings enum of the term
         * @param position the relative position of the term within the phrase
         */
        public PostingsAndPosition(final DocsNodesAndPositionsEnum postings, final int position) {
            this.postings = postings;
            this.position = position;
        }

    }

}