Java tutorial: PhraseFilter, a Lucene phrase filter from greplin-lucene-utils
/*
 * Copyright 2013 The greplin-lucene-utils Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.greplin.lucene.filter;

import com.google.common.base.Objects;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.greplin.lucene.index.IndexReaders;
import com.greplin.lucene.util.AllDocsIntersectionProvider;
import com.greplin.lucene.util.Intersection;
import com.greplin.lucene.util.IntersectionProvider;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.FixedBitSet;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.SortedSet;

/**
 * Filters for documents matching a phrase.
 *
 * Differences from PhraseQuery:
 * - faster, with fewer features
 * - supports a single field only
 * - does not compute score
 * - does not support slop
 *
 * Additional features:
 * - supports AND-style intersection queries; this performs
 *   much better than an external BooleanFilter
 *
 * Optimization notes:
 * - Using a shared TermPositions with seeking saves about 10%!
 */
public class PhraseFilter extends Filter {

  /**
   * The terms comprising the phrase.
   */
  private final Term[] terms;

  /**
   * The intersection provider.
   */
  private final IntersectionProvider intersectionProvider;


  /**
   * Construct a new phrase filter.
   * @param intersectionProvider other doc id set to intersect with
   * @param terms the terms in the phrase
   */
  public PhraseFilter(final IntersectionProvider intersectionProvider,
                      final Term... terms) {
    this.terms = Arrays.copyOf(terms, terms.length);
    this.intersectionProvider = intersectionProvider;
  }

  /**
   * Construct a new phrase filter.
   * @param terms the terms in the phrase
   */
  public PhraseFilter(final Term... terms) {
    this(AllDocsIntersectionProvider.INSTANCE, terms);
  }

  /**
   * Construct a new phrase filter.
   * @param field the field to find phrases in
   * @param terms the terms in the phrase
   */
  public PhraseFilter(final String field, final String... terms) {
    this(convertToTerms(field, terms));
  }

  /**
   * Construct a new phrase filter.
   * @param intersectionProvider other doc id set to intersect with
   * @param field the field to find phrases in
   * @param terms the terms in the phrase
   */
  public PhraseFilter(final IntersectionProvider intersectionProvider,
                      final String field,
                      final String... terms) {
    this(intersectionProvider, convertToTerms(field, terms));
  }

  /**
   * Internal utility method that converts a field name and set of values to
   * an array of terms.
   * @param field the field
   * @param values the values
   * @return array of terms, one per value, each with the given field
   */
  private static Term[] convertToTerms(final String field,
                                       final String... values) {
    Term[] terms = new Term[values.length];
    for (int i = 0; i < values.length; i++) {
      terms[i] = new Term(field, values[i]);
    }
    return terms;
  }


  @Override
  public boolean equals(final Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    PhraseFilter that = (PhraseFilter) o;
    return this.intersectionProvider.equals(that.intersectionProvider)
        && Arrays.equals(this.terms, that.terms);
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(
        Arrays.hashCode(this.terms), this.intersectionProvider);
  }

  @Override
  public String toString() {
    return "PhraseFilter{"
        + "terms=" + Arrays.toString(this.terms)
        + ", intersectionProvider=" + this.intersectionProvider
        + '}';
  }


  @Override
  public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
    PhraseFilterMatchList[] results =
        new PhraseFilterMatchList[subReaders.size()];
    int matchCount = 0;
    int readerNumber = 0;

    for (IndexReader subReader : subReaders) {
      SortedSet<TermWithFrequency> termsOrderedByFrequency =
          Sets.newTreeSet();
      for (int i = 0; i < this.terms.length; i++) {
        Term t = this.terms[i];
        termsOrderedByFrequency.add(
            new TermWithFrequency(t, subReader.docFreq(t), i));
      }

      PhraseFilterMatchList matches = null;
      TermPositions termPositions = subReader.termPositions();
      try {
        for (TermWithFrequency term : termsOrderedByFrequency) {
          if (term.docFreq == 0) {
            break;
          }

          termPositions.seek(term.term);

          if (matches == null) {
            // If this is the first term, collect all matches that intersect
            // with the provided initial document set.
            Intersection intersection = this.intersectionProvider.get(reader);
            matches = new PhraseFilterMatchList(term.docFreq);
            while (intersection.advanceToNextIntersection(termPositions)) {
              int freq = termPositions.freq();
              PhraseFilterIntList list = new PhraseFilterIntList(freq);
              for (int i = 0; i < freq; i++) {
                list.add(termPositions.nextPosition() - term.offset);
              }
              matches.add(termPositions.doc(), list);
            }
          } else {
            // Otherwise, intersect with the existing matches.
            matches.intersect(termPositions, term.offset);
          }

          if (matches.getCount() == 0) {
            break;
          }
        }
      } finally {
        termPositions.close();
      }

      if (matches != null) {
        results[readerNumber] = matches;
        matchCount += matches.getCount();
      }
      readerNumber++;
    }

    final int bitsPerIntPowerLogTwo = 5; // 2^5 = 32
    if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
      FixedBitSet result = new FixedBitSet(reader.maxDoc());
      int readerOffset = 0;
      for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
        PhraseFilterMatchList matches = results[readerIndex];
        if (matches != null) {
          int count = matches.getCount();
          int[] docIds = matches.getDocIds();
          for (int i = 0; i < count; i++) {
            result.set(docIds[i] + readerOffset);
          }
        }
        readerOffset += subReaders.get(readerIndex).maxDoc();
      }
      return result;
    } else if (matchCount == 0) {
      return DocIdSets.EMPTY;
    } else {
      int[] result = new int[matchCount];
      int base = 0;
      int readerOffset = 0;
      for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
        PhraseFilterMatchList matches = results[readerIndex];
        if (matches != null) {
          int count = matches.getCount();
          int[] docIds = matches.getDocIds();
          for (int i = 0; i < count; i++) {
            result[base + i] = docIds[i] + readerOffset;
          }
          base += count;
        }
        readerOffset += subReaders.get(readerIndex).maxDoc();
      }
      return new SortedIntArrayDocIdSet(result);
    }
  }


  /**
   * DocId set based on a sorted array of integers.
   * The integer array is not defensively copied - so don't modify it!
   */
  private static final class SortedIntArrayDocIdSet extends DocIdSet {

    /**
     * The sorted array of integers.
     */
    private final int[] ints;


    /**
     * Constructs a new doc id set.
     * @param ints sorted array of integers
     */
    private SortedIntArrayDocIdSet(final int[] ints) {
      this.ints = ints;
    }

    @Override
    public DocIdSetIterator iterator() throws IOException {
      return new SortedIntArrayDocIdSetIterator(this.ints);
    }

    @Override
    public boolean isCacheable() {
      return true;
    }
  }


  /**
   * Iterator for sorted integer array.
   */
  private static final class SortedIntArrayDocIdSetIterator
      extends DocIdSetIterator {

    /**
     * The list of integers.
     */
    private final int[] ints;

    /**
     * The active index.
     */
    private int index = -1;


    /**
     * Constructs an iterator over a sorted integer array.
     * @param ints the array of integers
     */
    private SortedIntArrayDocIdSetIterator(final int[] ints) {
      this.ints = ints;
    }

    @Override
    public int docID() {
      return this.index < this.ints.length
          ? this.ints[this.index] : NO_MORE_DOCS;
    }

    @Override
    public int nextDoc() throws IOException {
      ++this.index;
      return this.index < this.ints.length
          ? this.ints[this.index] : NO_MORE_DOCS;
    }

    @Override
    public int advance(final int target) throws IOException {
      // TODO(robbyw): Consider doing binary search here.
      // Though, in practice, the array is probably small enough that this
      // would actually be slower.
      while (docID() < target) {
        nextDoc();
      }
      return docID();
    }
  }


  /**
   * A term with a frequency and offset.
   */
  private static final class TermWithFrequency
      implements Comparable<TermWithFrequency> {

    /**
     * The term.
     */
    private final Term term;

    /**
     * Its frequency.
     */
    private final int docFreq;

    /**
     * Offset within the phrase.
     */
    private final int offset;


    /**
     * Construct a term with frequency struct.
     * @param term the term
     * @param docFreq its frequency
     * @param offset offset within the phrase
     */
    private TermWithFrequency(final Term term,
                              final int docFreq,
                              final int offset) {
      this.term = term;
      this.docFreq = docFreq;
      this.offset = offset;
    }

    @Override
    public int compareTo(final TermWithFrequency o) {
      int first = Ints.compare(this.docFreq, o.docFreq);
      return first == 0 ? Ints.compare(this.offset, o.offset) : first;
    }
  }
}
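
A note on how getDocIdSet decides what to return. Terms are processed rarest-first (the TreeSet orders TermWithFrequency by ascending docFreq), so the initial match list starts as small as possible and each intersection shrinks it further. The final representation is then chosen by density: a FixedBitSet costs roughly maxDoc / 8 bytes no matter how many documents match, while the sorted int array costs 4 bytes per match, so the two break even at matchCount = maxDoc / 32, which is exactly the `maxDoc() >> 5` threshold in the code. A small sketch of that arithmetic, with purely illustrative numbers (the class name and values below are not from the source):

    /** Illustrative arithmetic for the bit-set vs. sorted-array cutoff. */
    public final class DensityCutoffSketch {
      public static void main(final String[] args) {
        int maxDoc = 1000000;    // hypothetical index size
        int matchCount = 40000;  // hypothetical number of phrase matches

        long bitSetBytes = maxDoc / 8;        // FixedBitSet: one bit per document
        long intArrayBytes = 4L * matchCount; // int[]: four bytes per match
        int cutoff = maxDoc >> 5;             // maxDoc / 32, the filter's threshold

        System.out.println("bit set:   " + bitSetBytes + " bytes");   // 125000
        System.out.println("int array: " + intArrayBytes + " bytes"); // 160000
        System.out.println("cutoff:    " + cutoff + " matches");      // 31250
        // matchCount (40000) is above the cutoff (31250), so the bit set
        // (125000 bytes) is the smaller representation here, which is why
        // getDocIdSet switches to FixedBitSet for dense result sets.
      }
    }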
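
Finally, a minimal usage sketch. It assumes Lucene 3.x (matching the IndexReader/TermPositions APIs above) and an existing index; the directory path "indexDir", the field name "body", and the phrase terms are placeholders, and the terms must match what was actually indexed (i.e. already analyzed/lowercased). Since a Filter produces no scores, the sketch wraps it in a ConstantScoreQuery; it could equally be combined with a scoring query via FilteredQuery.

    import com.greplin.lucene.filter.PhraseFilter;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.ConstantScoreQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    import java.io.File;
    import java.io.IOException;

    public final class PhraseFilterExample {
      public static void main(final String[] args) throws IOException {
        // Hypothetical index location and field name - adjust to your index.
        Directory dir = FSDirectory.open(new File("indexDir"));
        IndexReader reader = IndexReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
          // Match documents whose "body" field contains the exact phrase
          // "quick brown fox".
          PhraseFilter filter = new PhraseFilter("body", "quick", "brown", "fox");

          // A Filter does not score documents; wrap it in a
          // ConstantScoreQuery so the searcher can run it directly.
          Query query = new ConstantScoreQuery(filter);
          TopDocs hits = searcher.search(query, 10);
          System.out.println("matches: " + hits.totalHits);
        } finally {
          searcher.close();
          reader.close();
          dir.close();
        }
      }
    }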