org.apache.lucene.search.TopScoreDocCollector.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.search.TopScoreDocCollector.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.Collection;

import org.apache.lucene.index.LeafReaderContext;

/**
 * A {@link Collector} implementation that collects the top-scoring hits,
 * returning them as a {@link TopDocs}. This is used by {@link IndexSearcher} to
 * implement {@link TopDocs}-based search. Hits are sorted by score descending
 * and then (when the scores are tied) docID ascending. When you create an
 * instance of this collector you should know in advance whether documents are
 * going to be collected in doc Id order or not.
 *
 * <p><b>NOTE</b>: The values {@link Float#NaN} and
 * {@link Float#NEGATIVE_INFINITY} are not valid scores.  This
 * collector will not properly collect hits with such
 * scores.
 */
public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {

    /**
     * Base {@link LeafCollector} that remembers the {@link Scorable} handed to
     * {@link #setScorer(Scorable)} so that subclasses can read scores from it
     * while collecting.
     */
    abstract static class ScorerLeafCollector implements LeafCollector {

        /** The scorer most recently passed to {@link #setScorer(Scorable)}. */
        Scorable scorer;

        /** Stores {@code scorer} in the {@link #scorer} field for later use. */
        @Override
        public void setScorer(Scorable scorer) throws IOException {
            this.scorer = scorer;
        }
    }

    /**
     * Collector used when no previous page is supplied: it keeps the best-scoring
     * hits seen so far in the priority queue, replacing the queue's top entry
     * whenever a strictly better-scoring document is collected.
     */
    private static class SimpleTopScoreDocCollector extends TopScoreDocCollector {

        SimpleTopScoreDocCollector(int numHits, HitsThresholdChecker hitsThresholdChecker) {
            super(numHits, hitsThresholdChecker);
        }

        /**
         * Returns a per-segment collector that translates segment-local doc ids to
         * top-level ones using {@code context.docBase}.
         */
        @Override
        public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
            // Added to every segment-local doc id before it is stored in the queue.
            final int segmentBase = context.docBase;

            return new ScorerLeafCollector() {

                @Override
                public void setScorer(Scorable scorer) throws IOException {
                    super.setScorer(scorer);
                    // Hand the current threshold (if any applies yet) to the new scorer.
                    updateMinCompetitiveScore(scorer);
                }

                @Override
                public void collect(int doc) throws IOException {
                    final float hitScore = scorer.score();

                    // Scores are expected to be non-negative; this collector depends on it.
                    assert hitScore >= 0; // NOTE: false for NaN

                    totalHits++;
                    hitsThresholdChecker.incrementHitCount();

                    if (hitScore <= pqTop.score) {
                        // Docs arrive in increasing doc id order and HitQueue prefers lower
                        // doc ids on ties, so an equal score cannot displace the current top:
                        // such hits are rejected together with the strictly lower ones.
                        final boolean countStillExact = totalHitsRelation == TotalHits.Relation.EQUAL_TO;
                        if (countStillExact && hitsThresholdChecker.isThresholdReached()) {
                            // totalHitsThreshold has just been reached, so the min
                            // competitive score can be set from now on.
                            updateMinCompetitiveScore(scorer);
                        }
                        return;
                    }

                    // Reuse the entry currently at the top of the queue for this hit, then
                    // ask the queue for its new top.
                    final ScoreDoc reused = pqTop;
                    reused.doc = segmentBase + doc;
                    reused.score = hitScore;
                    pqTop = pq.updateTop();
                    updateMinCompetitiveScore(scorer);
                }

            };
        }
    }

    /**
     * Collector used for paging: hits that sort at or before {@code after} (higher
     * score, or equal score with a doc id not greater than {@code after.doc}) were
     * returned on a previous page and are counted but never queued.
     */
    private static class PagingTopScoreDocCollector extends TopScoreDocCollector {

        /** Last hit of the previous page; only hits sorting after it are collected. */
        private final ScoreDoc after;
        /** Number of hits actually placed in the queue (previous-page hits excluded). */
        private int collectedHits;

        PagingTopScoreDocCollector(int numHits, ScoreDoc after, HitsThresholdChecker hitsThresholdChecker) {
            super(numHits, hitsThresholdChecker);
            this.after = after;
            this.collectedHits = 0;
        }

        /** Returns the smaller of the number of queued hits and the queue size. */
        @Override
        protected int topDocsSize() {
            return Math.min(collectedHits, pq.size());
        }

        /**
         * Builds the result, substituting an empty array for {@code null} results so
         * that the total hit count and its relation are still reported.
         */
        @Override
        protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
            final ScoreDoc[] hits = results == null ? new ScoreDoc[0] : results;
            return new TopDocs(new TotalHits(totalHits, totalHitsRelation), hits);
        }

        /**
         * Returns a per-segment collector. The min competitive score is pushed to the
         * scorer in the same situations as in {@code SimpleTopScoreDocCollector}: when
         * the scorer is set, when a hit is queued, and when the hit threshold is first
         * reached on a rejected hit.
         */
        @Override
        public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
            final int docBase = context.docBase;
            // after.doc expressed relative to this segment, for comparison with local ids.
            final int afterDoc = after.doc - context.docBase;

            return new ScorerLeafCollector() {

                @Override
                public void setScorer(Scorable scorer) throws IOException {
                    super.setScorer(scorer);
                    // Consistent with the non-paging collector: give a freshly set scorer
                    // the current threshold instead of waiting for the next queued hit.
                    updateMinCompetitiveScore(scorer);
                }

                @Override
                public void collect(int doc) throws IOException {
                    float score = scorer.score();

                    // This collector relies on the fact that scorers produce positive values:
                    assert score >= 0; // NOTE: false for NaN

                    totalHits++;
                    hitsThresholdChecker.incrementHitCount();

                    if (score > after.score || (score == after.score && doc <= afterDoc)) {
                        // hit was collected on a previous page
                        updateMinCompetitiveScoreOnThreshold();
                        return;
                    }

                    if (score <= pqTop.score) {
                        // Since docs are returned in-order (i.e., increasing doc Id), a document
                        // with equal score to pqTop.score cannot compete since HitQueue favors
                        // documents with lower doc Ids. Therefore reject those docs too.
                        // The threshold may have been reached by this very hit, so check it
                        // here as well; otherwise skipping would be delayed until the next
                        // competitive hit, which may never come.
                        updateMinCompetitiveScoreOnThreshold();
                        return;
                    }
                    collectedHits++;
                    pqTop.doc = doc + docBase;
                    pqTop.score = score;
                    pqTop = pq.updateTop();
                    updateMinCompetitiveScore(scorer);
                }

                /**
                 * While the hit count is still exact, starts setting the min competitive
                 * score as soon as totalHitsThreshold is reached.
                 */
                private void updateMinCompetitiveScoreOnThreshold() throws IOException {
                    if (totalHitsRelation == TotalHits.Relation.EQUAL_TO
                            && hitsThresholdChecker.isThresholdReached()) {
                        updateMinCompetitiveScore(scorer);
                    }
                }
            };
        }
    }

    /**
     * Builds a {@link TopScoreDocCollector} for the first page of results, given
     * the number of hits to collect and the number of hits to count accurately.
     *
     * <p><b>NOTE</b>: When the total hit count of the top docs is at most
     * {@code totalHitsThreshold}, the reported count is accurate. When
     * {@link TopDocs#totalHits} is greater than {@code totalHitsThreshold}, it is
     * only a lower bound of the hit count. Passing {@link Integer#MAX_VALUE}
     * makes the hit count accurate but will also likely make query processing slower.
     * <p><b>NOTE</b>: The instances returned by this method
     * pre-allocate a full array of length <code>numHits</code>, and fill the
     * array with sentinel objects.
     */
    public static TopScoreDocCollector create(int numHits, int totalHitsThreshold) {
        // No previous page: delegate with a null "after" hit.
        final ScoreDoc noPreviousPage = null;
        return create(numHits, noPreviousPage, totalHitsThreshold);
    }

    /**
     * Builds a {@link TopScoreDocCollector} given the number of hits to collect,
     * the bottom of the previous page, and the number of hits to count accurately.
     *
     * <p><b>NOTE</b>: When the total hit count of the top docs is at most
     * {@code totalHitsThreshold}, the reported count is accurate. When
     * {@link TopDocs#totalHits} is greater than {@code totalHitsThreshold}, it is
     * only a lower bound of the hit count. Passing {@link Integer#MAX_VALUE}
     * makes the hit count accurate but will also likely make query processing slower.
     * <p><b>NOTE</b>: The instances returned by this method
     * pre-allocate a full array of length <code>numHits</code>, and fill the
     * array with sentinel objects.
     */
    public static TopScoreDocCollector create(int numHits, ScoreDoc after, int totalHitsThreshold) {
        // A checker private to the returned collector, built from the threshold.
        final HitsThresholdChecker checker = HitsThresholdChecker.create(totalHitsThreshold);
        return create(numHits, after, checker);
    }

    /**
     * Validates the arguments and instantiates the matching implementation: the
     * simple collector when {@code after} is {@code null}, the paging collector
     * otherwise.
     *
     * @throws IllegalArgumentException if {@code numHits} is not positive or
     *         {@code hitsThresholdChecker} is {@code null}
     */
    static TopScoreDocCollector create(int numHits, ScoreDoc after, HitsThresholdChecker hitsThresholdChecker) {

        if (numHits <= 0) {
            throw new IllegalArgumentException(
                    "numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
        }

        if (hitsThresholdChecker == null) {
            throw new IllegalArgumentException("hitsThresholdChecker must be non null");
        }

        return after == null
                ? new SimpleTopScoreDocCollector(numHits, hitsThresholdChecker)
                : new PagingTopScoreDocCollector(numHits, after, hitsThresholdChecker);
    }

    /**
     * Creates a {@link CollectorManager} whose collectors all share one hit
     * counter. A single shared {@link HitsThresholdChecker} is built from
     * {@code totalHitsThreshold} and passed to every collector returned by
     * {@code newCollector()}; {@code reduce} merges the collectors' top docs
     * into at most {@code numHits} results.
     */
    public static CollectorManager<TopScoreDocCollector, TopDocs> createSharedManager(int numHits, FieldDoc after,
            int totalHitsThreshold) {
        return new CollectorManager<TopScoreDocCollector, TopDocs>() {

            // One checker for the manager's lifetime, handed to each new collector.
            private final HitsThresholdChecker hitsThresholdChecker = HitsThresholdChecker
                    .createShared(totalHitsThreshold);

            @Override
            public TopScoreDocCollector newCollector() throws IOException {
                return TopScoreDocCollector.create(numHits, after, hitsThresholdChecker);
            }

            @Override
            public TopDocs reduce(Collection<TopScoreDocCollector> collectors) throws IOException {
                // One TopDocs per collector, in the collection's iteration order.
                final TopDocs[] perCollector = collectors.stream()
                        .map(collector -> collector.topDocs())
                        .toArray(TopDocs[]::new);
                return TopDocs.merge(numHits, perCollector);
            }

        };
    }

    /** Entry currently at the top of {@code pq}; hits scoring at or below it are rejected. */
    ScoreDoc pqTop;
    /** Counts collected hits and reports when the accurate-count threshold has been reached. */
    final HitsThresholdChecker hitsThresholdChecker;

    /**
     * Non-public constructor, which prevents instantiation from outside the
     * package. Sets up a {@link HitQueue} of {@code numHits} entries and
     * remembers its initial top.
     */
    TopScoreDocCollector(int numHits, HitsThresholdChecker hitsThresholdChecker) {
        super(new HitQueue(numHits, true));
        assert hitsThresholdChecker != null;

        this.hitsThresholdChecker = hitsThresholdChecker;

        // HitQueue supplies a ScoreDoc through getSentinelObject, so top() is
        // already initialized at this point.
        this.pqTop = pq.top();
    }

    /**
     * Wraps {@code results} together with the current total hit count and its
     * relation, or returns {@code EMPTY_TOPDOCS} when {@code results} is
     * {@code null}.
     */
    @Override
    protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
        return results == null
                ? EMPTY_TOPDOCS
                : new TopDocs(new TotalHits(totalHits, totalHitsRelation), results);
    }

    /** Returns the score mode reported by this collector's {@link HitsThresholdChecker}. */
    @Override
    public ScoreMode scoreMode() {
        return hitsThresholdChecker.scoreMode();
    }

    /**
     * Raises the scorer's minimum competitive score to just above the score of
     * the queue's top entry, once the hit threshold has been reached and the top
     * entry is a real hit. Doing so also marks the total hit count as a lower
     * bound.
     *
     * @param scorer the scorer to notify
     * @throws IOException if the scorer fails to accept the new minimum score
     */
    protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
        if (!hitsThresholdChecker.isThresholdReached()) {
            return;
        }
        // -Infinity is the score of sentinels: there is no threshold to set yet.
        if (pqTop == null || pqTop.score == Float.NEGATIVE_INFINITY) {
            return;
        }
        // Ties are broken on doc id and docs are collected in doc id order, so
        // only a strictly greater score can compete: require the next float.
        final float nextCompetitive = Math.nextUp(pqTop.score);
        scorer.setMinCompetitiveScore(nextCompetitive);
        totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
    }
}