org.apache.lucene.search.CachingCollector.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.search.CachingCollector.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.ArrayUtil;

/**
 * Caches all docs, and optionally also scores, coming from
 * a search, and is then able to replay them to another
 * collector.  You specify the max RAM this class may use.
 * Once the collection is done, call {@link #isCached}. If
 * this returns true, you can use {@link #replay(Collector)}
 * against a new collector.  If it returns false, this means
 * too much RAM was required and you must instead re-run the
 * original search.
 *
 * <p><b>NOTE</b>: this class consumes 4 (or 8 bytes, if
 * scoring is cached) per collected document.  If the result
 * set is large this can easily be a very substantial amount
 * of RAM!
 *
 * <p>See the Lucene <tt>modules/grouping</tt> module for more
 * details including a full code example.</p>
 *
 * @lucene.experimental
 */
public abstract class CachingCollector extends FilterCollector {

    private static final int INITIAL_ARRAY_SIZE = 128;

    private static final class CachedScorable extends Scorable {

        // NOTE: these members are package-private b/c that way accessing them from
        // the outer class does not incur access check by the JVM. The same
        // situation would be if they were defined in the outer class as private
        // members.
        int doc;
        float score;

        @Override
        public final float score() {
            return score;
        }

        @Override
        public int docID() {
            return doc;
        }

    }

    private static class NoScoreCachingCollector extends CachingCollector {

        List<LeafReaderContext> contexts;
        List<int[]> docs;
        int maxDocsToCache;
        NoScoreCachingLeafCollector lastCollector;

        NoScoreCachingCollector(Collector in, int maxDocsToCache) {
            super(in);
            this.maxDocsToCache = maxDocsToCache;
            contexts = new ArrayList<>();
            docs = new ArrayList<>();
        }

        protected NoScoreCachingLeafCollector wrap(LeafCollector in, int maxDocsToCache) {
            return new NoScoreCachingLeafCollector(in, maxDocsToCache);
        }

        // note: do *not* override needScore to say false. Just because we aren't caching the score doesn't mean the
        //   wrapped collector doesn't need it to do its job.

        public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
            postCollection();
            final LeafCollector in = this.in.getLeafCollector(context);
            if (contexts != null) {
                contexts.add(context);
            }
            if (maxDocsToCache >= 0) {
                return lastCollector = wrap(in, maxDocsToCache);
            } else {
                return in;
            }
        }

        protected void invalidate() {
            maxDocsToCache = -1;
            contexts = null;
            this.docs = null;
        }

        protected void postCollect(NoScoreCachingLeafCollector collector) {
            final int[] docs = collector.cachedDocs();
            maxDocsToCache -= docs.length;
            this.docs.add(docs);
        }

        private void postCollection() {
            if (lastCollector != null) {
                if (!lastCollector.hasCache()) {
                    invalidate();
                } else {
                    postCollect(lastCollector);
                }
                lastCollector = null;
            }
        }

        protected void collect(LeafCollector collector, int i) throws IOException {
            final int[] docs = this.docs.get(i);
            for (int doc : docs) {
                collector.collect(doc);
            }
        }

        public void replay(Collector other) throws IOException {
            postCollection();
            if (!isCached()) {
                throw new IllegalStateException(
                        "cannot replay: cache was cleared because too much RAM was required");
            }
            assert docs.size() == contexts.size();
            for (int i = 0; i < contexts.size(); ++i) {
                final LeafReaderContext context = contexts.get(i);
                final LeafCollector collector = other.getLeafCollector(context);
                collect(collector, i);
            }
        }

    }

    private static class ScoreCachingCollector extends NoScoreCachingCollector {

        List<float[]> scores;

        ScoreCachingCollector(Collector in, int maxDocsToCache) {
            super(in, maxDocsToCache);
            scores = new ArrayList<>();
        }

        protected NoScoreCachingLeafCollector wrap(LeafCollector in, int maxDocsToCache) {
            return new ScoreCachingLeafCollector(in, maxDocsToCache);
        }

        @Override
        protected void postCollect(NoScoreCachingLeafCollector collector) {
            final ScoreCachingLeafCollector coll = (ScoreCachingLeafCollector) collector;
            super.postCollect(coll);
            scores.add(coll.cachedScores());
        }

        /** Ensure the scores are collected so they can be replayed, even if the wrapped collector doesn't need them. */
        @Override
        public ScoreMode scoreMode() {
            return ScoreMode.COMPLETE;
        }

        @Override
        protected void collect(LeafCollector collector, int i) throws IOException {
            final int[] docs = this.docs.get(i);
            final float[] scores = this.scores.get(i);
            assert docs.length == scores.length;
            final CachedScorable scorer = new CachedScorable();
            collector.setScorer(scorer);
            for (int j = 0; j < docs.length; ++j) {
                scorer.doc = docs[j];
                scorer.score = scores[j];
                collector.collect(scorer.doc);
            }
        }
    }

    private class NoScoreCachingLeafCollector extends FilterLeafCollector {

        final int maxDocsToCache;
        int[] docs;
        int docCount;

        NoScoreCachingLeafCollector(LeafCollector in, int maxDocsToCache) {
            super(in);
            this.maxDocsToCache = maxDocsToCache;
            docs = new int[Math.min(maxDocsToCache, INITIAL_ARRAY_SIZE)];
            docCount = 0;
        }

        protected void grow(int newLen) {
            docs = ArrayUtil.growExact(docs, newLen);
        }

        protected void invalidate() {
            docs = null;
            docCount = -1;
            cached = false;
        }

        protected void buffer(int doc) throws IOException {
            docs[docCount] = doc;
        }

        @Override
        public void collect(int doc) throws IOException {
            if (docs != null) {
                if (docCount >= docs.length) {
                    if (docCount >= maxDocsToCache) {
                        invalidate();
                    } else {
                        final int newLen = Math.min(ArrayUtil.oversize(docCount + 1, Integer.BYTES),
                                maxDocsToCache);
                        grow(newLen);
                    }
                }
                if (docs != null) {
                    buffer(doc);
                    ++docCount;
                }
            }
            super.collect(doc);
        }

        boolean hasCache() {
            return docs != null;
        }

        int[] cachedDocs() {
            return docs == null ? null : ArrayUtil.copyOfSubArray(docs, 0, docCount);
        }

    }

    private class ScoreCachingLeafCollector extends NoScoreCachingLeafCollector {

        Scorable scorer;
        float[] scores;

        ScoreCachingLeafCollector(LeafCollector in, int maxDocsToCache) {
            super(in, maxDocsToCache);
            scores = new float[docs.length];
        }

        @Override
        public void setScorer(Scorable scorer) throws IOException {
            this.scorer = scorer;
            super.setScorer(scorer);
        }

        @Override
        protected void grow(int newLen) {
            super.grow(newLen);
            scores = ArrayUtil.growExact(scores, newLen);
        }

        @Override
        protected void invalidate() {
            super.invalidate();
            scores = null;
        }

        @Override
        protected void buffer(int doc) throws IOException {
            super.buffer(doc);
            scores[docCount] = scorer.score();
        }

        float[] cachedScores() {
            return docs == null ? null : ArrayUtil.copyOfSubArray(scores, 0, docCount);
        }
    }

    /**
     * Creates a {@link CachingCollector} which does not wrap another collector.
     * The cached documents and scores can later be {@link #replay(Collector)
     * replayed}.
     */
    public static CachingCollector create(boolean cacheScores, double maxRAMMB) {
        Collector other = new SimpleCollector() {

            @Override
            public void collect(int doc) {
            }

            @Override
            public ScoreMode scoreMode() {
                return ScoreMode.COMPLETE;
            }

        };
        return create(other, cacheScores, maxRAMMB);
    }

    /**
     * Create a new {@link CachingCollector} that wraps the given collector and
     * caches documents and scores up to the specified RAM threshold.
     *
     * @param other
     *          the Collector to wrap and delegate calls to.
     * @param cacheScores
     *          whether to cache scores in addition to document IDs. Note that
     *          this increases the RAM consumed per doc
     * @param maxRAMMB
     *          the maximum RAM in MB to consume for caching the documents and
     *          scores. If the collector exceeds the threshold, no documents and
     *          scores are cached.
     */
    public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
        int bytesPerDoc = Integer.BYTES;
        if (cacheScores) {
            bytesPerDoc += Float.BYTES;
        }
        final int maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
        return create(other, cacheScores, maxDocsToCache);
    }

    /**
     * Create a new {@link CachingCollector} that wraps the given collector and
     * caches documents and scores up to the specified max docs threshold.
     *
     * @param other
     *          the Collector to wrap and delegate calls to.
     * @param cacheScores
     *          whether to cache scores in addition to document IDs. Note that
     *          this increases the RAM consumed per doc
     * @param maxDocsToCache
     *          the maximum number of documents for caching the documents and
     *          possible the scores. If the collector exceeds the threshold,
     *          no documents and scores are cached.
     */
    public static CachingCollector create(Collector other, boolean cacheScores, int maxDocsToCache) {
        return cacheScores ? new ScoreCachingCollector(other, maxDocsToCache)
                : new NoScoreCachingCollector(other, maxDocsToCache);
    }

    private boolean cached;

    private CachingCollector(Collector in) {
        super(in);
        cached = true;
    }

    /**
     * Return true is this collector is able to replay collection.
     */
    public final boolean isCached() {
        return cached;
    }

    /**
     * Replays the cached doc IDs (and scores) to the given Collector. If this
     * instance does not cache scores, then Scorer is not set on
     * {@code other.setScorer} as well as scores are not replayed.
     *
     * @throws IllegalStateException
     *           if this collector is not cached (i.e., if the RAM limits were too
     *           low for the number of documents + scores to cache).
     * @throws IllegalArgumentException
     *           if the given Collect's does not support out-of-order collection,
     *           while the collector passed to the ctor does.
     */
    public abstract void replay(Collector other) throws IOException;

}