org.apache.lucene.util.DocIdSetBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.util.DocIdSetBuilder.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.packed.PackedInts;

/**
 * A builder of {@link DocIdSet}s.  At first it uses a sparse structure to gather
 * documents, and then upgrades to a non-sparse bit set once enough hits match.
 *
 * <p>To add documents, you first need to call {@link #grow} in order to reserve
 * space, and then call {@link BulkAdder#add(int)} on the returned
 * {@link BulkAdder}.
 *
 * <p>A builder is single-use: {@link #build()} drops the internal buffers and
 * bit set, so the instance must not be used afterwards.
 *
 * @lucene.internal
 */
public final class DocIdSetBuilder {

    /** Utility class to efficiently add many docs in one go.
     *  @see DocIdSetBuilder#grow */
    public abstract static class BulkAdder {
        /** Add a single doc ID. Space must have been reserved beforehand
         *  through {@link DocIdSetBuilder#grow}. */
        public abstract void add(int doc);
    }

    /** {@link BulkAdder} used on the dense path: sets the bit of every added doc. */
    private static final class FixedBitSetAdder extends BulkAdder {
        final FixedBitSet bitSet;

        FixedBitSetAdder(FixedBitSet bitSet) {
            this.bitSet = bitSet;
        }

        @Override
        public void add(int doc) {
            bitSet.set(doc);
        }
    }

    /** A growable int[] together with the number of slots that are in use. */
    private static final class Buffer {
        int[] array;
        int length; // number of valid entries at the start of array

        /** Create an empty buffer backed by a fresh array of the given capacity. */
        Buffer(int length) {
            this.array = new int[length];
            this.length = 0;
        }

        /** Wrap an existing array whose first {@code length} entries are valid. */
        Buffer(int[] array, int length) {
            this.array = array;
            this.length = length;
        }
    }

    /** {@link BulkAdder} used on the sparse path: appends docs to a {@link Buffer}. */
    private static final class BufferAdder extends BulkAdder {
        final Buffer buffer;

        BufferAdder(Buffer buffer) {
            this.buffer = buffer;
        }

        @Override
        public void add(int doc) {
            // no bounds check here: capacity was reserved by grow()
            buffer.array[buffer.length++] = doc;
        }
    }

    private final int maxDoc;
    // maximum number of buffered slots before switching to a bit set
    private final int threshold;
    // pkg-private for testing
    final boolean multivalued;
    final double numValuesPerDoc;

    // sparse representation; null once upgraded to a bit set or after build()
    private List<Buffer> buffers = new ArrayList<>();
    private int totalAllocated; // accumulated size of the allocated buffers

    // dense representation; null while the sparse representation is in use
    private FixedBitSet bitSet;

    // number of values reserved/added on the dense path; -1 until the upgrade happens
    private long counter = -1;
    private BulkAdder adder;

    /**
     * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
     */
    public DocIdSetBuilder(int maxDoc) {
        this(maxDoc, -1, -1);
    }

    /** Create a {@link DocIdSetBuilder} instance that is optimized for
     *  accumulating docs that match the given {@link Terms}. */
    public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
        this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
    }

    /** Create a {@link DocIdSetBuilder} instance that is optimized for
     *  accumulating docs that match the given {@link PointValues}.
     *  The {@code field} argument is not read by this constructor. */
    public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
        this(maxDoc, values.getDocCount(), values.size());
    }

    /**
     * Shared constructor.
     *
     * @param maxDoc     exclusive upper bound of the doc IDs that can be added
     * @param docCount   number of docs that have a value, or a negative number if unknown
     * @param valueCount total number of values, or a negative number if unknown
     */
    DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
        this.maxDoc = maxDoc;
        // unknown stats, or more values than docs: duplicates are possible
        this.multivalued = docCount < 0 || docCount != valueCount;
        if (docCount <= 0 || valueCount < 0) {
            // assume one value per doc, this means the cost will be overestimated
            // if the docs are actually multi-valued
            this.numValuesPerDoc = 1;
        } else {
            // otherwise compute from index stats
            this.numValuesPerDoc = (double) valueCount / docCount;
        }

        assert numValuesPerDoc >= 1 : "valueCount=" + valueCount + " docCount=" + docCount;

        // For ridiculously small sets, we'll just use a sorted int[]
        // maxDoc >>> 7 is a good value if you want to save memory, lower values
        // such as maxDoc >>> 11 should provide faster building but at the expense
        // of using a full bitset even for quite sparse data
        this.threshold = maxDoc >>> 7;

        this.bitSet = null;
    }

    /**
     * Add the content of the provided {@link DocIdSetIterator} to this builder.
     * NOTE: if you need to build a {@link DocIdSet} out of a single
     * {@link DocIdSetIterator}, you should rather use {@link RoaringDocIdSet.Builder}.
     *
     * @param iter the iterator to consume; it is advanced with {@code nextDoc()}
     *             until exhausted
     * @throws IOException if the iterator fails while being consumed
     */
    public void add(DocIdSetIterator iter) throws IOException {
        if (bitSet != null) {
            // Account for the incoming docs the same way grow() does on the dense
            // path, otherwise the cost computed in build() would be underestimated.
            counter += Math.min(Integer.MAX_VALUE, iter.cost());
            bitSet.or(iter);
            return;
        }
        int cost = (int) Math.min(Integer.MAX_VALUE, iter.cost());
        BulkAdder adder = grow(cost);
        for (int i = 0; i < cost; ++i) {
            int doc = iter.nextDoc();
            if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                return;
            }
            adder.add(doc);
        }
        // the iterator returned more docs than its cost announced: reserve one by one
        for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
            grow(1).add(doc);
        }
    }

    /**
     * Reserve space and return a {@link BulkAdder} object that can be used to
     * add up to {@code numDocs} documents.
     *
     * @param numDocs the number of documents the caller is about to add
     * @return the adder to use for the next {@code numDocs} documents
     */
    public BulkAdder grow(int numDocs) {
        if (bitSet == null) {
            // widen to long so that the sum cannot overflow
            if ((long) totalAllocated + numDocs <= threshold) {
                ensureBufferCapacity(numDocs);
            } else {
                upgradeToBitSet();
                counter += numDocs;
            }
        } else {
            counter += numDocs;
        }
        return adder;
    }

    /** Make sure that the last buffer has room for {@code numDocs} more entries,
     *  growing it or appending a new buffer if needed. */
    private void ensureBufferCapacity(int numDocs) {
        if (buffers.isEmpty()) {
            addBuffer(additionalCapacity(numDocs));
            return;
        }

        Buffer current = buffers.get(buffers.size() - 1);
        if (current.array.length - current.length >= numDocs) {
            // current buffer is large enough
            return;
        }
        if (current.length < current.array.length - (current.array.length >>> 3)) {
            // current buffer is less than 7/8 full, resize rather than waste space
            growBuffer(current, additionalCapacity(numDocs));
        } else {
            addBuffer(additionalCapacity(numDocs));
        }
    }

    /** Compute how many slots to allocate in order to store {@code numDocs}
     *  more docs, never exceeding what is left before {@link #threshold}. */
    private int additionalCapacity(int numDocs) {
        // exponential growth: the new array has a size equal to the sum of what
        // has been allocated so far
        int c = totalAllocated;
        // but is also >= numDocs + 1 so that we can store the next batch of docs
        // (plus an empty slot so that we are more likely to reuse the array in build())
        c = Math.max(numDocs + 1, c);
        // avoid cold starts
        c = Math.max(32, c);
        // do not go beyond the threshold
        c = Math.min(threshold - totalAllocated, c);
        return c;
    }

    /** Append a new empty buffer of capacity {@code len} and make it the target
     *  of the current {@link #adder}. */
    private Buffer addBuffer(int len) {
        Buffer buffer = new Buffer(len);
        buffers.add(buffer);
        adder = new BufferAdder(buffer);
        totalAllocated += buffer.array.length;
        return buffer;
    }

    /** Enlarge the backing array of {@code buffer} by {@code additionalCapacity} slots. */
    private void growBuffer(Buffer buffer, int additionalCapacity) {
        buffer.array = ArrayUtil.growExact(buffer.array, buffer.array.length + additionalCapacity);
        totalAllocated += additionalCapacity;
    }

    /** Switch from the sparse buffers to a {@link FixedBitSet}, replaying every
     *  buffered doc into it and initializing {@link #counter}. */
    private void upgradeToBitSet() {
        assert bitSet == null;
        FixedBitSet bitSet = new FixedBitSet(maxDoc);
        long counter = 0;
        for (Buffer buffer : buffers) {
            int[] array = buffer.array;
            int length = buffer.length;
            counter += length;
            for (int i = 0; i < length; ++i) {
                bitSet.set(array[i]);
            }
        }
        this.bitSet = bitSet;
        this.counter = counter;
        this.buffers = null;
        this.adder = new FixedBitSetAdder(bitSet);
    }

    /**
     * Build a {@link DocIdSet} from the accumulated doc IDs.
     * The internal buffers and bit set are released when this method returns,
     * so the builder must not be used afterwards.
     *
     * @return the set of all doc IDs that were added to this builder
     */
    public DocIdSet build() {
        try {
            if (bitSet != null) {
                assert counter >= 0;
                // counter counts values, divide to get an estimate of the number of docs
                final long cost = Math.round(counter / numValuesPerDoc);
                return new BitDocIdSet(bitSet, cost);
            } else {
                Buffer concatenated = concat(buffers);
                LSBRadixSorter sorter = new LSBRadixSorter();
                // clamp to 0: with maxDoc == 0 there is nothing to sort, but
                // bitsRequired rejects negative values
                final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
                sorter.sort(numBits, concatenated.array, concatenated.length);
                final int l;
                if (multivalued) {
                    l = dedup(concatenated.array, concatenated.length);
                } else {
                    assert noDups(concatenated.array, concatenated.length);
                    l = concatenated.length;
                }
                assert l <= concatenated.length;
                // concat() always leaves at least one free slot for the sentinel
                concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS;
                return new IntArrayDocIdSet(concatenated.array, l);
            }
        } finally {
            this.buffers = null;
            this.bitSet = null;
        }
    }

    /**
     * Concatenate the buffers in any order, leaving at least one empty slot in
     * the end
     * NOTE: this method might reuse one of the arrays
     */
    private static Buffer concat(List<Buffer> buffers) {
        int totalLength = 0;
        Buffer largestBuffer = null;
        for (Buffer buffer : buffers) {
            totalLength += buffer.length;
            if (largestBuffer == null || buffer.array.length > largestBuffer.array.length) {
                largestBuffer = buffer;
            }
        }
        if (largestBuffer == null) {
            // no buffers at all: a single slot is enough for the trailing sentinel
            return new Buffer(1);
        }
        // reuse the largest array as the destination when it is big enough
        int[] docs = largestBuffer.array;
        if (docs.length < totalLength + 1) {
            docs = ArrayUtil.growExact(docs, totalLength + 1);
        }
        // the content of the largest buffer is already in place, append the others
        totalLength = largestBuffer.length;
        for (Buffer buffer : buffers) {
            if (buffer != largestBuffer) {
                System.arraycopy(buffer.array, 0, docs, totalLength, buffer.length);
                totalLength += buffer.length;
            }
        }
        return new Buffer(docs, totalLength);
    }

    /** Remove duplicates in place from the first {@code length} entries of the
     *  sorted array {@code arr} and return the number of unique entries. */
    private static int dedup(int[] arr, int length) {
        if (length == 0) {
            return 0;
        }
        int l = 1;
        int previous = arr[0];
        for (int i = 1; i < length; ++i) {
            final int value = arr[i];
            assert value >= previous;
            if (value != previous) {
                arr[l++] = value;
                previous = value;
            }
        }
        return l;
    }

    /** Assertion helper: check that the first {@code len} entries of {@code a}
     *  are strictly increasing. Always returns {@code true} so that it can be
     *  called from an {@code assert} statement. */
    private static boolean noDups(int[] a, int len) {
        for (int i = 1; i < len; ++i) {
            assert a[i - 1] < a[i];
        }
        return true;
    }

}