org.apache.lucene.util.RoaringDocIdSet.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.util.RoaringDocIdSet.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;

import java.io.IOException;

import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

/**
 * {@link DocIdSet} implementation inspired by http://roaringbitmap.org/
 *
 * The space is divided into blocks of 2^16 bits and each block is encoded
 * independently. In each block, if at most 2^12 bits are set, then
 * documents are simply stored in a short[]. If the block spans a full 2^16
 * bits and more than 2^16-2^12 of them are set, then the inverse of the set
 * is encoded in a simple short[]. Otherwise a {@link FixedBitSet} is used.
 *
 * @lucene.internal
 */
public class RoaringDocIdSet extends DocIdSet {

    // Number of low-order bits of a doc id that address a document inside its block
    private static final int BLOCK_SHIFT = 16;
    // Number of documents in a block
    private static final int BLOCK_SIZE = 1 << BLOCK_SHIFT;
    // Mask extracting the offset of a doc id inside its block
    private static final int BLOCK_MASK = BLOCK_SIZE - 1;
    // The maximum length for an array, beyond that point we switch to a bitset
    private static final int MAX_ARRAY_LENGTH = 1 << 12;
    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(RoaringDocIdSet.class);

    /**
     * A builder of {@link RoaringDocIdSet}s.
     *
     * Doc ids are buffered one block at a time; whenever a doc id belonging to
     * a different block is added (or {@link #build()} is called) the buffered
     * block is encoded and stored.
     */
    public static class Builder {

        private final int maxDoc;
        // One entry per block; entries stay null for blocks without any document
        private final DocIdSet[] sets;

        private int cardinality;
        private int lastDocId;
        private int currentBlock;
        private int currentBlockCardinality;

        // We start by filling the buffer and when it's full we copy the content of
        // the buffer to the FixedBitSet and put further documents in that bitset
        private final short[] buffer;
        private FixedBitSet denseBuffer;

        /**
         * Sole constructor.
         *
         * @param maxDoc exclusive upper bound of the doc ids that may be added
         * @throws IllegalArgumentException if {@code maxDoc} is negative
         */
        public Builder(int maxDoc) {
            if (maxDoc < 0) {
                throw new IllegalArgumentException("maxDoc must be >= 0, got " + maxDoc);
            }
            this.maxDoc = maxDoc;
            // ceil(maxDoc / BLOCK_SIZE); the unsigned shift keeps the result correct even
            // when the addition wraps around for maxDoc close to Integer.MAX_VALUE
            sets = new DocIdSet[(maxDoc + BLOCK_MASK) >>> BLOCK_SHIFT];
            lastDocId = -1;
            currentBlock = -1;
            buffer = new short[MAX_ARRAY_LENGTH];
        }

        /**
         * Encode the documents buffered for {@code currentBlock}, store the result
         * in {@code sets[currentBlock]} and reset the per-block state. Nothing is
         * stored when no document has been buffered.
         */
        private void flush() {
            assert currentBlockCardinality <= BLOCK_SIZE;
            if (currentBlockCardinality <= MAX_ARRAY_LENGTH) {
                // Use sparse encoding
                assert denseBuffer == null;
                if (currentBlockCardinality > 0) {
                    sets[currentBlock] = new ShortArrayDocIdSet(
                            ArrayUtil.copyOfSubArray(buffer, 0, currentBlockCardinality));
                }
            } else {
                assert denseBuffer != null;
                assert denseBuffer.cardinality() == currentBlockCardinality;
                if (denseBuffer.length() == BLOCK_SIZE && BLOCK_SIZE - currentBlockCardinality < MAX_ARRAY_LENGTH) {
                    // Doc ids are very dense, inverse the encoding
                    final short[] excludedDocs = new short[BLOCK_SIZE - currentBlockCardinality];
                    denseBuffer.flip(0, denseBuffer.length());
                    int excludedDoc = -1;
                    for (int i = 0; i < excludedDocs.length; ++i) {
                        excludedDoc = denseBuffer.nextSetBit(excludedDoc + 1);
                        assert excludedDoc != DocIdSetIterator.NO_MORE_DOCS;
                        excludedDocs[i] = (short) excludedDoc;
                    }
                    assert excludedDoc + 1 == denseBuffer.length()
                            || denseBuffer.nextSetBit(excludedDoc + 1) == DocIdSetIterator.NO_MORE_DOCS;
                    sets[currentBlock] = new NotDocIdSet(BLOCK_SIZE, new ShortArrayDocIdSet(excludedDocs));
                } else {
                    // Neither sparse nor super dense, use a fixed bit set
                    sets[currentBlock] = new BitDocIdSet(denseBuffer, currentBlockCardinality);
                }
            }

            cardinality += currentBlockCardinality;
            // The bitset (if any) is now owned by the stored set, start the next block from scratch
            denseBuffer = null;
            currentBlockCardinality = 0;
        }

        /**
         * Add a new doc-id to this builder.
         * NOTE: doc ids must be added in order.
         *
         * @param docId the doc id to add, greater than the previously added one
         *              and less than the {@code maxDoc} given to the constructor
         * @return this builder
         * @throws IllegalArgumentException if {@code docId} is not greater than the
         *         previously added doc id, or is not less than {@code maxDoc}
         */
        public Builder add(int docId) {
            if (docId <= lastDocId) {
                throw new IllegalArgumentException(
                        "Doc ids must be added in-order, got " + docId + " which is <= lastDocID=" + lastDocId);
            }
            // Both the per-block array and the dense bitset are sized from maxDoc, so fail
            // fast here instead of overflowing them later (negative ids are already
            // rejected by the in-order check since lastDocId starts at -1)
            if (docId >= maxDoc) {
                throw new IllegalArgumentException(
                        "Doc id " + docId + " is out of bounds, it must be < maxDoc=" + maxDoc);
            }
            final int block = docId >>> BLOCK_SHIFT;
            if (block != currentBlock) {
                // we went to a different block, let's flush what we buffered and start from fresh
                flush();
                currentBlock = block;
            }

            if (currentBlockCardinality < MAX_ARRAY_LENGTH) {
                // only the low 16 bits are kept, the block number is implied by the position in sets
                buffer[currentBlockCardinality] = (short) docId;
            } else {
                if (denseBuffer == null) {
                    // the buffer is full, let's move to a fixed bit set
                    // (the last block may cover fewer than BLOCK_SIZE documents)
                    final int numBits = Math.min(BLOCK_SIZE, maxDoc - (block << BLOCK_SHIFT));
                    denseBuffer = new FixedBitSet(numBits);
                    for (short doc : buffer) {
                        denseBuffer.set(doc & BLOCK_MASK);
                    }
                }
                denseBuffer.set(docId & BLOCK_MASK);
            }

            lastDocId = docId;
            currentBlockCardinality += 1;
            return this;
        }

        /**
         * Add the content of the provided {@link DocIdSetIterator}.
         *
         * Every doc id returned by {@code disi.nextDoc()} is passed to
         * {@link #add(int)}, so the same constraints apply.
         *
         * @return this builder
         */
        public Builder add(DocIdSetIterator disi) throws IOException {
            for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
                add(doc);
            }
            return this;
        }

        /**
         * Build an instance. The block that is currently buffered is flushed first.
         */
        public RoaringDocIdSet build() {
            flush();
            return new RoaringDocIdSet(sets, cardinality);
        }

    }

    /**
     * {@link DocIdSet} implementation that can store documents up to 2^16-1 in a short[].
     */
    private static class ShortArrayDocIdSet extends DocIdSet {

        private static final long BASE_RAM_BYTES_USED = RamUsageEstimator
                .shallowSizeOfInstance(ShortArrayDocIdSet.class);

        // Doc ids stored as shorts; they are read back as unsigned 16-bit values
        private final short[] docIDs;

        private ShortArrayDocIdSet(short[] docIDs) {
            this.docIDs = docIDs;
        }

        @Override
        public long ramBytesUsed() {
            return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(docIDs);
        }

        @Override
        public DocIdSetIterator iterator() throws IOException {
            return new DocIdSetIterator() {

                int i = -1; // this is the index of the current document in the array
                int doc = -1;

                // Read the entry at the given index as an unsigned 16-bit value
                private int docId(int i) {
                    return docIDs[i] & BLOCK_MASK;
                }

                @Override
                public int nextDoc() throws IOException {
                    if (++i >= docIDs.length) {
                        return doc = NO_MORE_DOCS;
                    }
                    return doc = docId(i);
                }

                @Override
                public int docID() {
                    return doc;
                }

                @Override
                public long cost() {
                    return docIDs.length;
                }

                @Override
                public int advance(int target) throws IOException {
                    // binary search, restricted to the entries after the current one,
                    // for the first doc id that is >= target
                    int lo = i + 1;
                    int hi = docIDs.length - 1;
                    while (lo <= hi) {
                        final int mid = (lo + hi) >>> 1;
                        final int midDoc = docId(mid);
                        if (midDoc < target) {
                            lo = mid + 1;
                        } else {
                            hi = mid - 1;
                        }
                    }
                    if (lo == docIDs.length) {
                        i = docIDs.length;
                        return doc = NO_MORE_DOCS;
                    } else {
                        i = lo;
                        return doc = docId(i);
                    }
                }
            };
        }

    }

    // One set per block of BLOCK_SIZE doc ids, null for blocks without documents
    private final DocIdSet[] docIdSets;
    private final int cardinality;
    // Computed once in the constructor
    private final long ramBytesUsed;

    private RoaringDocIdSet(DocIdSet[] docIdSets, int cardinality) {
        this.docIdSets = docIdSets;
        long ramBytesUsed = BASE_RAM_BYTES_USED + RamUsageEstimator.shallowSizeOf(docIdSets);
        for (DocIdSet set : this.docIdSets) {
            if (set != null) {
                ramBytesUsed += set.ramBytesUsed();
            }
        }
        this.ramBytesUsed = ramBytesUsed;
        this.cardinality = cardinality;
    }

    @Override
    public long ramBytesUsed() {
        return ramBytesUsed;
    }

    /**
     * Return an iterator over the documents of this set.
     *
     * @return a new iterator, or {@code null} if this set contains no document
     */
    @Override
    public DocIdSetIterator iterator() throws IOException {
        if (cardinality == 0) {
            return null;
        }
        return new Iterator();
    }

    /**
     * Iterator that walks the per-block sets in order and rebuilds full doc ids
     * from the block number and the offset returned by the block's own iterator.
     */
    private class Iterator extends DocIdSetIterator {

        // Index of the block that sub iterates over
        int block;
        // Iterator over the current block; set to null once the last block is exhausted
        DocIdSetIterator sub;
        int doc;

        Iterator() throws IOException {
            doc = -1;
            block = -1;
            // Empty placeholder so that the first nextDoc() moves on to the first non-empty block
            sub = DocIdSetIterator.empty();
        }

        @Override
        public int docID() {
            return doc;
        }

        @Override
        public int nextDoc() throws IOException {
            final int subNext = sub.nextDoc();
            if (subNext == NO_MORE_DOCS) {
                return firstDocFromNextBlock();
            }
            return doc = (block << BLOCK_SHIFT) | subNext;
        }

        @Override
        public int advance(int target) throws IOException {
            final int targetBlock = target >>> BLOCK_SHIFT;
            if (targetBlock != block) {
                block = targetBlock;
                if (block >= docIdSets.length) {
                    sub = null;
                    return doc = NO_MORE_DOCS;
                }
                if (docIdSets[block] == null) {
                    // the target block is empty, the answer is the first doc of a later block
                    return firstDocFromNextBlock();
                }
                sub = docIdSets[block].iterator();
            }
            final int subNext = sub.advance(target & BLOCK_MASK);
            if (subNext == NO_MORE_DOCS) {
                return firstDocFromNextBlock();
            }
            return doc = (block << BLOCK_SHIFT) | subNext;
        }

        /**
         * Move to the first document of the next non-empty block after the current
         * one, or to {@code NO_MORE_DOCS} if there is no such block.
         */
        private int firstDocFromNextBlock() throws IOException {
            while (true) {
                block += 1;
                if (block >= docIdSets.length) {
                    sub = null;
                    return doc = NO_MORE_DOCS;
                } else if (docIdSets[block] != null) {
                    sub = docIdSets[block].iterator();
                    final int subNext = sub.nextDoc();
                    assert subNext != NO_MORE_DOCS;
                    return doc = (block << BLOCK_SHIFT) | subNext;
                }
            }
        }

        @Override
        public long cost() {
            return cardinality;
        }

    }

    /** Return the exact number of documents that are contained in this set. */
    public int cardinality() {
        return cardinality;
    }

    @Override
    public String toString() {
        return "RoaringDocIdSet(cardinality=" + cardinality + ")";
    }
}