org.apache.lucene.index.OrdinalMap.java Source code

Introduction

Here is the source code for org.apache.lucene.index.OrdinalMap.java, the Lucene class that maps per-segment term ordinals to and from a merged, global ordinal space using a compact packed-ints representation.
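
Before the listing, here is a minimal sketch of how the class can be used: build one OrdinalMap per top-level reader, then translate each segment's ordinals into the merged space. The sketch is illustrative only and not part of the listing below; the field name "tags", the demo class name, and PackedInts.DEFAULT as the acceptable overhead ratio are assumptions, and the reader's cache helper is assumed to be non-null.

import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.packed.PackedInts;

public class OrdinalMapDemo {
    /** Builds a map over the (assumed) "tags" field of every segment of the reader. */
    static OrdinalMap buildMap(DirectoryReader reader) throws IOException {
        SortedSetDocValues[] values = new SortedSetDocValues[reader.leaves().size()];
        for (LeafReaderContext ctx : reader.leaves()) {
            values[ctx.ord] = DocValues.getSortedSet(ctx.reader(), "tags");
        }
        // The reader-level cache key ties the map's lifetime to this reader
        // (getReaderCacheHelper() is assumed non-null here).
        return OrdinalMap.build(reader.getReaderCacheHelper().getKey(), values, PackedInts.DEFAULT);
    }

    /** Maps a per-segment ordinal into the merged, global ordinal space. */
    static long toGlobalOrd(OrdinalMap map, int segmentIndex, long segmentOrd) {
        LongValues segmentToGlobal = map.getGlobalOrds(segmentIndex);
        return segmentToGlobal.get(segmentOrd);
    }
}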

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;

/** Maps per-segment ordinals to/from global ordinal space, using a compact packed-ints representation.
 *
 *  <p><b>NOTE</b>: this is a costly operation, as it must merge sort all terms, and may require non-trivial RAM once done.  It's better to operate in
 *  segment-private ordinal space instead when possible.
 *
 * @lucene.internal */
public class OrdinalMap implements Accountable {
    // TODO: we could also have a utility method to merge Terms[] and use size() as a weight when we need it
    // TODO: use more efficient packed ints structures?

    private static class TermsEnumIndex {
        public final static TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0];
        final int subIndex;
        final TermsEnum termsEnum;
        BytesRef currentTerm;

        public TermsEnumIndex(TermsEnum termsEnum, int subIndex) {
            this.termsEnum = termsEnum;
            this.subIndex = subIndex;
        }

        public BytesRef next() throws IOException {
            currentTerm = termsEnum.next();
            return currentTerm;
        }
    }

    private static class SegmentMap implements Accountable {
        private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SegmentMap.class);

        /** Build a map from an index into a sorted view of `weights` to an index into `weights`. */
        private static int[] map(final long[] weights) {
            final int[] newToOld = new int[weights.length];
            for (int i = 0; i < weights.length; ++i) {
                newToOld[i] = i;
            }
            new InPlaceMergeSorter() {
                @Override
                protected void swap(int i, int j) {
                    final int tmp = newToOld[i];
                    newToOld[i] = newToOld[j];
                    newToOld[j] = tmp;
                }

                @Override
                protected int compare(int i, int j) {
                    // j first since we actually want higher weights first
                    return Long.compare(weights[newToOld[j]], weights[newToOld[i]]);
                }
            }.sort(0, weights.length);
            return newToOld;
        }

        /** Invert the map. */
        private static int[] inverse(int[] map) {
            final int[] inverse = new int[map.length];
            for (int i = 0; i < map.length; ++i) {
                inverse[map[i]] = i;
            }
            return inverse;
        }

        private final int[] newToOld, oldToNew;

        SegmentMap(long[] weights) {
            newToOld = map(weights);
            oldToNew = inverse(newToOld);
            assert Arrays.equals(newToOld, inverse(oldToNew));
        }
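
        // Worked example with hypothetical weights, added for illustration:
        // weights = {5, 20, 10} sorts to descending order 20, 10, 5, so
        // newToOld = {1, 2, 0} (new segment 0 is old segment 1) and
        // oldToNew = {2, 0, 1} (old segment 0 becomes new segment 2).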

        int newToOld(int segment) {
            return newToOld[segment];
        }

        int oldToNew(int segment) {
            return oldToNew[segment];
        }

        @Override
        public long ramBytesUsed() {
            return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(newToOld) + RamUsageEstimator.sizeOf(oldToNew);
        }
    }

    /**
     * Create an ordinal map that uses the number of unique values of each
     * {@link SortedDocValues} instance as a weight.
     * @see #build(IndexReader.CacheKey, TermsEnum[], long[], float)
     */
    public static OrdinalMap build(IndexReader.CacheKey owner, SortedDocValues[] values,
            float acceptableOverheadRatio) throws IOException {
        final TermsEnum[] subs = new TermsEnum[values.length];
        final long[] weights = new long[values.length];
        for (int i = 0; i < values.length; ++i) {
            subs[i] = values[i].termsEnum();
            weights[i] = values[i].getValueCount();
        }
        return build(owner, subs, weights, acceptableOverheadRatio);
    }

    /**
     * Create an ordinal map that uses the number of unique values of each
     * {@link SortedSetDocValues} instance as a weight.
     * @see #build(IndexReader.CacheKey, TermsEnum[], long[], float)
     */
    public static OrdinalMap build(IndexReader.CacheKey owner, SortedSetDocValues[] values,
            float acceptableOverheadRatio) throws IOException {
        final TermsEnum[] subs = new TermsEnum[values.length];
        final long[] weights = new long[values.length];
        for (int i = 0; i < values.length; ++i) {
            subs[i] = values[i].termsEnum();
            weights[i] = values[i].getValueCount();
        }
        return build(owner, subs, weights, acceptableOverheadRatio);
    }

    /** 
     * Creates an ordinal map that allows mapping ords to/from a merged
     * space from <code>subs</code>.
     * @param owner a cache key
     * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need
     *             not be dense (e.g. they can be FilteredTermsEnums).
     * @param weights a weight for each sub. This is ideally correlated with
     *             the number of unique terms that each sub introduces compared
     *             to the other subs
     * @throws IOException if an I/O error occurred.
     */
    public static OrdinalMap build(IndexReader.CacheKey owner, TermsEnum subs[], long[] weights,
            float acceptableOverheadRatio) throws IOException {
        if (subs.length != weights.length) {
            throw new IllegalArgumentException("subs and weights must have the same length");
        }

        // enums are not sorted, so let's sort to save memory
        final SegmentMap segmentMap = new SegmentMap(weights);
        return new OrdinalMap(owner, subs, segmentMap, acceptableOverheadRatio);
    }

    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OrdinalMap.class);

    /** Cache key of whoever asked for this awful thing */
    public final IndexReader.CacheKey owner;
    // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the ordinal in the first segment that contains this term
    final PackedLongValues globalOrdDeltas;
    // globalOrd -> first segment container
    final PackedLongValues firstSegments;
    // for every segment, segmentOrd -> globalOrd
    final LongValues segmentToGlobalOrds[];
    // the map from/to segment ids
    final SegmentMap segmentMap;
    // ram usage
    final long ramBytesUsed;
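
    // Worked example with hypothetical terms, added for illustration (assuming equal
    // weights so the segment order is unchanged): if segment 0 holds terms {a, b} and
    // segment 1 holds {b, c}, the global space is a=0, b=1, c=2. Then
    // firstSegments = {0, 0, 1}, globalOrdDeltas = {0, 0, 1} ("c" is global ord 2 but
    // ord 1 in segment 1), and segmentToGlobalOrds[1] maps ords {0, 1} to {1, 2}.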

    OrdinalMap(IndexReader.CacheKey owner, TermsEnum subs[], SegmentMap segmentMap, float acceptableOverheadRatio)
            throws IOException {
        // create the ordinal mappings by pulling a termsenum over each sub's 
        // unique terms, and walking a multitermsenum over those
        this.owner = owner;
        this.segmentMap = segmentMap;
        // even though we accept an overhead ratio, we keep these ones with COMPACT
        // since they are only used to resolve values given a global ord, which is
        // slow anyway
        PackedLongValues.Builder globalOrdDeltas = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
        PackedLongValues.Builder firstSegments = PackedLongValues.packedBuilder(PackedInts.COMPACT);
        final PackedLongValues.Builder[] ordDeltas = new PackedLongValues.Builder[subs.length];
        for (int i = 0; i < ordDeltas.length; i++) {
            ordDeltas[i] = PackedLongValues.monotonicBuilder(acceptableOverheadRatio);
        }
        long[] ordDeltaBits = new long[subs.length];
        long[] segmentOrds = new long[subs.length];

        // Just merge-sorts by term:
        PriorityQueue<TermsEnumIndex> queue = new PriorityQueue<TermsEnumIndex>(subs.length) {
            @Override
            protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) {
                return a.currentTerm.compareTo(b.currentTerm) < 0;
            }
        };

        for (int i = 0; i < subs.length; i++) {
            TermsEnumIndex sub = new TermsEnumIndex(subs[segmentMap.newToOld(i)], i);
            if (sub.next() != null) {
                queue.add(sub);
            }
        }

        BytesRefBuilder scratch = new BytesRefBuilder();

        long globalOrd = 0;
        while (queue.size() != 0) {
            TermsEnumIndex top = queue.top();
            scratch.copyBytes(top.currentTerm);

            int firstSegmentIndex = Integer.MAX_VALUE;
            long globalOrdDelta = Long.MAX_VALUE;

            // Advance past this term, recording the per-segment ord deltas:
            while (true) {
                top = queue.top();
                long segmentOrd = top.termsEnum.ord();
                long delta = globalOrd - segmentOrd;
                int segmentIndex = top.subIndex;
                // We compute the least segment where the term occurs. In case the
                // first segment contains most (or better all) values, this will
                // help save significant memory
                if (segmentIndex < firstSegmentIndex) {
                    firstSegmentIndex = segmentIndex;
                    globalOrdDelta = delta;
                }
                ordDeltaBits[segmentIndex] |= delta;

                // for each per-segment ord, map it back to the global term; the while loop is needed
                // in case the incoming TermsEnums don't have compact ordinals (some ordinal values
                // are skipped), which can happen e.g. with a FilteredTermsEnum:
                assert segmentOrds[segmentIndex] <= segmentOrd;

                // TODO: we could specialize this case (the while loop is not needed when the ords
                // are compact)
                do {
                    ordDeltas[segmentIndex].add(delta);
                    segmentOrds[segmentIndex]++;
                } while (segmentOrds[segmentIndex] <= segmentOrd);

                if (top.next() == null) {
                    queue.pop();
                    if (queue.size() == 0) {
                        break;
                    }
                } else {
                    queue.updateTop();
                }
                if (queue.top().currentTerm.equals(scratch.get()) == false) {
                    break;
                }
            }

            // for each unique term, just mark the first segment index/delta where it occurs
            firstSegments.add(firstSegmentIndex);
            globalOrdDeltas.add(globalOrdDelta);
            globalOrd++;
        }

        this.firstSegments = firstSegments.build();
        this.globalOrdDeltas = globalOrdDeltas.build();
        // ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
        segmentToGlobalOrds = new LongValues[subs.length];
        long ramBytesUsed = BASE_RAM_BYTES_USED + this.globalOrdDeltas.ramBytesUsed()
                + this.firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
                + segmentMap.ramBytesUsed();
        for (int i = 0; i < ordDeltas.length; ++i) {
            final PackedLongValues deltas = ordDeltas[i].build();
            if (ordDeltaBits[i] == 0L) {
                // segment ords perfectly match global ordinals
                // likely in case of low cardinalities and large segments
                segmentToGlobalOrds[i] = LongValues.IDENTITY;
            } else {
                final int bitsRequired = ordDeltaBits[i] < 0 ? 64 : PackedInts.bitsRequired(ordDeltaBits[i]);
                final long monotonicBits = deltas.ramBytesUsed() * 8;
                final long packedBits = bitsRequired * deltas.size();
                if (deltas.size() <= Integer.MAX_VALUE
                        && packedBits <= monotonicBits * (1 + acceptableOverheadRatio)) {
                    // monotonic compression mostly adds overhead, let's keep the mapping in plain packed ints
                    final int size = (int) deltas.size();
                    final PackedInts.Mutable newDeltas = PackedInts.getMutable(size, bitsRequired,
                            acceptableOverheadRatio);
                    final PackedLongValues.Iterator it = deltas.iterator();
                    for (int ord = 0; ord < size; ++ord) {
                        newDeltas.set(ord, it.next());
                    }
                    assert it.hasNext() == false;
                    segmentToGlobalOrds[i] = new LongValues() {
                        @Override
                        public long get(long ord) {
                            return ord + newDeltas.get((int) ord);
                        }
                    };
                    ramBytesUsed += newDeltas.ramBytesUsed();
                } else {
                    segmentToGlobalOrds[i] = new LongValues() {
                        @Override
                        public long get(long ord) {
                            return ord + deltas.get(ord);
                        }
                    };
                    ramBytesUsed += deltas.ramBytesUsed();
                }
                ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]);
            }
        }
        this.ramBytesUsed = ramBytesUsed;
    }

    /** 
     * Given a segment number, return a {@link LongValues} instance that maps
     * segment ordinals to global ordinals.
     */
    public LongValues getGlobalOrds(int segmentIndex) {
        return segmentToGlobalOrds[segmentMap.oldToNew(segmentIndex)];
    }

    /**
     * Given a global ordinal, returns the corresponding ordinal in the first
     * segment that contains this term (the segment returned by {@link #getFirstSegmentNumber}).
     */
    public long getFirstSegmentOrd(long globalOrd) {
        return globalOrd - globalOrdDeltas.get(globalOrd);
    }

    /** 
     * Given a global ordinal, returns the index of the first
     * segment that contains this term.
     */
    public int getFirstSegmentNumber(long globalOrd) {
        return segmentMap.newToOld((int) firstSegments.get(globalOrd));
    }

    /**
     * Returns the total number of unique terms in global ord space.
     */
    public long getValueCount() {
        return globalOrdDeltas.size();
    }

    @Override
    public long ramBytesUsed() {
        return ramBytesUsed;
    }

    @Override
    public Collection<Accountable> getChildResources() {
        List<Accountable> resources = new ArrayList<>();
        resources.add(Accountables.namedAccountable("global ord deltas", globalOrdDeltas));
        resources.add(Accountables.namedAccountable("first segments", firstSegments));
        resources.add(Accountables.namedAccountable("segment map", segmentMap));
        // TODO: would be nice to return actual child segment deltas too, but the optimizations are confusing
        return resources;
    }
}
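
Going the other way, a global ordinal can be resolved back to its term by asking the map which segment saw the term first (getFirstSegmentNumber) and at which ordinal (getFirstSegmentOrd), then looking that ordinal up in that segment. The sketch below is illustrative only; the class name is hypothetical, and the values array is assumed to hold the same per-segment SortedSetDocValues instances the map was built from, in the same order.

import java.io.IOException;

import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;

public class GlobalOrdLookup {
    /** Resolves a global ordinal to its term via the first segment that contains it. */
    static BytesRef lookupGlobalOrd(OrdinalMap map, SortedSetDocValues[] values, long globalOrd)
            throws IOException {
        int segment = map.getFirstSegmentNumber(globalOrd);   // first segment containing the term
        long segmentOrd = map.getFirstSegmentOrd(globalOrd);  // its ordinal within that segment
        return values[segment].lookupOrd(segmentOrd);
    }
}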