org.opengrok.suggest.query.customized.CustomSloppyPhraseScorer.java — source code

Java tutorial

Introduction

Below is the complete source code of org.opengrok.suggest.query.customized.CustomSloppyPhraseScorer.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.opengrok.suggest.query.customized;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConjunctionDISI;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.FixedBitSet;
import org.opengrok.suggest.query.PhraseScorer;
import org.opengrok.suggest.query.data.BitIntsHolder;
import org.opengrok.suggest.query.data.IntsHolder;

/**
 * Modified Apache Lucene's SloppyPhraseScorer (now {@link org.apache.lucene.search.SloppyPhraseMatcher}) to remember
 * the positions of the matches.
 * <p>
 * Lines delimited by {@code custom begins}/{@code custom ends} (or tagged {@code custom}) are OpenGrok-specific
 * changes relative to the upstream Lucene algorithm: while matching, candidate positions at which a suggestion
 * could be offered are collected per document into {@code documentsToPositionsMap} and later exposed through
 * {@link #getPositions(int)}.
 * <p>
 * NOTE(review): instances accumulate mutable per-document state while scoring, so this class is presumably
 * not safe for concurrent use — confirm against the caller before sharing an instance across threads.
 */
final class CustomSloppyPhraseScorer extends Scorer implements PhraseScorer { // custom  specific interface

    // approximation iterator over documents that contain all phrase terms (single iterator or conjunction)
    private final DocIdSetIterator conjunction;
    private final PhrasePositions[] phrasePositions;

    private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq().

    private final int slop; // maximum allowed "word edit distance" for a match
    private final int numPostings;
    private final PhraseQueue pq; // for advancing min position

    private int end; // current largest phrase position

    private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
    private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
    private boolean hasMultiTermRpts; // flag indicating that the repetitions involve multi-term postings
    private PhrasePositions[][] rptGroups; // in each group are PPs that repeats each other (i.e. same term), sorted by (query) offset
    private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps

    // custom begins
    private int offset; // added to each matched position before storing it (see addPositions())

    // maps document id -> positions at which a suggestion could be offered for that document
    private Map<Integer, IntsHolder> documentsToPositionsMap = new HashMap<>();
    // custom ends

    // custom  constructor parameters
    /**
     * Creates custom sloppy phrase scorer which remembers the positions of the found matches.
     * @param weight query weight
     * @param postings postings of the terms
     * @param slop "word edit distance"
     * @param offset the offset that is added to the found match position
     */
    CustomSloppyPhraseScorer(final Weight weight, final CustomPhraseQuery.PostingsAndFreq[] postings,
            final int slop, final int offset) {
        super(weight);
        this.slop = slop;
        this.offset = offset; // custom
        // NOTE(review): the null guard below is inconsistent — postings.length is dereferenced
        // unconditionally on the very next line, so a null argument would still throw NPE.
        this.numPostings = postings == null ? 0 : postings.length;
        pq = new PhraseQueue(postings.length);
        DocIdSetIterator[] iterators = new DocIdSetIterator[postings.length];
        phrasePositions = new PhrasePositions[postings.length];
        for (int i = 0; i < postings.length; ++i) {
            iterators[i] = postings[i].postings;
            phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i,
                    postings[i].terms);
        }
        // custom begins  support for single term
        if (iterators.length == 1) {
            conjunction = iterators[0];
        } else {
            conjunction = ConjunctionDISI.intersectIterators(Arrays.asList(iterators));
        }
        // custom ends
        assert TwoPhaseIterator.unwrap(conjunction) == null;
    }

    /**
     * Score a candidate doc for all slop-valid position-combinations (matches)
     * encountered while traversing/hopping the PhrasePositions.
     * <br> The score contribution of a match depends on the distance:
     * <br> - highest score for distance=0 (exact match).
     * <br> - score gets lower as distance gets higher.
     * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
     * once for "a b" (distance=0), and once for "b a" (distance=2).
     * <br>Possibly not all valid combinations are encountered, because for efficiency
     * we always propagate the least PhrasePosition. This allows to base on
     * PriorityQueue and move forward faster.
     * As result, for example, document "a b c b a"
     * would score differently for queries "a b c"~4 and "c b a"~4, although
     * they really are equivalent.
     * Similarly, for doc "a b c b a f g", query "c b"~2
     * would get same score as "g f"~2, although "c b"~2 could be matched twice.
     * We may want to fix this in the future (currently not, for performance reasons).
     * <p>
     * Custom: as a side effect, fills {@code documentsToPositionsMap} for the current doc with
     * the candidate suggestion positions computed by {@link #addPositions}.
     *
     * @return the number of matches found in the current document (0 means no match)
     */
    private float phraseFreq() throws IOException {
        // custom begins
        // allPositions: document positions actually occupied by the phrase terms themselves
        BitIntsHolder allPositions = new BitIntsHolder();

        // positions: candidate suggestion positions derived from the matches (see addPositions())
        BitIntsHolder positions = new BitIntsHolder();

        if (phrasePositions.length == 1) { // special handling for one term
            end = Integer.MIN_VALUE;
            PhrasePositions pp = phrasePositions[0];
            pp.firstPosition();
            if (pp.position > end) {
                end = pp.position;
            }
            int matchCount = 0;
            while (advancePP(pp)) {
                allPositions.set(pp.position + pp.offset);
                addPositions(positions, allPositions, pp.position + pp.offset, 0);
                matchCount++;
            }
            if (!positions.isEmpty()) {
                documentsToPositionsMap.put(docID(), positions);
            }
            return matchCount;
        }
        // custom ends

        if (!initPhrasePositions()) {
            return 0.0f;
        }

        // custom begins
        // record the initial term positions (note: loop variable shadows the field of the same name)
        for (PhrasePositions phrasePositions : this.pq) {
            allPositions.set(phrasePositions.position + phrasePositions.offset);
        }
        // custom ends

        int numMatches = 0;
        PhrasePositions pp = pq.pop();
        int matchLength = end - pp.position;
        int next = pq.top().position;

        int lastEnd = this.end; // custom  remember last matched position

        while (advancePP(pp)) {

            if (hasRpts && !advanceRpts(pp)) {
                break; // pps exhausted
            }

            allPositions.set(pp.position + pp.offset);

            if (pp.position > next) { // done minimizing current match-length
                if (matchLength <= slop) {
                    numMatches++;
                    // custom  match found, remember positions
                    addPositions(positions, allPositions, lastEnd, matchLength);
                }
                pq.add(pp);
                pp = pq.pop();
                next = pq.top().position;
                matchLength = end - pp.position;

                lastEnd = this.end; // custom  remember position of last match

            } else {
                int matchLength2 = end - pp.position;
                if (matchLength2 < matchLength) {
                    matchLength = matchLength2;
                }

                lastEnd = this.end; // custom  remember position of last match
            }
        }
        // check the final (possibly still open) match after the pps are exhausted
        if (matchLength <= slop) {
            numMatches++;
            addPositions(positions, allPositions, lastEnd, matchLength); // custom  match found, remember positions
        }
        // custom begins  if some positions were found then store them
        if (!positions.isEmpty()) {
            documentsToPositionsMap.put(docID(), positions);
        }
        // custom ends
        return numMatches;
    }

    // custom begins
    /**
     * Stores all the possible positions.
     * <p>
     * Every position within {@code slop - matchLength} of {@code lastEnd + offset} is stored,
     * except positions already occupied by the phrase terms themselves ({@code allPositions})
     * and non-positive positions.
     * @param positions where to store the positions
     * @param allPositions positions already taken by the terms
     * @param lastEnd match position
     * @param matchLength how many words from "edit distance" was already taken to find this match
     */
    private void addPositions(final BitIntsHolder positions, final IntsHolder allPositions, final int lastEnd,
            final int matchLength) {
        int expectedPos = lastEnd + offset;

        // the remaining slop budget determines how far from the expected position a suggestion may be
        int range = this.slop - matchLength;
        for (int i = 0; i < (2 * range) + 1; i++) {
            int pos = expectedPos + i - range;
            if (pos > 0 && !allPositions.has(pos)) {
                positions.set(pos);
            }
        }
    }
    // custom ends

    /** advance a PhrasePosition and update 'end', return false if exhausted */
    private boolean advancePP(PhrasePositions pp) throws IOException {
        if (!pp.nextPosition()) {
            return false;
        }
        if (pp.position > end) {
            end = pp.position;
        }
        return true;
    }

    /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
     * of the two colliding pps. Note that there can only be one collision, as by the initialization
     * there were no collisions before pp was advanced.  */
    private boolean advanceRpts(PhrasePositions pp) throws IOException {
        if (pp.rptGroup < 0) {
            return true; // not a repeater
        }
        PhrasePositions[] rg = rptGroups[pp.rptGroup];
        FixedBitSet bits = new FixedBitSet(rg.length); // for re-queuing after collisions are resolved
        int k0 = pp.rptInd;
        int k;
        while ((k = collide(pp)) >= 0) {
            pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
            if (!advancePP(pp)) {
                return false; // exhausted
            }
            if (k != k0) { // careful: mark only those currently in the queue
                bits = FixedBitSet.ensureCapacity(bits, k);
                bits.set(k); // mark that pp2 need to be re-queued
            }
        }
        // collisions resolved, now re-queue
        // empty (partially) the queue until seeing all pps advanced for resolving collisions
        int n = 0;
        // TODO would be good if we can avoid calling cardinality() in each iteration!
        int numBits = bits.length(); // capacity of the bit set; bits at or beyond this index were never set
        while (bits.cardinality() > 0) {
            PhrasePositions pp2 = pq.pop();
            rptStack[n++] = pp2;
            if (pp2.rptGroup >= 0 && pp2.rptInd < numBits // this bit may not have been set
                    && bits.get(pp2.rptInd)) {
                bits.clear(pp2.rptInd);
            }
        }
        // add back to queue
        for (int i = n - 1; i >= 0; i--) {
            pq.add(rptStack[i]);
        }
        return true;
    }

    /** compare two pps, but only by position and offset */
    private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
        if (pp.position < pp2.position || (pp.position == pp2.position && pp.offset < pp2.offset)) {
            return pp;
        }
        return pp2;
    }

    /** index of a pp2 colliding with pp, or -1 if none */
    private int collide(PhrasePositions pp) {
        int tpPos = tpPos(pp);
        PhrasePositions[] rg = rptGroups[pp.rptGroup];
        for (int i = 0; i < rg.length; i++) {
            PhrasePositions pp2 = rg[i];
            if (pp2 != pp && tpPos(pp2) == tpPos) {
                return pp2.rptInd;
            }
        }
        return -1;
    }

    /**
     * Initialize PhrasePositions in place.
     * A one time initialization for this scorer (on first doc matching all terms):
     * <ul>
     *  <li>Check if there are repetitions
     *  <li>If there are, find groups of repetitions.
     * </ul>
     * Examples:
     * <ol>
     *  <li>no repetitions: <b>"ho my"~2</b>
     *  <li>repetitions: <b>"ho my my"~2</b>
     *  <li>repetitions: <b>"my ho my"~2</b>
     * </ol>
     * @return false if PPs are exhausted (and so current doc will not be a match)
     */
    private boolean initPhrasePositions() throws IOException {
        end = Integer.MIN_VALUE;
        if (!checkedRpts) {
            return initFirstTime();
        }
        if (!hasRpts) {
            initSimple();
            return true; // PPs available
        }
        return initComplex();
    }

    /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
    private void initSimple() throws IOException {
        pq.clear();
        // position pps and build queue from list
        for (PhrasePositions pp : phrasePositions) {
            pp.firstPosition();
            if (pp.position > end) {
                end = pp.position;
            }
            pq.add(pp);
        }
    }

    /** with repeats: not so simple. */
    private boolean initComplex() throws IOException {
        placeFirstPositions();
        if (!advanceRepeatGroups()) {
            return false; // PPs exhausted
        }
        fillQueue();
        return true; // PPs available
    }

    /** move all PPs to their first position */
    private void placeFirstPositions() throws IOException {
        for (PhrasePositions pp : phrasePositions) {
            pp.firstPosition();
        }
    }

    /** Fill the queue (all pps are already placed) and update 'end' to the largest position seen */
    private void fillQueue() {
        pq.clear();
        for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max
            if (pp.position > end) {
                end = pp.position;
            }
            pq.add(pp);
        }
    }

    /** At initialization (each doc), each repetition group is sorted by (query) offset.
     * This provides the start condition: no collisions.
     * <p>Case 1: no multi-term repeats<br>
     * It is sufficient to advance each pp in the group by one less than its group index.
     * So lesser pp is not advanced, 2nd one advance once, 3rd one advanced twice, etc.
     * <p>Case 2: multi-term repeats<br>
     * More involved: for each pp in the group, repeatedly resolve collisions by advancing
     * the lesser of the two colliding pps until no collision remains.
     *
     * @return false if PPs are exhausted.
     */
    private boolean advanceRepeatGroups() throws IOException {
        for (PhrasePositions[] rg : rptGroups) {
            if (hasMultiTermRpts) {
                // more involved, some may not collide
                int incr;
                for (int i = 0; i < rg.length; i += incr) {
                    incr = 1;
                    PhrasePositions pp = rg[i];
                    int k;
                    while ((k = collide(pp)) >= 0) {
                        PhrasePositions pp2 = lesser(pp, rg[k]);
                        if (!advancePP(pp2)) { // at initialization always advance pp with higher offset
                            return false; // exhausted
                        }
                        if (pp2.rptInd < i) { // should not happen?
                            incr = 0;
                            break;
                        }
                    }
                }
            } else {
                // simpler, we know exactly how much to advance
                for (int j = 1; j < rg.length; j++) {
                    for (int k = 0; k < j; k++) {
                        if (!rg[j].nextPosition()) {
                            return false; // PPs exhausted
                        }
                    }
                }
            }
        }
        return true; // PPs available
    }

    /** initialize with checking for repeats. Heavy work, but done only for the first candidate doc.<p>
     * If there are repetitions, check if multi-term postings (MTP) are involved.<p>
     * Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.<br>
     * With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".<br>
     * For example P1 has {A,B}, P2 has {B,C}, and the first doc is: "A C B". At start, P1 would point
     * to "A", p2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.<p>
     * The more complex initialization has two parts:<br>
     * (1) identification of repetition groups.<br>
     * (2) advancing repeat groups at the start of the doc.<br>
     * For (1), a possible solution is to just create a single repetition group,
     * made of all repeating pps. But this would slow down the check for collisions,
     * as all pps would need to be checked. Instead, we compute "connected regions"
     * on the bipartite graph of postings and terms.
     */
    private boolean initFirstTime() throws IOException {
        checkedRpts = true;
        placeFirstPositions();

        LinkedHashMap<Term, Integer> rptTerms = repeatingTerms();
        hasRpts = !rptTerms.isEmpty();

        if (hasRpts) {
            rptStack = new PhrasePositions[numPostings]; // needed with repetitions
            ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms);
            sortRptGroups(rgs);
            if (!advanceRepeatGroups()) {
                return false; // PPs exhausted
            }
        }

        fillQueue();
        return true; // PPs available
    }

    /** sort each repetition group by (query) offset.
     * Done only once (at first doc) and allows to initialize faster for each doc. */
    private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) {
        rptGroups = new PhrasePositions[rgs.size()][];
        Comparator<PhrasePositions> cmprtr = new Comparator<PhrasePositions>() {
            @Override
            public int compare(PhrasePositions pp1, PhrasePositions pp2) {
                // offsets are small query positions, so subtraction cannot overflow here
                return pp1.offset - pp2.offset;
            }
        };
        for (int i = 0; i < rptGroups.length; i++) {
            PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]);
            Arrays.sort(rg, cmprtr);
            rptGroups[i] = rg;
            for (int j = 0; j < rg.length; j++) {
                rg[j].rptInd = j; // we use this index for efficient re-queuing
            }
        }
    }

    /** Detect repetition groups. Done once - for first doc */
    private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term, Integer> rptTerms)
            throws IOException {
        PhrasePositions[] rpp = repeatingPPs(rptTerms);
        ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<>();
        if (!hasMultiTermRpts) {
            // simpler - no multi-terms - can base on positions in first doc
            for (int i = 0; i < rpp.length; i++) {
                PhrasePositions pp = rpp[i];
                if (pp.rptGroup >= 0) { // custom  add braces because of checkstyle
                    continue; // already marked as a repetition
                }
                int tpPos = tpPos(pp);
                for (int j = i + 1; j < rpp.length; j++) {
                    PhrasePositions pp2 = rpp[j];
                    if (pp2.rptGroup >= 0 // already marked as a repetition
                            || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query!
                            || tpPos(pp2) != tpPos) { // not a repetition
                        continue;
                    }
                    // a repetition
                    int g = pp.rptGroup;
                    if (g < 0) {
                        g = res.size();
                        pp.rptGroup = g;
                        ArrayList<PhrasePositions> rl = new ArrayList<>(2);
                        rl.add(pp);
                        res.add(rl);
                    }
                    pp2.rptGroup = g;
                    res.get(g).add(pp2);
                }
            }
        } else {
            // more involved - has multi-terms
            ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<>();
            ArrayList<FixedBitSet> bb = ppTermsBitSets(rpp, rptTerms);
            unionTermGroups(bb);
            HashMap<Term, Integer> tg = termGroups(rptTerms, bb);
            HashSet<Integer> distinctGroupIDs = new HashSet<>(tg.values());
            for (int i = 0; i < distinctGroupIDs.size(); i++) {
                tmp.add(new HashSet<>());
            }
            for (PhrasePositions pp : rpp) {
                for (Term t : pp.terms) {
                    if (rptTerms.containsKey(t)) {
                        int g = tg.get(t);
                        tmp.get(g).add(pp);
                        assert pp.rptGroup == -1 || pp.rptGroup == g;
                        pp.rptGroup = g;
                    }
                }
            }
            for (HashSet<PhrasePositions> hs : tmp) {
                res.add(new ArrayList<>(hs));
            }
        }
        return res;
    }

    /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */
    private int tpPos(PhrasePositions pp) {
        return pp.position + pp.offset;
    }

    /** find repeating terms and assign them ordinal values */
    private LinkedHashMap<Term, Integer> repeatingTerms() {
        LinkedHashMap<Term, Integer> tord = new LinkedHashMap<>();
        HashMap<Term, Integer> tcnt = new HashMap<>();
        for (PhrasePositions pp : phrasePositions) {
            for (Term t : pp.terms) {
                Integer cnt0 = tcnt.get(t);
                Integer cnt = cnt0 == null ? Integer.valueOf(1) : Integer.valueOf(1 + cnt0.intValue());
                tcnt.put(t, cnt);
                if (cnt == 2) { // add on the second occurrence only, so each term is added at most once
                    tord.put(t, tord.size());
                }
            }
        }
        return tord;
    }

    /** find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts */
    private PhrasePositions[] repeatingPPs(HashMap<Term, Integer> rptTerms) {
        ArrayList<PhrasePositions> rp = new ArrayList<>();
        for (PhrasePositions pp : phrasePositions) {
            for (Term t : pp.terms) {
                if (rptTerms.containsKey(t)) {
                    rp.add(pp);
                    hasMultiTermRpts |= (pp.terms.length > 1);
                    break;
                }
            }
        }
        return rp.toArray(new PhrasePositions[0]);
    }

    /** bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal values is set */
    private ArrayList<FixedBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term, Integer> tord) {
        ArrayList<FixedBitSet> bb = new ArrayList<>(rpp.length);
        for (PhrasePositions pp : rpp) {
            FixedBitSet b = new FixedBitSet(tord.size());
            Integer ord;
            for (Term t : pp.terms) {
                if ((ord = tord.get(t)) != null) {
                    b.set(ord);
                }
            }
            bb.add(b);
        }
        return bb;
    }

    /** union (term group) bit-sets until they are disjoint (O(n^^2)), and each group have different terms */
    private void unionTermGroups(ArrayList<FixedBitSet> bb) {
        int incr;
        for (int i = 0; i < bb.size() - 1; i += incr) {
            incr = 1;
            int j = i + 1;
            while (j < bb.size()) {
                if (bb.get(i).intersects(bb.get(j))) {
                    bb.get(i).or(bb.get(j));
                    bb.remove(j);
                    incr = 0; // a set was merged away: re-check the grown set i against the rest
                } else {
                    ++j;
                }
            }
        }
    }

    /** map each term to the single group that contains it */
    private HashMap<Term, Integer> termGroups(LinkedHashMap<Term, Integer> tord, ArrayList<FixedBitSet> bb)
            throws IOException {
        HashMap<Term, Integer> tg = new HashMap<>();
        Term[] t = tord.keySet().toArray(new Term[0]);
        for (int i = 0; i < bb.size(); i++) { // i is the group no.
            FixedBitSet bits = bb.get(i);
            // iterate set bits; guard against calling nextSetBit past the end of the bit set
            for (int ord = bits.nextSetBit(0); ord != DocIdSetIterator.NO_MORE_DOCS; ord = ord + 1 >= bits.length()
                    ? DocIdSetIterator.NO_MORE_DOCS
                    : bits.nextSetBit(ord + 1)) {
                tg.put(t[ord], i);
            }
        }
        return tg;
    }

    @Override
    public int docID() {
        return conjunction.docID();
    }

    @Override
    public float score() {
        return 1; // custom  default value; scoring relevance is not needed for suggestions
    }

    @Override
    public String toString() {
        return "CustomSloppyPhraseScorer(" + weight + ")"; // custom  renamed class
    }

    @Override
    public TwoPhaseIterator twoPhaseIterator() {
        return new TwoPhaseIterator(conjunction) {
            @Override
            public boolean matches() throws IOException {
                // custom  interrupt handler
                if (Thread.currentThread().isInterrupted()) {
                    throw new IOException("Interrupted while scoring documents");
                }
                sloppyFreq = phraseFreq(); // check for phrase
                return sloppyFreq != 0F;
            }

            @Override
            public float matchCost() {
                return 0; // custom  default value
            }

            @Override
            public String toString() {
                return "CustomSloppyPhraseScorer@asTwoPhaseIterator(" + CustomSloppyPhraseScorer.this + ")";
            }
        };
    }

    @Override
    public DocIdSetIterator iterator() {
        return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
    }

    // custom begins  special interface implementation
    /** {@inheritDoc} */
    @Override
    public IntsHolder getPositions(int docId) {
        // returns null if the document was never matched (no entry was stored)
        return documentsToPositionsMap.get(docId);
    }
    // custom ends
}