Source code

Java tutorial


Here is the source code for HitsImpl.java.


/*
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.eclipse.collections.api.tuple.primitive.IntIntPair;
import org.eclipse.collections.impl.factory.primitive.IntIntMaps;
import org.eclipse.collections.impl.factory.primitive.IntObjectMaps;

import nl.inl.blacklab.forwardindex.ForwardIndex;
import nl.inl.blacklab.forwardindex.Terms;
import nl.inl.blacklab.highlight.XmlHighlighter;
import nl.inl.blacklab.index.complex.ComplexFieldUtil;
import nl.inl.util.ThreadPriority;

/**
 * Represents a list of Hit objects. Also maintains the context (concordance)
 * information that belongs to those hits.
 */
public class HitsImpl extends Hits {

    protected static final Logger logger = LogManager.getLogger(HitsImpl.class);

     * The hits.
    protected List<Hit> hits;

     * The captured groups, if we have any.
    protected Map<Hit, Span[]> capturedGroups;

     * The hit contexts.
     * There may be multiple contexts for each hit (see contextFieldsPropName).
     * Each int array starts with three bookkeeping integers, followed by the contexts information.
     * The bookkeeping integers are:
     * * 0 = hit start, index of the hit word (and length of the left context), counted from the start the context
     * * 1 = right start, start of the right context, counted from the start the context
     * * 2 = context length, length of 1 context. As stated above, there may be multiple contexts.
     * The first context therefore starts at index 3.
    private int[][] contexts;

     * The current context size (number of words around hits we now have).
    private int currentContextSize;

     * The sort order, if we've sorted, or null if not
    Integer[] sortOrder;

     * The KWIC data, if it has been retrieved.
     * NOTE: this will always be null if not all the hits have been retrieved.
    protected Map<Hit, Kwic> kwics;

     * The concordances, if they have been retrieved.
     * NOTE: when making concordances from the forward index, this will
     * always be null, because Kwics will be used internally. This is
     * only used when making concordances from the content store (the old
     * default).
    Map<Hit, Concordance> concordances;

     * If we have context information, this specifies the property (i.e. word, lemma, pos) the context came from.
     * Otherwise, it is null.
    protected List<String> contextFieldsPropName;

     * Our SpanQuery.
    protected BLSpanQuery spanQuery;

    /** The SpanWeight for our SpanQuery, from which we can get the next Spans when the current one's done. */
    private SpanWeight weight;

     * The LeafReaderContexts we should query in succession.
    protected List<LeafReaderContext> atomicReaderContexts;

     * What LeafReaderContext we're querying now.
    protected int atomicReaderContextIndex = -1;

     * Term contexts for the terms in the query.
    private Map<Term, TermContext> termContexts;

     * docBase of the segment we're currently in
    protected int currentDocBase;

     * Our Spans object, which may not have been fully read yet.
    protected BLSpans currentSourceSpans;

     * Did we completely read our Spans object?
    protected boolean sourceSpansFullyRead = true;

     * If true, we've stopped retrieving hits because there are more than
     * the maximum we've set.
    private boolean maxHitsRetrieved = false;

     * If true, we've stopped counting hits because there are more than
     * the maximum we've set.
    private boolean maxHitsCounted = false;

     * The number of hits we've seen and counted so far. May be more than
     * the number of hits we've retrieved if that exceeds maxHitsToRetrieve.
    protected int hitsCounted = 0;

     * The number of separate documents we've seen in the hits retrieved.
    protected int docsRetrieved = 0;

     * The number of separate documents we've counted so far (includes non-retrieved hits).
    protected int docsCounted = 0;

     * Document the previous hit was in, so we can count separate documents.
    protected int previousHitDoc = -1;

    public Hits copy() {
        return new HitsImpl(this);

    /**
     * Construct a Hits object from an existing Hits object.
     * <p>
     * The same hits list is reused. Context and sort order are
     * not copied. All other fields are.
     *
     * @param copyFrom the Hits object to copy
     */
    private HitsImpl(HitsImpl copyFrom) {
        // NOTE(review): the listing shows no explicit super(...) call; the searcher field
        // lives in the superclass, so this assumes Hits has a constructor taking it - confirm.
        super(copyFrom.searcher);
        try {
            // The counters copied below are only final once the source has read everything.
            copyFrom.ensureAllHitsRead();
        } catch (InterruptedException e) {
            // (should be detected by the client)
            // Restore the interrupt flag so the client actually can detect it.
            Thread.currentThread().interrupt();
        }
        hits = copyFrom.hits; // shared, not cloned
        kwics = copyFrom.kwics;
        sourceSpansFullyRead = true; // nothing left to fetch: we reuse the source's hit list
        hitsCounted = copyFrom.countSoFarHitsCounted();
        docsRetrieved = copyFrom.countSoFarDocsRetrieved();
        docsCounted = copyFrom.countSoFarDocsCounted();
        previousHitDoc = copyFrom.previousHitDoc;

        // NOTE(review): settings are copied here on the assumption that "all other fields"
        // includes them; copySettingsFrom() is the helper other methods in this class use.
        copySettingsFrom(copyFrom);

        currentContextSize = -1; // context is not copied
        etiquette = new ThreadPriority();
    }

    /**
     * Make a wrapper Hits object for a list of Hit objects.
     * <p>
     * Does not copy the list, but reuses it.
     *
     * @param searcher
     *            the searcher object
     * @param hits the list of hits to wrap (null is treated as an empty list)
     */
    HitsImpl(Searcher searcher, List<Hit> hits) {
        // NOTE(review): searcher is otherwise unused in this constructor, so it is assumed
        // to be handed to the superclass - confirm against Hits.
        super(searcher);
        this.hits = hits == null ? new ArrayList<Hit>() : hits;
        hitsCounted = this.hits.size();
        currentContextSize = -1; // no context fetched yet

        // Count the distinct documents. A document is counted each time the doc id changes
        // from the previous hit, so this assumes hits of one document are adjacent.
        int prevDoc = -1;
        docsRetrieved = docsCounted = 0;
        for (Hit h : this.hits) {
            if (h.doc != prevDoc) {
                docsRetrieved++;
                docsCounted++;
                prevDoc = h.doc;
            }
        }
        etiquette = new ThreadPriority();
    }

    /**
     * Make a wrapper Hits object for a list of Hit objects.
     * <p>
     * Does not copy the list, but reuses it.
     *
     * @param searcher
     *            the searcher object
     * @param concordanceFieldName
     *            field to use by default when finding concordances
     * @param hits the list of hits to wrap
     * @deprecated if you need a different concordance field, set it manually
     */
    @Deprecated
    HitsImpl(Searcher searcher, String concordanceFieldName, List<Hit> hits) {
        // NOTE(review): concordanceFieldName is not used in the code visible here;
        // confirm whether it should still be applied to the settings.
        this(searcher, hits);
    }

    /**
     * Construct a Hits object from a SpanQuery.
     *
     * @param searcher
     *            the searcher object
     * @param sourceQuery
     *            the query to execute to get the hits (must be a BLSpanQuery)
     * @throws TooManyClauses if the query is overly broad (expands to too many terms)
     */
    HitsImpl(Searcher searcher, SpanQuery sourceQuery) throws TooManyClauses {
        this(searcher, (List<Hit>) null);
        try {
            IndexReader reader = searcher.getIndexReader();
            if (!(sourceQuery instanceof BLSpanQuery))
                throw new IllegalArgumentException("Supplied query must be a BLSpanQuery!");

            if (Searcher.traceQueryExecution)
                logger.debug("HitsImpl(): optimize");
            BLSpanQuery optimize = ((BLSpanQuery) sourceQuery).optimize(reader);

            if (Searcher.traceQueryExecution)
                logger.debug("HitsImpl(): rewrite");
            spanQuery = optimize.rewrite(reader);

            termContexts = new HashMap<>();
            Set<Term> terms = new HashSet<>();
            spanQuery = BLSpanQuery.ensureSortedUnique(spanQuery);
            if (Searcher.traceQueryExecution)
                logger.debug("HitsImpl(): createWeight");
            weight = spanQuery.createWeight(searcher.getIndexSearcher(), false);
            // Collect the query's terms; without this the loop below has nothing to do.
            weight.extractTerms(terms);
            etiquette = new ThreadPriority();
            if (Searcher.traceQueryExecution)
                logger.debug("HitsImpl(): extract terms");
            for (Term term : terms) {
                try {
                    // NOTE(review): assumes ThreadPriority.behave() is the call that can
                    // throw InterruptedException here - confirm.
                    etiquette.behave();
                } catch (InterruptedException e) {
                    // Taking too long, break it off.
                    // Not a very graceful way to do it... but at least it won't
                    // be stuck forever.
                    Thread.currentThread().interrupt(); // client can check this
                    throw new RuntimeException("Query matches too many terms; aborted.", e);
                }
                termContexts.put(term, TermContext.build(reader.getContext(), term));
            }

            // Spans are created lazily, one index segment at a time, by ensureHitsRead().
            currentSourceSpans = null;
            atomicReaderContexts = reader == null ? null : reader.leaves();
            atomicReaderContextIndex = -1;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        sourceSpansFullyRead = false;
        if (Searcher.traceQueryExecution)
            logger.debug("HitsImpl(): done");
    }

    /**
     * Construct a Hits object from a SpanQuery.
     *
     * @param searcher
     *            the searcher object
     * @param concordanceFieldPropName
     *            field to use by default when finding concordances
     * @param sourceQuery
     *            the query to execute to get the hits
     * @throws TooManyClauses if the query is overly broad (expands to too many terms)
     * @deprecated if you need a different concordance field, set it manually
     */
    @Deprecated
    HitsImpl(Searcher searcher, String concordanceFieldPropName, SpanQuery sourceQuery) throws TooManyClauses {
        // NOTE(review): concordanceFieldPropName is not used in the code visible here;
        // confirm whether it should still be applied to the settings.
        this(searcher, sourceQuery);
    }

    /**
     * Construct a Hits object from a Spans.
     * <p>
     * If possible, don't use this constructor, use the one that takes
     * a SpanQuery, as it's more efficient.
     * <p>
     * Note that the Spans provided must be start-point sorted and contain unique hits.
     *
     * @param searcher
     *            the searcher object
     * @param source
     *            where to retrieve the Hit objects from
     */
    HitsImpl(Searcher searcher, BLSpans source) {
        this(searcher, (List<Hit>) null);

        currentSourceSpans = source;
        try {
            // Position on the first document. The source is only "fully read" when there
            // is no document at all; the original test was inverted, which marked every
            // non-empty Spans as exhausted so that no hit was ever fetched from it.
            sourceSpansFullyRead = currentSourceSpans.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Construct a Hits object from a Spans.
     * <p>
     * If possible, don't use this constructor, use the one that takes
     * a SpanQuery, as it's more efficient.
     *
     * @param searcher
     *            the searcher object
     * @param concordanceFieldPropName
     *            field to use by default when finding concordances
     * @param source
     *            where to retrieve the Hit objects from
     * @deprecated if you need a different concordance field, set it manually
     */
    @Deprecated
    HitsImpl(Searcher searcher, String concordanceFieldPropName, BLSpans source) {
        // NOTE(review): concordanceFieldPropName is not used in the code visible here;
        // confirm whether it should still be applied to the settings.
        this(searcher, source);
    }

    /**
     * Sets the desired context size.
     *
     * @param contextSize the context size (number of words to fetch around hits)
     * @deprecated use settings().setContextSize()
     */
    public synchronized void setContextSize(int contextSize) {

    /**
     * Did we stop retrieving hits because we reached the maximum?
     *
     * @return true if we reached the maximum and stopped retrieving hits
     */
    public boolean maxHitsRetrieved() {
        return maxHitsRetrieved;

    /**
     * Did we stop counting hits because we reached the maximum?
     *
     * @return true if we reached the maximum and stopped counting hits
     */
    public boolean maxHitsCounted() {
        return maxHitsCounted;

    private void ensureAllHitsRead() throws InterruptedException {

    /**
     * Ensure that we have read at least as many hits as specified in the parameter.
     *
     * @param number the minimum number of hits that will have been read when this method
     *   returns (unless there are fewer hits than this); if negative, reads all hits
     * @throws InterruptedException if the thread was interrupted during this operation
     */
    void ensureHitsRead(int number) throws InterruptedException {
        if (sourceSpansFullyRead || (number >= 0 && hits.size() >= number))

        synchronized (this) {
            boolean readAllHits = number < 0;
            try {
                int maxHitsToCount = settings.maxHitsToCount();
                int maxHitsToRetrieve = settings.maxHitsToRetrieve();
                while (readAllHits || hits.size() < number) {

                    // Don't hog the CPU, don't take too long

                    // Stop if we're at the maximum number of hits we want to count
                    if (maxHitsToCount >= 0 && hitsCounted >= maxHitsToCount) {
                        maxHitsCounted = true;

                    // Get the next hit from the spans, moving to the next
                    // segment when necessary.
                    while (true) {
                        while (currentSourceSpans == null) {
                            // Exhausted (or not started yet); get next segment spans.

                            if (spanQuery == null) {
                                // We started from a Spans, not a SpanQuery. We're done now.
                                // (only used in deprecated methods or while testing)

                            if (atomicReaderContexts != null
                                    && atomicReaderContextIndex >= atomicReaderContexts.size()) {
                                sourceSpansFullyRead = true;
                            if (atomicReaderContexts != null) {
                                // Get the atomic reader context and get the next Spans from it.
                                LeafReaderContext context = atomicReaderContexts.get(atomicReaderContextIndex);
                                currentDocBase = context.docBase;
                                BLSpans spans = (BLSpans) weight.getSpans(context, Postings.OFFSETS);
                                currentSourceSpans = spans; //BLSpansWrapper.optWrapSortUniq(spans);
                            } else {
                                // TESTING
                                currentDocBase = 0;
                                if (atomicReaderContextIndex > 0) {
                                    sourceSpansFullyRead = true;
                                BLSpans spans = (BLSpans) weight.getSpans(null, Postings.OFFSETS);
                                currentSourceSpans = spans; //BLSpansWrapper.optWrapSortUniq(spans);

                            if (currentSourceSpans != null) {
                                // Update the hit query context with our new spans,
                                // and notify the spans of the hit query context
                                // (TODO: figure out if we need to call setHitQueryContext()
                                //    for each segment or not; if it's just about capture groups
                                //    registering themselves, we only need that for the first Spans.
                                //    But it's probably required for backreferences, etc. anyway,
                                //    and there won't be that many segments, so it's probably ok)
                                currentSourceSpans.setHitQueryContext(hitQueryContext); // let captured groups register themselves
                                if (capturedGroups == null && hitQueryContext.numberOfCapturedGroups() > 0) {
                                    capturedGroups = new HashMap<>();

                                int doc = currentSourceSpans.nextDoc();
                                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                                    currentSourceSpans = null; // no matching docs in this segment, try next

                        // Advance to next hit
                        int start = currentSourceSpans.nextStartPosition();
                        if (start == Spans.NO_MORE_POSITIONS) {
                            int doc = currentSourceSpans.nextDoc();
                            if (doc != DocIdSetIterator.NO_MORE_DOCS) {
                                // Go to first hit in doc
                                start = currentSourceSpans.nextStartPosition();
                            } else {
                                // This one is exhausted; go to the next one.
                                currentSourceSpans = null;
                        if (currentSourceSpans != null) {
                            // We're at the next hit.

                    // Count the hit and add it (unless we've reached the maximum number of hits we
                    // want)
                    int hitDoc = currentSourceSpans.docID() + currentDocBase;
                    if (hitDoc != previousHitDoc) {
                        if (!maxHitsRetrieved)
                        previousHitDoc = hitDoc;
                    maxHitsRetrieved = maxHitsToRetrieve >= 0 && hits.size() >= maxHitsToRetrieve;
                    if (!maxHitsRetrieved) {
                        Hit hit = currentSourceSpans.getHit();
                        Hit offsetHit = new Hit(hit.doc + currentDocBase, hit.start, hit.end);
                        if (capturedGroups != null) {
                            Span[] groups = new Span[hitQueryContext.numberOfCapturedGroups()];
                            capturedGroups.put(offsetHit, groups);
            } catch (InterruptedException e) {
                maxHitsRetrieved = maxHitsCounted = true; // we've stopped retrieving/counting
                throw e;
            } catch (IOException e) {
                throw new RuntimeException(e);

    /**
     * Sort the list of hits.
     * <p>
     * Note that if the thread is interrupted during this, sort may return
     * without the hits actually being fully read and sorted. We don't want
     * to add throws declarations to our whole API, so we assume the calling
     * method will check for thread interruption if the application uses it.
     *
     * @param sortProp
     *            the hit property to sort on
     * @param reverseSort
     *            if true, sort in descending order
     * @param sensitive whether to sort case-sensitively or not
     * @deprecated use sortedBy()
     */
    public synchronized void sort(final HitProperty sortProp, boolean reverseSort, boolean sensitive) {
        try {
        } catch (InterruptedException e) {
            // Thread was interrupted; don't complete the operation but return
            // and let the caller detect and deal with the interruption.

        // Make sure we have a sort order array of sufficient size
        if (sortOrder == null || sortOrder.length < hits.size()) {
            sortOrder = new Integer[hits.size()];
        // Fill the array with the original hit order (0, 1, 2, ...)
        int n = hits.size();
        for (int i = 0; i < n; i++)
            sortOrder[i] = i;

        // If we need context, make sure we have it.
        List<String> requiredContext = sortProp.needsContext();
        if (requiredContext != null)

        // Perform the actual sort.
        Arrays.sort(sortOrder, sortProp);

        if (reverseSort) {
            // Instead of creating a new Comparator that reverses the order of the
            // sort property (which adds an extra layer of indirection to each of the
            // O(n log n) comparisons), just reverse the hits now (which runs
            // in linear time).
            for (int i = 0; i < n / 2; i++) {
                sortOrder[i] = sortOrder[n - i - 1];

    /**
     * Determines if there are at least a certain number of hits.
     * <p>
     * This may be used if we don't want to process all hits (which
     * may be a lot) but we do need to know something about the size
     * of the result set (such as for paging).
     * <p>
     * Note that this method applies to the hits retrieved, which may
     * be less than the total number of hits (depending on maxHitsToRetrieve).
     *
     * @param lowerBound the number we're testing against
     * @return true if the size of this set is at least lowerBound, false otherwise.
     */
    public boolean sizeAtLeast(int lowerBound) {
        try {
            // Try to fetch at least this many hits
        } catch (InterruptedException e) {
            // Thread was interrupted; abort operation
            // and let client decide what to do

        return hits.size() >= lowerBound;

    /**
     * Return the number of hits available.
     * <p>
     * Note that this method applies to the hits retrieved, which may
     * be less than the total number of hits (depending on maxHitsToRetrieve).
     * Use totalSize() to find the total hit count (which may also be limited
     * depending on maxHitsToCount).
     *
     * @return the number of hits available
     */
    public int size() {
        try {
            // Probably not all hits have been seen yet. Collect them all.
        } catch (InterruptedException e) {
            // Thread was interrupted; don't complete the operation but return
            // and let the caller detect and deal with the interruption.
            // Returned value is probably not the correct total number of hits,
            // but will not cause any crashes. The thread was interrupted anyway,
            // the value should never be presented to the user.
            maxHitsCounted = true; // indicate that we've stopped counting
        return hits.size();

    /**
     * Return the total number of hits.
     * <p>
     * NOTE: Depending on maxHitsToRetrieve, hit retrieval may stop
     * before all hits are seen. We do keep counting hits though
     * (until we reach maxHitsToCount, or indefinitely if that value is negative).
     * This method returns our total hit count. Some of these hits
     * may not be available.
     *
     * @return the total hit count
     */
    public int totalSize() {
        try {
        } catch (InterruptedException e) {
            // Thread was interrupted; don't complete the operation but return
            // and let the caller detect and deal with the interruption.
            // Returned value is probably not the correct total number of hits,
            // but will not cause any crashes. The thread was interrupted anyway,
            // the value should never be presented to the user.
        return hitsCounted;

    /**
     * Return the number of documents in the hits we've retrieved.
     *
     * @return the number of documents.
     */
    public int numberOfDocs() {
        try {
        } catch (InterruptedException e) {
            // Thread was interrupted; don't complete the operation but return
            // and let the caller detect and deal with the interruption.
            // Returned value is probably not the correct total number of hits,
            // but will not cause any crashes. The thread was interrupted anyway,
            // the value should never be presented to the user.
        return docsRetrieved;

    /**
     * Return the total number of documents in all hits.
     * <p>
     * This counts documents even in hits that are not stored, only counted.
     *
     * @return the total number of documents.
     */
    public int totalNumberOfDocs() {
        try {
        } catch (InterruptedException e) {
            // Thread was interrupted; don't complete the operation but return
            // and let the caller detect and deal with the interruption.
            // Returned value is probably not the correct total number of hits,
            // but will not cause any crashes. The thread was interrupted anyway,
            // the value should never be presented to the user.
        return docsCounted;

    /**
     * Return the number of hits counted so far.
     * <p>
     * If you're retrieving hits in a background thread, call this
     * method from another thread to get an update of the count so far.
     *
     * @return the current total hit count
     */
    public int countSoFarHitsCounted() {
        return hitsCounted;

    /**
     * Return the number of hits retrieved so far.
     * <p>
     * If you're retrieving hits in a background thread, call this
     * method from another thread to get an update of the count so far.
     *
     * @return the number of hits retrieved so far
     */
    public int countSoFarHitsRetrieved() {
        return hits.size();

    /**
     * Return the number of documents counted so far.
     * <p>
     * If you're retrieving hits in a background thread, call this
     * method from another thread to get an update of the count so far.
     *
     * @return the number of documents counted so far
     */
    public int countSoFarDocsCounted() {
        return docsCounted;

    /**
     * Return the number of documents retrieved so far.
     * <p>
     * If you're retrieving hits in a background thread, call this
     * method from another thread to get an update of the count so far.
     *
     * @return the number of documents retrieved so far
     */
    public int countSoFarDocsRetrieved() {
        return docsRetrieved;

    /**
     * Check if we're done retrieving/counting hits.
     * <p>
     * If you're retrieving hits in a background thread, call this
     * method from another thread to check if all hits have been processed.
     *
     * @return true iff all hits have been retrieved/counted.
     */
    public boolean doneFetchingHits() {
        return sourceSpansFullyRead || maxHitsCounted;

    /**
     * Return an iterator over these hits, either in their original order
     * or in sorted order.
     *
     * @param originalOrder if true, returns hits in original order. If false,
     *   returns them in sorted order (if any)
     * @return the iterator
     */
    public Iterator<Hit> getIterator(final boolean originalOrder) {
        // Construct a custom iterator that iterates over the hits in the hits
        // list, but can also take into account the Spans object that may not have
        // been fully read. This ensures we don't instantiate Hit objects for all hits
        // if we just want to display the first few.
        return new Iterator<Hit>() {

            int index = -1;

            public boolean hasNext() {
                // Do we still have hits in the hits list?
                try {
                    ensureHitsRead(index + 2);
                } catch (InterruptedException e) {
                    // Thread was interrupted. Don't finish reading hits and accept possibly wrong
                    // answer.
                    // Client must detect the interruption and stop the thread.
                return hits.size() >= index + 2;

            public Hit next() {
                // Check if there is a next, taking unread hits from Spans into account
                if (hasNext()) {
                    return hits.get((originalOrder || sortOrder == null) ? index : sortOrder[index]);
                throw new NoSuchElementException();

            public void remove() {
                throw new UnsupportedOperationException();


    /**
     * Return the specified hit number, based on the order they
     * were originally found (not the sorted order).
     *
     * @param i
     *            index of the desired hit
     * @return the hit, or null if it's beyond the last hit
     */
    public Hit getByOriginalOrder(int i) {
        try {
            ensureHitsRead(i + 1);
        } catch (InterruptedException e) {
            // Thread was interrupted. Required hit hasn't been gathered;
            // we will just return null.
        if (i >= hits.size())
            return null;
        return hits.get(i);

    /**
     * Return the specified hit, following the sort order if the hits were sorted.
     *
     * @param i
     *            index of the desired hit
     * @return the hit, or null if it's beyond the last hit
     */
    public Hit get(int i) {
        try {
            ensureHitsRead(i + 1);
        } catch (InterruptedException e) {
            // Thread was interrupted. Required hit hasn't been gathered;
            // we will just return null.
        if (i >= hits.size())
            return null;
        return hits.get(sortOrder == null ? i : sortOrder[i]);

    /**
     * Retrieve a single KWIC (KeyWord In Context). Only use if you need a larger
     * snippet around a single hit.
     * <p>
     * If you need KWICs for a set of hits, just instantiate a HitsWindow and call
     * getKwic() on that; it will fetch all KWICs in the window in a batch, which
     * is more efficient.
     *
     * @param fieldName field to use for building the KWIC
     * @param hit the hit for which we want a KWIC
     * @param contextSize the desired number of words around the hit
     * @return the KWIC
     */
    public Kwic getKwic(String fieldName, Hit hit, int contextSize) {
        List<Hit> oneHit = Arrays.asList(hit);
        HitsImpl h = new HitsImpl(searcher, searcher.getMainContentsFieldName(), oneHit);
        h.copySettingsFrom(this); // concordance type, etc.
        Map<Hit, Kwic> oneConc = h.retrieveKwics(contextSize, fieldName);
        return oneConc.get(hit);

    /**
     * Get a KWIC with a custom context size.
     * <p>
     * Don't call this directly for displaying a list of results. In that case,
     * just instantiate a HitsWindow, call setContextSize() on it to set a
     * default context size and call getKwic(Hit) for each hit. That's
     * more efficient if you're dealing with many hits.
     * <p>
     * This method is mostly just for getting a larger snippet around
     * a single hit.
     *
     * @param h the hit
     * @param contextSize the context size for this KWIC
     *   (only use if you want a different one than the preset preference)
     * @return KWIC for this hit
     */
    public Kwic getKwic(Hit h, int contextSize) {
        if (contextSize != settings().contextSize()) {
            // Different context size than the default for the whole set;
            // We probably want to show a hit with a larger snippet around it
            // (say, 50 words or so). Don't clobber the context of the other
            // hits, just fetch this snippet separately.
            return getKwic(settings().concordanceField(), h, contextSize);

        // Default context size. Read all hits and find concordances for all of them
        // in batch.
        try {
        } catch (InterruptedException e) {
            // Thread was interrupted. Just go ahead with the hits we did
            // get, so at least we can return a valid concordance object and
            // not break the calling method. It is responsible for checking
            // for thread interruption (only some applications use this at all,
            // so throwing exceptions from all methods is too inconvenient)
        if (kwics == null) {
            findKwics(); // just try to find the default concordances
        Kwic kwic = kwics.get(h);
        if (kwic == null)
            throw new RuntimeException("KWIC for hit not found: " + h);
        return kwic;

    /**
     * Retrieve a single concordance. Only use if you need a larger snippet around a single
     * hit.
     * <p>
     * If you need concordances for a set of hits, just instantiate a HitsWindow and call
     * getConcordance() on that; it will fetch all concordances in the window in a batch, which
     * is more efficient.
     *
     * @param fieldName field to use for building the concordance
     * @param hit the hit for which we want a concordance
     * @param contextSize the desired number of words around the hit
     * @return the concordance
     */
    public synchronized Concordance getConcordance(String fieldName, Hit hit, int contextSize) {
        List<Hit> oneHit = Arrays.asList(hit);
        HitsImpl h = new HitsImpl(searcher, searcher.getMainContentsFieldName(), oneHit);
        h.copySettingsFrom(this); // concordance type, etc.
        if (settings().concordanceType() == ConcordanceType.FORWARD_INDEX) {
            Map<Hit, Kwic> oneKwic = h.retrieveKwics(contextSize, fieldName);
            return oneKwic.get(hit).toConcordance();
        Map<Hit, Concordance> oneConc = h.retrieveConcordancesFromContentStore(contextSize, fieldName);
        return oneConc.get(hit);

    /**
     * Get a concordance with a custom context size.
     *
     * Don't call this directly for displaying a list of results. In that case,
     * just instantiate a HitsWindow, call setContextSize() on it to set a
     * default context size and call getConcordance(Hit) for each hit. That's
     * more efficient if you're dealing with many hits.
     *
     * This method is mostly just for getting a larger snippet around
     * a single hit.
     *
     * @param h the hit
     * @param contextSize the context size for this concordance
     *   (only use if you want a different one than the preset preference)
     * @return concordance for this hit
     */
    public synchronized Concordance getConcordance(Hit h, int contextSize) {
        // With forward-index concordances, the concordance is derived from the
        // KWIC for the same hit and context size.
        if (settings().concordanceType() == ConcordanceType.FORWARD_INDEX)
            return getKwic(h, contextSize).toConcordance();

        if (contextSize != settings().contextSize()) {
            // Different context size than the default for the whole set;
            // We probably want to show a hit with a larger snippet around it
            // (say, 50 words or so). Don't clobber the context of the other
            // hits, just fetch this snippet separately.
            return getConcordance(settings().concordanceField(), h, contextSize);

        // Default context size. Read all hits and find concordances for all of them
        // in batch.
        try {
            // NOTE(review): nothing is invoked inside this try in the current text, yet
            // InterruptedException is caught; presumably a call that reads the remaining
            // hits belongs here -- confirm against the upstream source.
        } catch (InterruptedException e) {
            // Thread was interrupted. Just go ahead with the hits we did
            // get, so at least we can return a valid concordance object and
            // not break the calling method. It is responsible for checking
            // for thread interruption (only some applications use this at all,
            // so throwing exceptions from all methods is too inconvenient)
        // Fill the concordances map on first use
        if (concordances == null) {
            findConcordances(); // just try to find the default concordances
        Concordance conc = concordances.get(h);
        // No entry for this hit in the map: report it instead of returning null
        if (conc == null)
            throw new RuntimeException("Concordance for hit not found: " + h);
        return conc;

    /**
     * Retrieve concordances for the hits.
     *
     * You shouldn't have to call this manually, as it's automatically called when
     * you call getConcordance() for the first time.
     */
    synchronized void findConcordances() {
        // NOTE(review): this reads the settings field directly, whereas the rest of the
        // method goes through settings(). The body of this branch is also missing in the
        // current text; presumably it delegates to the KWIC path and returns -- confirm.
        if (settings.concordanceType() == ConcordanceType.FORWARD_INDEX) {

        try {
            // NOTE(review): empty try body; presumably the call that reads all hits
            // (and can throw InterruptedException) belongs here -- confirm.
        } catch (InterruptedException e) {
            // Thread was interrupted. Just go ahead with the hits we did
            // get, so at least we'll have valid concordances.
        // Make sure we don't have the desired concordances already
        // NOTE(review): body missing in the current text; presumably an early return -- confirm.
        if (concordances != null) {

        // Get the concordances
        // NOTE(review): the call below is cut off after its first argument; the callee
        // takes (contextSize, fieldName), so a field name argument is missing -- confirm.
        concordances = retrieveConcordancesFromContentStore(settings().contextSize(),

    /**
     * Retrieve KWICs for the hits.
     *
     * You shouldn't have to call this manually, as it's automatically called when
     * you call getKwic() for the first time.
     */
    synchronized void findKwics() {
        try {
            // NOTE(review): empty try body; presumably the call that reads all hits
            // (and can throw InterruptedException) belongs here -- confirm.
        } catch (InterruptedException e) {
            // Thread was interrupted. Just go ahead with the hits we did
            // get, so at least we'll have valid KWICs.
        // Make sure we don't have the desired KWICs already
        // NOTE(review): body missing in the current text; presumably an early return -- confirm.
        if (kwics != null) {

        // Get the KWICs for the default context size and concordance field
        kwics = retrieveKwics(settings().contextSize(), settings().concordanceField());

    /**
     * Retrieve KWICs for a (sub)list of hits.
     *
     * KWICs are the hit words 'centered' with a certain number of context words around them.
     *
     * The size of the left and right context (in words) may be set using
     * Searcher.setConcordanceContextSize().
     *
     * @param contextSize
     *            how many words around the hit to retrieve
     * @param fieldName
     *            field to use for building KWICs
     * @return the KWICs
     */
    private Map<Hit, Kwic> retrieveKwics(int contextSize, String fieldName) {

        // Group hits per document
        MutableIntObjectMap<List<Hit>> hitsPerDocument = IntObjectMaps.mutable.empty();
        for (Hit key : this) {
            List<Hit> hitsInDoc = hitsPerDocument.get(key.doc);
            if (hitsInDoc == null) {
                hitsInDoc = new ArrayList<>();
                hitsPerDocument.put(key.doc, hitsInDoc);

        if (settings().concordanceType() == ConcordanceType.FORWARD_INDEX) {
            // Yes, make 'em from the forward index (faster)
            ForwardIndex forwardIndex = null;
            String concWordFI = settings().concWordProp();
            if (concWordFI != null)
                forwardIndex = searcher.getForwardIndex(ComplexFieldUtil.propertyField(fieldName, concWordFI));

            ForwardIndex punctForwardIndex = null;
            String concPunctFI = settings().concPunctProp();
            if (concPunctFI != null)
                punctForwardIndex = searcher
                        .getForwardIndex(ComplexFieldUtil.propertyField(fieldName, concPunctFI));

            Map<String, ForwardIndex> attrForwardIndices = new HashMap<>();
            Collection<String> concAttrFI = settings().concAttrProps();
            if (concAttrFI == null) {
                // All other FIs are attributes
                for (String p : searcher.getForwardIndices().keySet()) {
                    String[] components = ComplexFieldUtil.getNameComponents(p);
                    String propName = components[1];
                    if (propName.equals(concWordFI) || propName.equals(concPunctFI))
                    attrForwardIndices.put(propName, searcher.getForwardIndex(p));
            } else {
                // Specific list of attribute FIs
                for (String p : concAttrFI) {
                            searcher.getForwardIndex(ComplexFieldUtil.propertyField(fieldName, p)));

            Map<Hit, Kwic> conc1 = new HashMap<>();
            for (List<Hit> l : hitsPerDocument.values()) {
                HitsImpl hitsInThisDoc = new HitsImpl(searcher, searcher.getMainContentsFieldName(), l);
                hitsInThisDoc.makeKwicsSingleDocForwardIndex(forwardIndex, punctForwardIndex, attrForwardIndices,
                        contextSize, conc1);
            return conc1;

        throw new UnsupportedOperationException(
                "Concordance type is set to CONTENT_STORE, but you can only make KWICs from the forward index. NOTE: if your index has no 'punct' property, concordance type will default to CONTENT_STORE instead of FORWARD_INDEX.");

    /**
     * Retrieve context words for the hits.
     *
     * @param fieldProps
     *            the field and properties to use for the context
     */
    public synchronized void findContext(List<String> fieldProps) {
        try {
            // NOTE(review): empty try body; presumably the call that reads all hits
            // (and can throw InterruptedException) belongs here -- confirm.
        } catch (InterruptedException e) {
            // Thread was interrupted. Just go ahead with the hits we did
            // get, so at least we can return with valid context.
        // Make sure we don't have the desired context already
        // (same field properties and same context size as the last retrieval)
        // NOTE(review): body missing in the current text; presumably an early return -- confirm.
        if (contextFieldsPropName != null && fieldProps.equals(contextFieldsPropName)
                && settings().contextSize() == currentContextSize) {

        // Forward indices to read context from, one per requested field property
        // NOTE(review): loop body missing in the current text; presumably it looks up the
        // forward index for fieldPropName and adds it to fis -- confirm.
        List<ForwardIndex> fis = new ArrayList<>();
        for (String fieldPropName : fieldProps) {

        // Get the context
        // Group hits per document
        List<Hit> hitsInSameDoc = new ArrayList<>();
        int currentDoc = -1;
        int index = 0;
        // (Re)allocate the contexts array when it cannot hold one entry per hit
        if (contexts == null || contexts.length < hits.size()) {
            contexts = new int[hits.size()][];
        // Each time hit.doc changes, the hits collected for the previous document are
        // handed to findPartOfContext(); index - hitsInSameDoc.size() is passed as the
        // position of the first of those hits.
        // NOTE(review): the statements that add hit to hitsInSameDoc, advance index and
        // clear the list between documents are not present in the current text -- confirm.
        for (Hit hit : hits) {
            if (hit.doc != currentDoc) {
                if (currentDoc >= 0) {
                    try {
                        // NOTE(review): empty try body here as well -- confirm what was called.
                    } catch (InterruptedException e) {
                        // Thread was interrupted. Just go ahead with the hits we did
                        // get, so at least we can return with valid context.

                    findPartOfContext(hitsInSameDoc, index - hitsInSameDoc.size(), fis);

                    // Reset hits list for next doc
                currentDoc = hit.doc; // start a new document
        // Flush the hits of the last document
        if (!hitsInSameDoc.isEmpty())
            findPartOfContext(hitsInSameDoc, index - hitsInSameDoc.size(), fis);

        // Remember what we retrieved, so an identical request can be recognised
        currentContextSize = settings().contextSize();
        contextFieldsPropName = new ArrayList<>(fieldProps);

    /**
     * Helper method for findContext(). Finds the hits in a single document and adds
     * context to our contexts array.
     *
     * @param hitsInSameDoc the hits in one document
     * @param firstHitIndex index of the first hit
     * @param fis forward indices needed for contexts
     */
    private void findPartOfContext(List<Hit> hitsInSameDoc, int firstHitIndex, List<ForwardIndex> fis) {
        // Find context for the hits in the current document
        HitsImpl hitsObj = new HitsImpl(searcher, searcher.getMainContentsFieldName(), hitsInSameDoc);
        hitsObj.getContextWords(settings().contextSize(), fis);

        // Copy the contexts from the temporary Hits object to this one
        for (int i = 0; i < hitsInSameDoc.size(); i++) {
            contexts[firstHitIndex + i] = hitsObj.getHitContext(i);

    /**
     * Count occurrences of context words around hit.
     *
     * @param propName the property to use for the collocations, or null if default
     * @param ctx query execution context, containing the sensitivity settings
     * @return the frequency of each occurring token
     */
    public synchronized TermFrequencyList getCollocations(String propName, QueryExecutionContext ctx) {
        if (propName == null)
            propName = searcher.getIndexStructure().getMainContentsField().getMainProperty().getName();
        if (ctx == null)
            ctx = searcher.getDefaultExecutionContext(settings().concordanceField());
        ctx = ctx.withProperty(propName);
        MutableIntIntMap coll = IntIntMaps.mutable.empty();
        for (int j = 0; j < hits.size(); j++) {
            int[] context = contexts[j];

            // Count words
            int contextHitStart = context[CONTEXTS_HIT_START_INDEX];
            int contextRightStart = context[CONTEXTS_RIGHT_START_INDEX];
            int contextLength = context[CONTEXTS_LENGTH_INDEX];
            int indexInContent = CONTEXTS_NUMBER_OF_BOOKKEEPING_INTS;
            for (int i = 0; i < contextLength; i++, indexInContent++) {
                if (i >= contextHitStart && i < contextRightStart)
                    continue; // don't count words in hit itself, just around [option..?]
                int w = context[indexInContent];
                int n;
                if (!coll.contains(w))
                    n = 1;
                    n = coll.get(w) + 1;
                coll.put(w, n);

        // Get the actual words from the sort positions
        boolean caseSensitive = searcher.isDefaultSearchCaseSensitive();
        boolean diacSensitive = searcher.isDefaultSearchDiacriticsSensitive();
        TermFrequencyList collocations = new TermFrequencyList(coll.size());
        Terms terms = searcher.getTerms(contextFieldsPropName.get(0));
        Map<String, Integer> wordFreq = new HashMap<>();
        for (IntIntPair e : coll.keyValuesView()) {
            int key = e.getOne();
            int value = e.getTwo();
            String word = terms.get(key);
            if (!diacSensitive) {
                word = StringUtils.stripAccents(word);
            if (!caseSensitive) {
                word = word.toLowerCase();
            // Note that multiple ids may map to the same word (because of sensitivity settings)
            // Here, those groups are merged.
            Integer n = wordFreq.get(word);
            if (n == null) {
                n = 0;
            n += value;
            wordFreq.put(word, n);

        // Transfer from map to list
        for (Map.Entry<String, Integer> e : wordFreq.entrySet()) {
            collocations.add(new TermFrequency(e.getKey(), e.getValue()));
        return collocations;

    public boolean hasCapturedGroups() {
        return capturedGroups != null;

    /**
     * Get the captured group information for this hit, if any.
     *
     * The names of the captured groups can be obtained through
     * the getCapturedGroupNames() method.
     *
     * @param hit the hit to get captured group information for
     * @return the captured group information, or null if none
     */
    public Span[] getCapturedGroups(Hit hit) {
        if (capturedGroups == null)
            return null;
        return capturedGroups.get(hit);

    /**
     * Get the captured group information in map form.
     *
     * Relatively slow; use getCapturedGroups() and getCapturedGroupNames()
     * for a faster alternative.
     *
     * @param hit hit to get the captured group map for
     * @return the captured group information map
     */
    public Map<String, Span> getCapturedGroupMap(Hit hit) {
        if (capturedGroups == null)
            return null;
        Map<String, Span> result = new TreeMap<>(); // TreeMap to maintain group ordering
        List<String> names = getCapturedGroupNames();
        Span[] groups = capturedGroups.get(hit);
        for (int i = 0; i < names.size(); i++) {
            result.put(names.get(i), groups[i]);
        return result;

    /**
     * Get the field properties our current context was retrieved from.
     *
     * @return the list of field properties (null if none has been set)
     */
    public List<String> getContextFieldPropName() {
        return contextFieldsPropName;

    /**
     * Set the field properties to retrieve context from.
     *
     * @param contextField the field properties
     */
    public void setContextField(List<String> contextField) {
        this.contextFieldsPropName = contextField == null ? null : new ArrayList<>(contextField);

    /**
     * Retrieves the KWIC information (KeyWord In Context: left, hit and right context) for
     * a number of hits in the same document from the forward index.
     *
     * @param forwardIndex
     *    Forward index for the words
     * @param punctForwardIndex
     *    Forward index for the punctuation
     * @param attrForwardIndices
     *    Forward indices for the attributes, or null if none
     * @param wordsAroundHit
     *            number of words left and right of hit to fetch
     * @param theKwics
     *            where to add the KWICs
     */
    synchronized void makeKwicsSingleDocForwardIndex(ForwardIndex forwardIndex, ForwardIndex punctForwardIndex,
            Map<String, ForwardIndex> attrForwardIndices, int wordsAroundHit, Map<Hit, Kwic> theKwics) {
        // Builds a Kwic for every hit in this object and puts it in theKwics.
        // Context is fetched per forward index with getContextWords(), which writes into
        // the contexts field; saveContexts() takes a copy of each result before the next call.
        // NOTE(review): the statement guarded by this check is missing in the current
        // text; presumably an early return -- confirm.
        if (hits.isEmpty())

        // Save existing context so we can restore it afterwards
        int[][] oldContexts = null;
        if (!hits.isEmpty() && contexts != null)
            oldContexts = saveContexts();

        // TODO: more efficient to get all contexts with one getContextWords() call!

        // Get punctuation context
        int[][] punctContext = null;
        if (punctForwardIndex != null) {
            getContextWords(wordsAroundHit, Arrays.asList(punctForwardIndex));
            punctContext = saveContexts();
        Terms punctTerms = punctForwardIndex == null ? null : punctForwardIndex.getTerms();

        // Get attributes context
        // (parallel arrays: name, terms and saved context for each attribute forward index)
        String[] attrName = null;
        Terms[] attrTerms = null;
        int[][][] attrContext = null;
        if (attrForwardIndices != null) {
            int n = attrForwardIndices.size();
            attrName = new String[n];
            ForwardIndex[] attrFI = new ForwardIndex[n];
            attrTerms = new Terms[n];
            attrContext = new int[n][][];
            int i = 0;
            // NOTE(review): i is never advanced in the current text, so every entry would
            // land in slot 0; presumably an i++ is missing at the end of the loop -- confirm.
            for (Map.Entry<String, ForwardIndex> e : attrForwardIndices.entrySet()) {
                attrName[i] = e.getKey();
                attrFI[i] = e.getValue();
                attrTerms[i] = attrFI[i].getTerms();
                getContextWords(wordsAroundHit, Arrays.asList(attrFI[i]));
                attrContext[i] = saveContexts();

        // Get word context
        // (fetched last, so the contexts field holds the word context in the loop below)
        if (forwardIndex != null)
            getContextWords(wordsAroundHit, Arrays.asList(forwardIndex));
        Terms terms = forwardIndex == null ? null : forwardIndex.getTerms();

        // Make the concordances from the context
        String concPunctFI = settings().concPunctProp();
        String concWordFI = settings().concWordProp();
        for (int i = 0; i < hits.size(); i++) {
            Hit h = hits.get(i);
            List<String> tokens = new ArrayList<>();
            int[] context = contexts[i];
            int contextLength = context[CONTEXTS_LENGTH_INDEX];
            int contextRightStart = context[CONTEXTS_RIGHT_START_INDEX];
            int contextHitStart = context[CONTEXTS_HIT_START_INDEX];
            // Context words start right after the bookkeeping ints
            int indexInContext = CONTEXTS_NUMBER_OF_BOOKKEEPING_INTS;
            for (int j = 0; j < contextLength; j++, indexInContext++) {

                // Add punctuation before word
                // (Applications may choose to ignore punctuation before the first word)
                if (punctTerms == null) {
                    // There is no punctuation forward index. Just put a space
                    // between every word.
                    tokens.add(" ");
                } else
                // NOTE(review): the else branch, the attribute loop body and the regular
                // "add word" statement are missing in the current text; presumably they
                // look up the term strings for punctuation, attributes and word -- confirm.

                // Add extra attributes (e.g. lemma, pos)
                if (attrContext != null) {
                    for (int k = 0; k < attrContext.length; k++) {

                // Add word
                if (terms != null)
                    tokens.add(""); // weird, but make sure the numbers add up at the end

            // NOTE(review): attrContext is only assigned when attrForwardIndices != null (and
            // is null-checked in the token loop above), but is dereferenced unconditionally
            // here -- confirm attrForwardIndices can never be null. The statements filling
            // properties are also missing in the current text.
            List<String> properties = new ArrayList<>();
            for (int k = 0; k < attrContext.length; k++) {
            Kwic kwic = new Kwic(properties, tokens, contextHitStart, contextRightStart);
            theKwics.put(h, kwic);

        // NOTE(review): body missing in the current text; presumably the saved context is
        // put back with restoreContexts(oldContexts) -- confirm.
        if (oldContexts != null) {

    /**
     * Get context words from the forward index.
     *
     * NOTE: not synchronized because only ever called from synchronized methods!
     *
     * @param wordsAroundHit how many words of context we want
     * @param contextSources
     *            forward indices to get context from
     */
    private void getContextWords(int wordsAroundHit, List<ForwardIndex> contextSources) {
        // Fills the contexts field with, per hit, a few bookkeeping ints followed by the
        // context words read from each forward index in contextSources.

        int n = hits.size();
        // NOTE(review): the statement guarded by this check is missing in the current
        // text; presumably an early return -- confirm.
        if (n == 0)
        int[] startsOfSnippets = new int[n];
        int[] endsOfSnippets = new int[n];
        int i = 0;
        // NOTE(review): i is not advanced in the current text; presumably an i++ is
        // missing at the end of this loop -- confirm.
        for (Hit h : hits) {
            // Snippet start is clamped to 0 when the hit is within wordsAroundHit of position 0
            startsOfSnippets[i] = wordsAroundHit >= h.start ? 0 : h.start - wordsAroundHit;
            endsOfSnippets[i] = h.end + wordsAroundHit;

        int fiNumber = 0;
        // The document of the first hit is used for all lookups below
        int doc = hits.get(0).doc;
        for (ForwardIndex forwardIndex : contextSources) {
            // Get all the words from the forward index
            List<int[]> words;
            if (forwardIndex != null) {
                // We have a forward index for this field. Use it.
                int fiid = forwardIndex.luceneDocIdToFiid(doc);
                words = forwardIndex.retrievePartsInt(fiid, startsOfSnippets, endsOfSnippets);
            } else {
                throw new RuntimeException("Cannot get context without a forward index");

            // Build the actual concordances
            Iterator<int[]> wordsIt = words.iterator();
            int hitNum = 0;
            // (Re)allocate the contexts array when it cannot hold one entry per hit
            if (contexts == null || contexts.length < hits.size()) {
                contexts = new int[hits.size()][];
            for (Hit hit : hits) {
                // NOTE(review): the initializer is missing in the current text; presumably
                // the next element of wordsIt -- confirm. The increments of hitNum and
                // fiNumber are not present either.
                int[] theseWords =;

                // Put the concordance in the Hit object
                int firstWordIndex = startsOfSnippets[hitNum];

                if (fiNumber == 0) {
                    // Allocate context array and set hit and right start and context length
                    // (room for theseWords.length words from every context source)
                    contexts[hitNum] = new int[CONTEXTS_NUMBER_OF_BOOKKEEPING_INTS
                            + theseWords.length * contextSources.size()];
                    contexts[hitNum][CONTEXTS_HIT_START_INDEX] = hit.start - firstWordIndex;
                    contexts[hitNum][CONTEXTS_RIGHT_START_INDEX] = hit.end - firstWordIndex;
                    contexts[hitNum][CONTEXTS_LENGTH_INDEX] = theseWords.length;
                // Copy the context we just retrieved into the context array
                // (the words of source number fiNumber go after those of the earlier sources)
                int start = fiNumber * theseWords.length + CONTEXTS_NUMBER_OF_BOOKKEEPING_INTS;
                System.arraycopy(theseWords, 0, contexts[hitNum], start, theseWords.length);


    /**
     * Get the context information from the list of hits, so we can
     * look up a different context but still have access to this one as well.
     *
     * NOTE: not synchronized because only ever called from synchronized methods!
     *
     * @return the context
     */
    private int[][] saveContexts() {
        int[][] saved = new int[contexts.length][];
        for (int i = 0; i < contexts.length; i++) {
            saved[i] = Arrays.copyOf(contexts[i], contexts[i].length);
        return saved;

    /**
     * Put context information into the list of hits.
     *
     * NOTE: not synchronized because only ever called from synchronized methods!
     *
     * @param saved the context to restore
     */
    private void restoreContexts(int[][] saved) {
        if (contexts == null || contexts.length != saved.length) {
            contexts = new int[saved.length][];
        for (int i = 0; i < saved.length; i++) {
            if (contexts[i] == null || contexts[i].length != saved[i].length) {
                contexts[i] = new int[saved[i].length];
            System.arraycopy(saved[i], 0, contexts, 0, saved[i].length);

    /**
     * Retrieves the concordance information (left, hit and right context) for a number of hits in
     * the same document from the ContentStore.
     *
     * NOTE: it is assumed that all hits in this Hits object are in the same document!
     *
     * @param fieldName
     *            Lucene index field to make conc for
     * @param wordsAroundHit
     *            number of words left and right of hit to fetch
     * @param conc
     *            where to add the concordances
     * @param hl
     *            XML highlighter that is passed on to the searcher when making the concordances
     */
    private synchronized void makeConcordancesSingleDocContentStore(String fieldName, int wordsAroundHit,
            Map<Hit, Concordance> conc, XmlHighlighter hl) {
        if (hits.isEmpty())
        int doc = hits.get(0).doc;
        int arrayLength = hits.size() * 2;
        int[] startsOfWords = new int[arrayLength];
        int[] endsOfWords = new int[arrayLength];

        // Determine the first and last word of the concordance, as well as the
        // first and last word of the actual hit inside the concordance.
        int startEndArrayIndex = 0;
        for (Hit hit : hits) {
            int hitStart = hit.start;
            int hitEnd = hit.end - 1;

            int start = hitStart - wordsAroundHit;
            if (start < 0)
                start = 0;
            int end = hitEnd + wordsAroundHit;

            startsOfWords[startEndArrayIndex] = start;
            startsOfWords[startEndArrayIndex + 1] = hitStart;
            endsOfWords[startEndArrayIndex] = hitEnd;
            endsOfWords[startEndArrayIndex + 1] = end;

            startEndArrayIndex += 2;

        // Get the relevant character offsets (overwrites the startsOfWords and endsOfWords
        // arrays)
        searcher.getCharacterOffsets(doc, fieldName, startsOfWords, endsOfWords, true);

        // Make all the concordances
        List<Concordance> newConcs = searcher.makeConcordancesFromContentStore(doc, fieldName, startsOfWords,
                endsOfWords, hl);
        for (int i = 0; i < hits.size(); i++) {
            conc.put(hits.get(i), newConcs.get(i));

    /**
     * Convenience method to get all hits in a single doc from a larger hitset.
     *
     * Don't use this for grouping or per-document results as it's relatively inefficient.
     *
     * @param docid the doc id to get hits for
     * @return the list of hits in this doc (if any)
     */
    public Hits getHitsInDoc(int docid) {
        try {
            // NOTE(review): empty try body; presumably the call that reads all hits
            // (and can throw InterruptedException) belongs here -- confirm.
        } catch (InterruptedException e) {
            // Interrupted. Just return no hits;
            // client should detect thread was interrupted if it
            // wants to use background threads.
            return Hits.emptyList(searcher);
        // Linear scan over all hits, keeping those whose doc matches docid
        List<Hit> hitsInDoc = new ArrayList<>();
        for (Hit hit : hits) {
            // NOTE(review): the statement guarded by this check is missing in the current
            // text; presumably it adds hit to hitsInDoc -- confirm.
            if (hit.doc == docid)
        Hits result = Hits.fromList(searcher, hitsInDoc);
        return result;

    /**
     * Return the context(s) for the specified hit number.
     *
     * @param hitNumber which hit we want the context(s) for
     * @return the context(s)
     */
    public int[] getHitContext(int hitNumber) {
        return contexts[hitNumber];

    /**
     * Generate concordances from content store (slower).
     *
     * Concordances are generated for the hits in this object.
     *
     * @param contextSize how many words around the hit to retrieve
     * @param fieldName field to use for building concordances
     * @return the concordances
     */
    private Map<Hit, Concordance> retrieveConcordancesFromContentStore(int contextSize, String fieldName) {
        XmlHighlighter hl = new XmlHighlighter(); // used to make fragments well-formed
        // Group hits per document
        MutableIntObjectMap<List<Hit>> hitsPerDocument = IntObjectMaps.mutable.empty();
        for (Hit key : hits) {
            List<Hit> hitsInDoc = hitsPerDocument.get(key.doc);
            if (hitsInDoc == null) {
                hitsInDoc = new ArrayList<>();
                hitsPerDocument.put(key.doc, hitsInDoc);
        Map<Hit, Concordance> conc = new HashMap<>();
        for (List<Hit> l : hitsPerDocument.values()) {
            HitsImpl hitsInThisDoc = new HitsImpl(searcher, l);
            hitsInThisDoc.makeConcordancesSingleDocContentStore(fieldName, contextSize, conc, hl);
        return conc;

    public void setMaxHitsCounted(boolean maxHitsCounted) {
        this.maxHitsCounted = maxHitsCounted;

    protected void setMaxHitsRetrieved(boolean maxHitsRetrieved) {
        this.maxHitsRetrieved = maxHitsRetrieved;
