Java tutorial
/*
 * Copyright (c) 2002-2016 "Neo Technology,"
 * Network Engine for Objects in Lund AB [http://neotechnology.com]
 *
 * This file is part of Neo4j.
 *
 * Neo4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.neo4j.kernel.api.impl.index.collector;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.DocIdSetBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.neo4j.collection.primitive.PrimitiveLongCollections;
import org.neo4j.collection.primitive.PrimitiveLongIterator;
import org.neo4j.graphdb.index.IndexHits;
import org.neo4j.helpers.collection.ArrayIterator;
import org.neo4j.helpers.collection.PrefetchingIterator;
import org.neo4j.index.impl.lucene.legacy.AbstractIndexHits;
import org.neo4j.index.impl.lucene.legacy.EmptyIndexHits;

/**
 * Collector to record per-segment {@code DocIdSet}s and {@code LeafReaderContext}s for every
 * segment that contains a hit. Those items can be later used to read {@code DocValues} fields
 * and iterate over the matched {@code DocIdSet}s. This collector is different from
 * {@code org.apache.lucene.search.CachingCollector} in that the latter focuses on predictable RAM usage
 * and feeding other collectors while this collector focuses on exposing the required per-segment data structures
 * to the user.
 */
public class DocValuesCollector extends SimpleCollector {
    private static final EmptyIndexHits<Document> EMPTY_INDEX_HITS = new EmptyIndexHits<>();

    /** Context of the segment that is currently being collected. */
    private LeafReaderContext context;
    /** Number of hits recorded for the current segment. */
    private int segmentHits;
    /** Number of hits recorded across all segments. */
    private int totalHits;
    private Scorer scorer;
    /** Scores of the current segment in collection order; only allocated when {@link #keepScores} is set. */
    private float[] scores;
    private final boolean keepScores;
    private final List<MatchingDocs> matchingDocs = new ArrayList<>();
    private Docs docs;

    /**
     * Default Constructor, does not keep scores.
     */
    public DocValuesCollector() {
        this(false);
    }

    /**
     * @param keepScores true to record the score of every collected hit (this allocates a
     *                   per-segment score array and asks the scorer for a score on every hit),
     *                   false to skip scoring altogether
     */
    public DocValuesCollector(boolean keepScores) {
        this.keepScores = keepScores;
    }

    /**
     * @param field the field that contains the values
     * @return an iterator over all NumericDocValues from the given field
     */
    public LongValuesIterator getValuesIterator(String field) {
        return new LongValuesIterator(getMatchingDocs(), getTotalHits(), field);
    }

    /**
     * @param field the field that contains the values
     * @param sort how the results should be sorted
     * @return an iterator over all NumericDocValues from the given field with respect to the given sort
     * @throws IOException if replaying the recorded hits into the sorting collector fails
     */
    public PrimitiveLongIterator getSortedValuesIterator(String field, Sort sort) throws IOException {
        if (sort == null || sort == Sort.INDEXORDER) {
            return getValuesIterator(field);
        }
        int size = getTotalHits();
        if (size == 0) {
            return PrimitiveLongCollections.emptyIterator();
        }
        TopDocs topDocs = getTopDocs(sort, size);
        LeafReaderContext[] contexts = getLeafReaderContexts(getMatchingDocs());
        return new TopDocsValuesIterator(topDocs, contexts, field);
    }

    /**
     * Replay the search and collect every hit into TopDocs. One {@code ScoreDoc} is allocated
     * for every hit and the {@code Document} instance is loaded lazily on every iteration step.
     *
     * @param sort how to sort the iterator. If this is null, results will be in index-order.
     * @return an indexhits iterator over all matches
     * @throws IOException if replaying the recorded hits into the sorting collector fails
     */
    public IndexHits<Document> getIndexHits(Sort sort) throws IOException {
        List<MatchingDocs> allMatchingDocs = getMatchingDocs();
        int size = getTotalHits();
        if (size == 0) {
            return EMPTY_INDEX_HITS;
        }
        if (sort == null || sort == Sort.INDEXORDER) {
            return new DocsInIndexOrderIterator(allMatchingDocs, size, isKeepScores());
        }
        TopDocs topDocs = getTopDocs(sort, size);
        LeafReaderContext[] contexts = getLeafReaderContexts(allMatchingDocs);
        return new TopDocsIterator(topDocs, contexts);
    }

    /**
     * @return the total number of hits across all segments.
     */
    public int getTotalHits() {
        return totalHits;
    }

    /**
     * @return true if scores were saved.
     */
    public boolean isKeepScores() {
        return keepScores;
    }

    /**
     * Records the given segment-local document and, if scores are kept, its score.
     */
    @Override
    public final void collect(int doc) throws IOException {
        docs.addDoc(doc);
        if (keepScores) {
            if (segmentHits >= scores.length) {
                // grow the score buffer; 4 is the size of a float in bytes
                float[] newScores = new float[ArrayUtil.oversize(segmentHits + 1, 4)];
                System.arraycopy(scores, 0, newScores, 0, segmentHits);
                scores = newScores;
            }
            scores[segmentHits] = scorer.score();
        }
        segmentHits++;
        totalHits++;
    }

    @Override
    public boolean needsScores() {
        return keepScores;
    }

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        this.scorer = scorer;
    }

    /**
     * Finishes the previous segment (if it had any hits) and prepares fresh
     * recording structures for the segment described by {@code context}.
     */
    @Override
    public void doSetNextReader(LeafReaderContext context) throws IOException {
        if (docs != null && segmentHits > 0) {
            createMatchingDocs();
        }
        int maxDoc = context.reader().maxDoc();
        docs = createDocs(maxDoc);
        if (keepScores) {
            int initialSize = Math.min(32, maxDoc);
            scores = new float[initialSize];
        }
        segmentHits = 0;
        this.context = context;
    }

    /**
     * @return the documents matched by the query, one {@link MatchingDocs} per visited segment that contains a hit.
     */
    public List<MatchingDocs> getMatchingDocs() {
        if (docs != null && segmentHits > 0) {
            // flush the segment that was being collected when the search finished
            createMatchingDocs();
            docs = null;
            scores = null;
            context = null;
        }
        return Collections.unmodifiableList(matchingDocs);
    }

    /**
     * @return a new {@link Docs} to record hits.
     */
    private Docs createDocs(final int maxDoc) {
        return new Docs(maxDoc);
    }

    /**
     * Turns the hits recorded for the current segment into a {@link MatchingDocs} entry.
     */
    private void createMatchingDocs() {
        if (scores == null || scores.length == segmentHits) {
            matchingDocs.add(new MatchingDocs(this.context, docs.getDocIdSet(), segmentHits, scores));
        } else {
            // NOTE: we could skip the copy step here since the MatchingDocs are supposed to be
            // consumed through any of the provided Iterators (actually, the replay method),
            // which all don't care if scores has unused trailing entries.
            // This is just for sanity's sake, we could also make MatchingDocs private
            // and treat this as implementation detail.
            float[] finalScores = new float[segmentHits];
            System.arraycopy(scores, 0, finalScores, 0, segmentHits);
            matchingDocs.add(new MatchingDocs(this.context, docs.getDocIdSet(), segmentHits, finalScores));
        }
    }

    /**
     * Replays all recorded hits into a sorting collector and returns its top {@code size} docs.
     */
    private TopDocs getTopDocs(Sort sort, int size) throws IOException {
        TopDocs topDocs;
        if (sort == Sort.RELEVANCE) {
            TopScoreDocCollector collector = TopScoreDocCollector.create(size);
            replayTo(collector);
            topDocs = collector.topDocs();
        } else {
            TopFieldCollector collector = TopFieldCollector.create(sort, size, false, true, false);
            replayTo(collector);
            topDocs = collector.topDocs();
        }
        return topDocs;
    }

    /**
     * @return the segment contexts of the given matching docs, in the same order
     */
    private static LeafReaderContext[] getLeafReaderContexts(List<MatchingDocs> matchingDocs) {
        int segments = matchingDocs.size();
        LeafReaderContext[] contexts = new LeafReaderContext[segments];
        for (int i = 0; i < segments; i++) {
            MatchingDocs matchingDoc = matchingDocs.get(i);
            contexts[i] = matchingDoc.context;
        }
        return contexts;
    }

    /**
     * Feeds every recorded hit, segment by segment, into the given collector.
     * If scores were kept they are replayed in collection order, otherwise every hit scores {@code NaN}.
     */
    private void replayTo(Collector collector) throws IOException {
        for (MatchingDocs segment : getMatchingDocs()) {
            LeafCollector leafCollector = collector.getLeafCollector(segment.context);
            DocIdSetIterator idIterator = segment.docIdSet.iterator();
            Scorer replayScorer;
            if (isKeepScores()) {
                replayScorer = new ReplayingScorer(segment.scores);
            } else {
                replayScorer = new ConstantScoreScorer(null, Float.NaN, idIterator);
            }
            leafCollector.setScorer(replayScorer);
            int doc;
            while ((doc = idIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                leafCollector.collect(doc);
            }
        }
    }

    /**
     * Iterates over all per-segment {@link DocValuesCollector.MatchingDocs}. Supports two kinds of lookups.
     * One, iterate over all long values of the given field (constructor argument).
     * Two, lookup a value for the current doc in a sidecar {@code NumericDocValues} field.
     * That is, this iterator has a main field, that drives the iteration and allow for lookups
     * in other, secondary fields based on the current document of the main iteration.
     *
     * Lookups from this class are not thread-safe. Races can happen when the segment barrier
     * is crossed; one thread might think it is reading from one segment while another thread has
     * already advanced this Iterator to the next segment, having raced the first thread.
     */
    public class LongValuesIterator extends PrimitiveLongCollections.PrimitiveLongBaseIterator
            implements DocValuesAccess {
        private final Iterator<DocValuesCollector.MatchingDocs> matchingDocs;
        private final String field;
        private final int size;
        private DocIdSetIterator currentIdIterator;
        private NumericDocValues currentDocValues;
        private DocValuesCollector.MatchingDocs currentDocs;
        /** Doc values of secondary fields for the current segment, keyed by field name. */
        private final Map<String, NumericDocValues> docValuesCache;
        private int index = 0;

        /**
         * @param allMatchingDocs all {@link DocValuesCollector.MatchingDocs} across all segments
         * @param totalHits the total number of hits across all segments
         * @param field the main field, whose values drive the iteration
         */
        public LongValuesIterator(Iterable<DocValuesCollector.MatchingDocs> allMatchingDocs, int totalHits,
                String field) {
            this.size = totalHits;
            this.field = field;
            matchingDocs = allMatchingDocs.iterator();
            docValuesCache = new HashMap<>();
        }

        /**
         * @return the number of docs left in this iterator.
         */
        public int remaining() {
            return size - index;
        }

        @Override
        public long current() {
            return next;
        }

        /**
         * Looks up the value of a secondary field for the document the iteration is currently positioned on.
         *
         * @param field the secondary field to read
         * @return the value of that field, or 0 if there is no segment left to read from
         */
        @Override
        public long getValue(String field) {
            if (ensureValidDisi()) {
                // readDocValues never returns null, so a null here always means "not cached yet"
                NumericDocValues docValues = docValuesCache.get(field);
                if (docValues == null) {
                    docValues = currentDocs.readDocValues(field);
                    docValuesCache.put(field, docValues);
                }
                return docValues.get(currentIdIterator.docID());
            } else {
                // same as DocValues.emptyNumeric()#get
                // which means, getValue carries over the semantics of NDV
                // -1 would also be a possibility here.
                return 0;
            }
        }

        @Override
        protected boolean fetchNext() {
            try {
                while (ensureValidDisi()) {
                    int nextDoc = currentIdIterator.nextDoc();
                    if (nextDoc != DocIdSetIterator.NO_MORE_DOCS) {
                        index++;
                        return next(currentDocValues.get(nextDoc));
                    }
                    // segment exhausted, let ensureValidDisi move on to the next one
                    currentIdIterator = null;
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return false;
        }

        /**
         * @return true if it was able to make sure, that currentDisi is valid
         */
        private boolean ensureValidDisi() {
            try {
                while (currentIdIterator == null) {
                    if (matchingDocs.hasNext()) {
                        currentDocs = matchingDocs.next();
                        currentIdIterator = currentDocs.docIdSet.iterator();
                        if (currentIdIterator != null) {
                            // cached doc values belong to the previous segment
                            docValuesCache.clear();
                            currentDocValues = currentDocs.readDocValues(field);
                        }
                    } else {
                        return false;
                    }
                }
                return true;
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Holds the documents that were matched per segment.
     */
    static final class MatchingDocs {
        /** The {@code LeafReaderContext} for this segment. */
        public final LeafReaderContext context;

        /** Which documents were seen. */
        public final DocIdSet docIdSet;

        /** Non-sparse scores array. Might be null if no scores were required. */
        public final float[] scores;

        /** Total number of hits */
        public final int totalHits;

        public MatchingDocs(LeafReaderContext context, DocIdSet docIdSet, int totalHits, float[] scores) {
            this.context = context;
            this.docIdSet = docIdSet;
            this.totalHits = totalHits;
            this.scores = scores;
        }

        /**
         * @param field the field to read the doc values from
         * @return the {@code NumericDocValues} for a given field
         * @throws IllegalStateException if this field is not indexed with numeric doc values
         */
        public NumericDocValues readDocValues(String field) {
            try {
                NumericDocValues dv = context.reader().getNumericDocValues(field);
                if (dv == null) {
                    FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field);
                    DocValuesType actual = null;
                    if (fi != null) {
                        actual = fi.getDocValuesType();
                    }
                    throw new IllegalStateException("The field '" + field
                            + "' is not indexed properly, expected NumericDV, but got '" + actual + "'");
                }
                return dv;
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Used during collection to record matching docs and then return a
     * {@link DocIdSet} that contains them.
     */
    private static final class Docs {
        private final DocIdSetBuilder bits;

        public Docs(int maxDoc) {
            bits = new DocIdSetBuilder(maxDoc);
        }

        /** Record the given document. */
        public void addDoc(int docId) {
            bits.add(docId);
        }

        /** Return the {@link DocIdSet} which contains all the recorded docs. */
        public DocIdSet getDocIdSet() {
            return bits.build();
        }
    }

    /**
     * Scorer that hands out previously recorded scores, one per call to {@link #score()}.
     * Every call advances the internal cursor; once the recorded scores are used up it returns {@code NaN}.
     */
    private static class ReplayingScorer extends Scorer {
        private final float[] scores;
        private int index = 0;

        public ReplayingScorer(float[] scores) {
            super(null);
            this.scores = scores;
        }

        @Override
        public float score() throws IOException {
            if (index < scores.length) {
                return scores[index++];
            }
            return Float.NaN;
        }

        @Override
        public int freq() throws IOException {
            throw new UnsupportedOperationException();
        }

        @Override
        public DocIdSetIterator iterator() {
            throw new UnsupportedOperationException();
        }

        @Override
        public int docID() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Index hits over all matched documents in index order, loading each {@code Document} on demand.
     */
    private static final class DocsInIndexOrderIterator extends AbstractIndexHits<Document> {
        private final Iterator<MatchingDocs> docs;
        private final int size;
        private final boolean keepScores;
        private DocIdSetIterator currentIdIterator;
        private LeafReader currentReader;
        /** Recorded scores of the current segment, or null if scores were not kept. */
        private float[] currentScores;
        /** Position in {@link #currentScores} of the score that belongs to the next fetched hit. */
        private int nextScoreIndex;
        /** Score of the most recently fetched hit. */
        private float currentScore = Float.NaN;

        private DocsInIndexOrderIterator(List<MatchingDocs> docs, int size, boolean keepScores) {
            this.size = size;
            this.keepScores = keepScores;
            this.docs = docs.iterator();
        }

        public int size() {
            return size;
        }

        /**
         * The score is captured once when a hit is fetched, so this accessor is free of side effects:
         * calling it several times, or not at all, for a hit never shifts the scores of later hits.
         *
         * @return the score of the most recently fetched hit, or {@code NaN} if no hit has been
         *         fetched yet or scores were not kept
         */
        @Override
        public float currentScore() {
            return currentScore;
        }

        @Override
        protected Document fetchNextOrNull() {
            try {
                while (ensureValidDisi()) {
                    int doc = currentIdIterator.nextDoc();
                    if (doc != DocIdSetIterator.NO_MORE_DOCS) {
                        currentScore = nextScore();
                        return currentReader.document(doc);
                    }
                    // segment exhausted, let ensureValidDisi move on to the next one
                    currentIdIterator = null;
                    currentReader = null;
                    currentScores = null;
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return null;
        }

        /**
         * @return the next recorded score of the current segment, or {@code NaN} if there is none
         */
        private float nextScore() {
            if (currentScores != null && nextScoreIndex < currentScores.length) {
                return currentScores[nextScoreIndex++];
            }
            return Float.NaN;
        }

        /**
         * @return true if there is a segment iterator to read from, advancing to the next segment if needed
         */
        private boolean ensureValidDisi() {
            while (currentIdIterator == null && docs.hasNext()) {
                MatchingDocs matchingDocs = docs.next();
                try {
                    currentIdIterator = matchingDocs.docIdSet.iterator();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                currentScores = keepScores ? matchingDocs.scores : null;
                nextScoreIndex = 0;
                currentReader = matchingDocs.context.reader();
            }
            return currentIdIterator != null;
        }
    }

    /**
     * Iterates over the {@code ScoreDoc}s of a {@code TopDocs}, resolving every global doc id to its
     * segment and notifying the subclass with the segment-local doc id.
     */
    private abstract static class ScoreDocsIterator extends PrefetchingIterator<ScoreDoc> {
        private final Iterator<ScoreDoc> iterator;
        /** Doc base of every context, followed by the exclusive upper bound of the last context. */
        private final int[] docStarts;
        private final LeafReaderContext[] contexts;
        protected ScoreDoc currentDoc;

        private ScoreDocsIterator(TopDocs docs, LeafReaderContext[] contexts) {
            this.contexts = contexts;
            this.iterator = new ArrayIterator<>(docs.scoreDocs);
            int segments = contexts.length;
            docStarts = new int[segments + 1];
            for (int i = 0; i < segments; i++) {
                LeafReaderContext context = contexts[i];
                docStarts[i] = context.docBase;
            }
            LeafReaderContext lastContext = contexts[segments - 1];
            docStarts[segments] = lastContext.docBase + lastContext.reader().maxDoc();
        }

        public ScoreDoc getCurrentDoc() {
            return currentDoc;
        }

        @Override
        protected ScoreDoc fetchNextOrNull() {
            if (!iterator.hasNext()) {
                return null;
            }
            currentDoc = iterator.next();
            int subIndex = ReaderUtil.subIndex(currentDoc.doc, docStarts);
            LeafReaderContext context = contexts[subIndex];
            onNextDoc(currentDoc.doc - context.docBase, context);
            return currentDoc;
        }

        /**
         * Called for every fetched hit.
         *
         * @param localDocID the doc id relative to the segment of {@code context}
         * @param context the segment that contains the hit
         */
        protected abstract void onNextDoc(int localDocID, LeafReaderContext context);
    }

    /**
     * Index hits over the documents of a {@code TopDocs}, in the order given by the {@code TopDocs}.
     */
    private static final class TopDocsIterator extends AbstractIndexHits<Document> {
        private final int size;
        private final ScoreDocsIterator scoreDocs;
        private Document currentDoc;

        private TopDocsIterator(TopDocs docs, LeafReaderContext[] contexts) {
            scoreDocs = new ScoreDocsIterator(docs, contexts) {
                @Override
                protected void onNextDoc(int localDocID, LeafReaderContext context) {
                    updateCurrentDocument(localDocID, context.reader());
                }
            };
            this.size = docs.scoreDocs.length;
        }

        public int size() {
            return size;
        }

        @Override
        public float currentScore() {
            return scoreDocs.getCurrentDoc().score;
        }

        @Override
        protected Document fetchNextOrNull() {
            if (!scoreDocs.hasNext()) {
                return null;
            }
            // advancing triggers onNextDoc, which loads currentDoc
            scoreDocs.next();
            return currentDoc;
        }

        private void updateCurrentDocument(int docID, LeafReader reader) {
            try {
                currentDoc = reader.document(docID);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Iterates over the numeric doc values of one field for the documents of a {@code TopDocs},
     * in the order given by the {@code TopDocs}. Iteration stops at the first hit whose value is -1,
     * which is also the value used for segments that have no numeric doc values for the field.
     */
    private static final class TopDocsValuesIterator extends PrimitiveLongCollections.PrimitiveLongBaseIterator {
        private final ScoreDocsIterator scoreDocs;
        private final String field;
        /** Doc values per segment; a segment without doc values for the field is cached as null. */
        private final Map<LeafReaderContext, NumericDocValues> docValuesCache;
        private long currentValue;

        public TopDocsValuesIterator(TopDocs docs, LeafReaderContext[] contexts, String field) {
            this.field = field;
            docValuesCache = new HashMap<>(contexts.length);
            scoreDocs = new ScoreDocsIterator(docs, contexts) {
                @Override
                protected void onNextDoc(int localDocID, LeafReaderContext context) {
                    loadNextValue(context, localDocID);
                }
            };
        }

        @Override
        protected boolean fetchNext() {
            if (scoreDocs.hasNext()) {
                // advancing triggers onNextDoc, which loads currentValue
                scoreDocs.next();
                return currentValue != -1 && next(currentValue);
            }
            return false;
        }

        private void loadNextValue(LeafReaderContext context, int docID) {
            NumericDocValues docValues;
            // containsKey, not get: null is a valid cached entry for segments without doc values
            if (docValuesCache.containsKey(context)) {
                docValues = docValuesCache.get(context);
            } else {
                try {
                    docValues = context.reader().getNumericDocValues(field);
                    docValuesCache.put(context, docValues);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            if (docValues != null) {
                currentValue = docValues.get(docID);
            } else {
                currentValue = -1;
            }
        }
    }
}