Apache Lucene source: org.apache.lucene.search.SynonymQuery (SynonymQuery.java)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/**
 * A query that treats multiple terms as synonyms.
 * <p>
 * For scoring purposes, this query tries to score the terms as if you
 * had indexed them as one term: it will match any of the terms but
 * only invoke the similarity a single time, scoring the sum of all
 * term frequencies for the document.
 */
public final class SynonymQuery extends Query {

  private final TermAndBoost terms[];
  private final String field;

  /**
   * A builder for {@link SynonymQuery}.
   */
  public static class Builder {
    private final String field;
    private final List<TermAndBoost> terms = new ArrayList<>();

    /**
     * Sole constructor
     *
     * @param field The target field name
     */
    public Builder(String field) {
      this.field = field;
    }

    /**
     * Adds the provided {@code term} as a synonym.
     */
    public Builder addTerm(Term term) {
      return addTerm(term, 1f);
    }

    /**
     * Adds the provided {@code term} as a synonym, document frequencies of this term
     * will be boosted by {@code boost}.
     */
    public Builder addTerm(Term term, float boost) {
      if (field.equals(term.field()) == false) {
        throw new IllegalArgumentException("Synonyms must be across the same field");
      }
      if (Float.isNaN(boost) || Float.compare(boost, 0f) <= 0 || Float.compare(boost, 1f) > 0) {
        throw new IllegalArgumentException(
            "boost must be a positive float between 0 (exclusive) and 1 (inclusive)");
      }
      terms.add(new TermAndBoost(term, boost));
      if (terms.size() > BooleanQuery.getMaxClauseCount()) {
        throw new BooleanQuery.TooManyClauses();
      }
      return this;
    }

    /**
     * Builds the {@link SynonymQuery}.
     */
    public SynonymQuery build() {
      Collections.sort(terms);
      return new SynonymQuery(terms.toArray(new TermAndBoost[0]), field);
    }
  }

  /**
   * Creates a new SynonymQuery, matching any of the supplied terms.
   * <p>
   * The terms must all have the same field.
   *
   * @deprecated Please use a {@link Builder} instead.
   */
  @Deprecated
  public SynonymQuery(Term... terms) {
    Objects.requireNonNull(terms);
    if (terms.length > BooleanQuery.getMaxClauseCount()) {
      throw new BooleanQuery.TooManyClauses();
    }
    this.terms = new TermAndBoost[terms.length];
    // check that all terms are the same field
    String field = null;
    for (int i = 0; i < terms.length; i++) {
      Term term = terms[i];
      this.terms[i] = new TermAndBoost(term, 1.0f);
      if (field == null) {
        field = term.field();
      } else if (!term.field().equals(field)) {
        throw new IllegalArgumentException("Synonyms must be across the same field");
      }
    }
    Arrays.sort(this.terms);
    this.field = field;
  }

  /**
   * Creates a new SynonymQuery, matching any of the supplied terms.
   * <p>
   * The terms must all have the same field.
   */
  private SynonymQuery(TermAndBoost[] terms, String field) {
    this.terms = Objects.requireNonNull(terms);
    this.field = field;
  }

  public List<Term> getTerms() {
    return Collections.unmodifiableList(
        Arrays.stream(terms).map(TermAndBoost::getTerm).collect(Collectors.toList()));
  }

  @Override
  public String toString(String field) {
    StringBuilder builder = new StringBuilder("Synonym(");
    for (int i = 0; i < terms.length; i++) {
      if (i != 0) {
        builder.append(" ");
      }
      Query termQuery = new TermQuery(terms[i].term);
      builder.append(termQuery.toString(field));
      if (terms[i].boost != 1f) {
        builder.append("^");
        builder.append(terms[i].boost);
      }
    }
    builder.append(")");
    return builder.toString();
  }

  @Override
  public int hashCode() {
    return 31 * classHash() + Arrays.hashCode(terms);
  }

  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) &&
        Arrays.equals(terms, ((SynonymQuery) other).terms);
  }

  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    // optimize zero and single term cases
    if (terms.length == 0) {
      return new BooleanQuery.Builder().build();
    }
    if (terms.length == 1) {
      return terms[0].boost == 1f
          ? new TermQuery(terms[0].term)
          : new BoostQuery(new TermQuery(terms[0].term), terms[0].boost);
    }
    return this;
  }

  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(field) == false) {
      return;
    }
    QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.SHOULD, this);
    Term[] ts = Arrays.stream(terms).map(t -> t.term).toArray(Term[]::new);
    v.consumeTerms(this, ts);
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
    if (scoreMode.needsScores()) {
      return new SynonymWeight(this, searcher, scoreMode, boost);
    } else {
      // if scores are not needed, let BooleanWeight deal with optimizing that case.
      BooleanQuery.Builder bq = new BooleanQuery.Builder();
      for (TermAndBoost term : terms) {
        bq.add(new TermQuery(term.term), BooleanClause.Occur.SHOULD);
      }
      return searcher.rewrite(bq.build()).createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
    }
  }

  class SynonymWeight extends Weight {
    private final TermStates termStates[];
    private final Similarity similarity;
    private final Similarity.SimScorer simWeight;
    private final ScoreMode scoreMode;

    SynonymWeight(Query query, IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
      super(query);
      assert scoreMode.needsScores();
      this.scoreMode = scoreMode;
      CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].term.field());
      long docFreq = 0;
      long totalTermFreq = 0;
      termStates = new TermStates[terms.length];
      for (int i = 0; i < termStates.length; i++) {
        TermStates ts = TermStates.build(searcher.getTopReaderContext(), terms[i].term, true);
        termStates[i] = ts;
        if (ts.docFreq() > 0) {
          TermStatistics termStats = searcher.termStatistics(terms[i].term, ts.docFreq(), ts.totalTermFreq());
          docFreq = Math.max(termStats.docFreq(), docFreq);
          totalTermFreq += termStats.totalTermFreq();
        }
      }
      this.similarity = searcher.getSimilarity();
      if (docFreq > 0) {
        TermStatistics pseudoStats = new TermStatistics(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq);
        this.simWeight = similarity.scorer(boost, collectionStats, pseudoStats);
      } else {
        this.simWeight = null; // no terms exist at all, we won't use similarity
      }
    }

    @Override
    public void extractTerms(Set<Term> terms) {
      for (TermAndBoost term : SynonymQuery.this.terms) {
        terms.add(term.term);
      }
    }

    @Override
    public Matches matches(LeafReaderContext context, int doc) throws IOException {
      String field = terms[0].term.field();
      Terms indexTerms = context.reader().terms(field);
      if (indexTerms == null || indexTerms.hasPositions() == false) {
        return super.matches(context, doc);
      }
      List<Term> termList = Arrays.stream(terms).map(TermAndBoost::getTerm).collect(Collectors.toList());
      return MatchesUtils.forField(field,
          () -> DisjunctionMatchesIterator.fromTerms(context, doc, getQuery(), field, termList));
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      Scorer scorer = scorer(context);
      if (scorer != null) {
        int newDoc = scorer.iterator().advance(doc);
        if (newDoc == doc) {
          final float freq;
          if (scorer instanceof SynonymScorer) {
            freq = ((SynonymScorer) scorer).freq();
          } else if (scorer instanceof FreqBoostTermScorer) {
            freq = ((FreqBoostTermScorer) scorer).freq();
          } else {
            assert scorer instanceof TermScorer;
            freq = ((TermScorer) scorer).freq();
          }
          LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].term.field(), true);
          Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
          Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
          return Explanation.match(
              scoreExplanation.getValue(),
              "weight(" + getQuery() + " in " + doc + ") ["
                  + similarity.getClass().getSimpleName() + "], result of:",
              scoreExplanation);
        }
      }
      return Explanation.noMatch("no matching term");
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      List<PostingsEnum> iterators = new ArrayList<>();
      List<ImpactsEnum> impacts = new ArrayList<>();
      List<Float> termBoosts = new ArrayList<>();
      for (int i = 0; i < terms.length; i++) {
        TermState state = termStates[i].get(context);
        if (state != null) {
          TermsEnum termsEnum = context.reader().terms(terms[i].term.field()).iterator();
          termsEnum.seekExact(terms[i].term.bytes(), state);
          if (scoreMode == ScoreMode.TOP_SCORES) {
            ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
            iterators.add(impactsEnum);
            impacts.add(impactsEnum);
          } else {
            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS);
            iterators.add(postingsEnum);
            impacts.add(new SlowImpactsEnum(postingsEnum));
          }
          termBoosts.add(terms[i].boost);
        }
      }

      if (iterators.isEmpty()) {
        return null;
      }

      LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].term.field(), true);

      // we must optimize this case (term not in segment), disjunctions require >= 2 subs
      if (iterators.size() == 1) {
        final TermScorer scorer;
        if (scoreMode == ScoreMode.TOP_SCORES) {
          scorer = new TermScorer(this, impacts.get(0), simScorer);
        } else {
          scorer = new TermScorer(this, iterators.get(0), simScorer);
        }
        float boost = termBoosts.get(0);
        return scoreMode == ScoreMode.COMPLETE_NO_SCORES || boost == 1f
            ? scorer
            : new FreqBoostTermScorer(boost, scorer, simScorer);
      }

      // we use termscorers + disjunction as an impl detail
      DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size());
      for (int i = 0; i < iterators.size(); i++) {
        PostingsEnum postings = iterators.get(i);
        final TermScorer termScorer = new TermScorer(this, postings, simScorer);
        float boost = termBoosts.get(i);
        final DisiWrapperFreq wrapper = new DisiWrapperFreq(termScorer, boost);
        queue.add(wrapper);
      }

      // Even though it is called approximation, it is accurate since none of
      // the sub iterators are two-phase iterators.
      DocIdSetIterator iterator = new DisjunctionDISIApproximation(queue);

      float[] boosts = new float[impacts.size()];
      for (int i = 0; i < boosts.length; i++) {
        boosts[i] = termBoosts.get(i);
      }
      ImpactsSource impactsSource = mergeImpacts(impacts.toArray(new ImpactsEnum[0]), boosts);
      ImpactsDISI impactsDisi = new ImpactsDISI(iterator, impactsSource, simScorer.getSimScorer());

      if (scoreMode == ScoreMode.TOP_SCORES) {
        iterator = impactsDisi;
      }

      return new SynonymScorer(this, queue, iterator, impactsDisi, simScorer);
    }

    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return true;
    }
  }

  /**
   * Merge impacts for multiple synonyms.
   */
  static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums, float[] boosts) {
    assert impactsEnums.length == boosts.length;
    return new ImpactsSource() {

      class SubIterator {
        final Iterator<Impact> iterator;
        int previousFreq;
        Impact current;

        SubIterator(Iterator<Impact> iterator) {
          this.iterator = iterator;
          this.current = iterator.next();
        }

        void next() {
          previousFreq = current.freq;
          if (iterator.hasNext() == false) {
            current = null;
          } else {
            current = iterator.next();
          }
        }
      }

      @Override
      public Impacts getImpacts() throws IOException {
        final Impacts[] impacts = new Impacts[impactsEnums.length];
        // Use the impacts that have the lower next boundary as a lead.
        // It will decide on the number of levels and the block boundaries.
        Impacts tmpLead = null;
        for (int i = 0; i < impactsEnums.length; ++i) {
          impacts[i] = impactsEnums[i].getImpacts();
          if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) {
            tmpLead = impacts[i];
          }
        }
        final Impacts lead = tmpLead;
        return new Impacts() {

          @Override
          public int numLevels() {
            // Delegate to the lead
            return lead.numLevels();
          }

          @Override
          public int getDocIdUpTo(int level) {
            // Delegate to the lead
            return lead.getDocIdUpTo(level);
          }

          /**
           * Return the minimum level whose impacts are valid up to {@code docIdUpTo},
           * or {@code -1} if there is no such level.
           */
          private int getLevel(Impacts impacts, int docIdUpTo) {
            for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
              if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
                return level;
              }
            }
            return -1;
          }

          @Override
          public List<Impact> getImpacts(int level) {
            final int docIdUpTo = getDocIdUpTo(level);

            List<List<Impact>> toMerge = new ArrayList<>();

            for (int i = 0; i < impactsEnums.length; ++i) {
              if (impactsEnums[i].docID() <= docIdUpTo) {
                int impactsLevel = getLevel(impacts[i], docIdUpTo);
                if (impactsLevel == -1) {
                  // One instance doesn't have impacts that cover up to docIdUpTo
                  // Return impacts that trigger the maximum score
                  return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
                }
                final List<Impact> impactList;
                if (boosts[i] != 1f) {
                  float boost = boosts[i];
                  impactList = impacts[i].getImpacts(impactsLevel)
                      .stream()
                      .map(impact -> new Impact((int) Math.ceil(impact.freq * boost), impact.norm))
                      .collect(Collectors.toList());
                } else {
                  impactList = impacts[i].getImpacts(impactsLevel);
                }
                toMerge.add(impactList);
              }
            }
            assert toMerge.size() > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong

            if (toMerge.size() == 1) {
              // common if one synonym is common and the other one is rare
              return toMerge.get(0);
            }

            PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) {
              @Override
              protected boolean lessThan(SubIterator a, SubIterator b) {
                if (a.current == null) { // means iteration is finished
                  return false;
                }
                if (b.current == null) {
                  return true;
                }
                return Long.compareUnsigned(a.current.norm, b.current.norm) < 0;
              }
            };
            for (List<Impact> impacts : toMerge) {
              pq.add(new SubIterator(impacts.iterator()));
            }

            List<Impact> mergedImpacts = new ArrayList<>();

            // Idea: merge impacts by norm. The tricky thing is that we need to
            // consider norm values that are not in the impacts too. For
            // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
            // there might well be a document that has a freq of 2 and a length of 11,
            // which was just not added to the list of impacts because {freq=2,norm=10}
            // is more competitive. So the way it works is that we track the sum of
            // the term freqs that we have seen so far in order to account for these
            // implicit impacts.
            long sumTf = 0;
            SubIterator top = pq.top();
            do {
              final long norm = top.current.norm;
              do {
                sumTf += top.current.freq - top.previousFreq;
                top.next();
                top = pq.updateTop();
              } while (top.current != null && top.current.norm == norm);

              final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
              if (mergedImpacts.isEmpty()) {
                mergedImpacts.add(new Impact(freqUpperBound, norm));
              } else {
                Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1);
                assert Long.compareUnsigned(prevImpact.norm, norm) < 0;
                if (freqUpperBound > prevImpact.freq) {
                  mergedImpacts.add(new Impact(freqUpperBound, norm));
                } // otherwise the previous impact is already more competitive
              }
            } while (top.current != null);

            return mergedImpacts;
          }
        };
      }

      @Override
      public void advanceShallow(int target) throws IOException {
        for (ImpactsEnum impactsEnum : impactsEnums) {
          if (impactsEnum.docID() < target) {
            impactsEnum.advanceShallow(target);
          }
        }
      }
    };
  }

  private static class SynonymScorer extends Scorer {

    private final DisiPriorityQueue queue;
    private final DocIdSetIterator iterator;
    private final ImpactsDISI impactsDisi;
    private final LeafSimScorer simScorer;

    SynonymScorer(Weight weight, DisiPriorityQueue queue, DocIdSetIterator iterator,
        ImpactsDISI impactsDisi, LeafSimScorer simScorer) {
      super(weight);
      this.queue = queue;
      this.iterator = iterator;
      this.impactsDisi = impactsDisi;
      this.simScorer = simScorer;
    }

    @Override
    public int docID() {
      return iterator.docID();
    }

    float freq() throws IOException {
      DisiWrapperFreq w = (DisiWrapperFreq) queue.topList();
      float freq = w.freq();
      for (w = (DisiWrapperFreq) w.next; w != null; w = (DisiWrapperFreq) w.next) {
        freq += w.freq();
      }
      return freq;
    }

    @Override
    public float score() throws IOException {
      return simScorer.score(iterator.docID(), freq());
    }

    @Override
    public DocIdSetIterator iterator() {
      return iterator;
    }

    @Override
    public float getMaxScore(int upTo) throws IOException {
      return impactsDisi.getMaxScore(upTo);
    }

    @Override
    public int advanceShallow(int target) throws IOException {
      return impactsDisi.advanceShallow(target);
    }

    @Override
    public void setMinCompetitiveScore(float minScore) {
      impactsDisi.setMinCompetitiveScore(minScore);
    }
  }

  private static class DisiWrapperFreq extends DisiWrapper {
    final PostingsEnum pe;
    final float boost;

    DisiWrapperFreq(Scorer scorer, float boost) {
      super(scorer);
      this.pe = (PostingsEnum) scorer.iterator();
      this.boost = boost;
    }

    float freq() throws IOException {
      return boost * pe.freq();
    }
  }

  private static class FreqBoostTermScorer extends FilterScorer {
    final float boost;
    final TermScorer in;
    final LeafSimScorer docScorer;

    public FreqBoostTermScorer(float boost, TermScorer in, LeafSimScorer docScorer) {
      super(in);
      if (Float.isNaN(boost) || Float.compare(boost, 0f) < 0 || Float.compare(boost, 1f) > 0) {
        throw new IllegalArgumentException(
            "boost must be a positive float between 0 (exclusive) and 1 (inclusive)");
      }
      this.boost = boost;
      this.in = in;
      this.docScorer = docScorer;
    }

    float freq() throws IOException {
      return boost * in.freq();
    }

    @Override
    public float score() throws IOException {
      assert docID() != DocIdSetIterator.NO_MORE_DOCS;
      return docScorer.score(in.docID(), freq());
    }

    @Override
    public float getMaxScore(int upTo) throws IOException {
      return in.getMaxScore(upTo);
    }

    @Override
    public int advanceShallow(int target) throws IOException {
      return in.advanceShallow(target);
    }

    @Override
    public void setMinCompetitiveScore(float minScore) throws IOException {
      in.setMinCompetitiveScore(minScore);
    }
  }

  private static class TermAndBoost implements Comparable<TermAndBoost> {
    final Term term;
    final float boost;

    TermAndBoost(Term term, float boost) {
      this.term = term;
      this.boost = boost;
    }

    Term getTerm() {
      return term;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;
      TermAndBoost that = (TermAndBoost) o;
      return Float.compare(that.boost, boost) == 0 &&
          Objects.equals(term, that.term);
    }

    @Override
    public int hashCode() {
      return Objects.hash(term, boost);
    }

    @Override
    public int compareTo(TermAndBoost o) {
      return term.compareTo(o.term);
    }
  }
}
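
For context, here is a minimal usage sketch. It is not part of the Lucene source above: it assumes a Lucene 8.x classpath and an already-built index, and the index path ("/path/to/index") and field name ("body") are placeholders to adjust for your setup. It shows the recommended Builder API, with an optional per-term boost that, per the checks in addTerm, must lie in (0, 1].

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class SynonymQueryExample {
  public static void main(String[] args) throws Exception {
    // Open an existing index; the path is a placeholder.
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
      IndexSearcher searcher = new IndexSearcher(reader);

      // Treat "television" and "tv" as one pseudo-term in the hypothetical "body" field.
      // All terms must target the Builder's field; "tv" gets a 0.8 frequency boost.
      SynonymQuery query = new SynonymQuery.Builder("body")
          .addTerm(new Term("body", "television"))
          .addTerm(new Term("body", "tv"), 0.8f)
          .build();

      // The similarity is invoked once per document over the summed (boosted) term frequencies.
      TopDocs hits = searcher.search(query, 10);
      System.out.println("total hits: " + hits.totalHits);
    }
  }
}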