Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.searcher; import org.apache.lucene.search.*; import org.apache.lucene.search.queries.PwaSortQuery; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.QueryFilter; import org.apache.lucene.index.Term; import org.apache.lucene.misc.ChainedFilter; import org.apache.nutch.searcher.DistributedSearch; import org.apache.nutch.global.Global; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import java.util.LinkedHashMap; import java.util.Map; import java.util.ArrayList; import java.io.IOException; /** Utility which converts certain query clauses into {@link QueryFilter}s and * caches these. Only required clauses whose boost is zero are converted to * cached filters. Range queries are converted to range filters. This * accellerates query constraints like date, language, document format, etc., * which do not affect ranking but might otherwise slow search considerably. */ class LuceneQueryOptimizer { public static final Log LOG = LogFactory.getLog(LuceneQueryOptimizer.class); // This thread provides a pseudo-clock service to all searching // threads, so that they can count elapsed time with less overhead than // repeatedly calling System.currentTimeMillis. private TimerThread timerThread = null; private static class TimerThread extends Thread { private int tick; // NOTE: we can avoid explicit synchronization here for several reasons: // * updates to 32-bit-sized variables are atomic // * only single thread modifies this value // * use of volatile keyword ensures that it does not reside in // a register, but in main memory (so that changes are visible to // other threads). // * visibility of changes does not need to be instantanous, we can // afford losing a tick or two. // // See section 17 of the Java Language Specification for details. public volatile int timeCounter = 0; boolean running = true; public TimerThread(int tick) { super("LQO timer thread"); this.tick = tick; this.setDaemon(true); } public void run() { while (running) { timeCounter++; try { Thread.sleep(tick); } catch (InterruptedException ie) { // ignore } ; } } } private void initTimerThread(int p) { if (timerThread == null || !timerThread.isAlive()) { timerThread = new TimerThread(p); timerThread.start(); } } private static class TimeExceeded extends RuntimeException { public long maxTime; private int maxDoc; public TimeExceeded(long maxTime, int maxDoc) { super("Exceeded search time: " + maxTime + " ms."); this.maxTime = maxTime; this.maxDoc = maxDoc; } } private static class LimitedCollector extends TopDocCollector { private int maxHits; private int maxTicks; private int startTicks; private TimerThread timer; private int curTicks; public LimitedCollector(int numHits, int maxHits, int maxTicks, TimerThread timer, boolean reverse) { super(numHits, reverse); this.maxHits = maxHits; this.maxTicks = maxTicks; if (timer != null) { this.timer = timer; this.startTicks = timer.timeCounter; } } public void collect(int doc, float score) { if (maxHits > 0 && getTotalHits() >= maxHits) { throw new LimitExceeded(doc); } if (timer != null) { curTicks = timer.timeCounter; // overflow check if (curTicks < startTicks) curTicks += Integer.MAX_VALUE; if (curTicks - startTicks > maxTicks) { throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc); } } super.collect(doc, score); } } private static class LimitExceeded extends RuntimeException { private int maxDoc; public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; } } private float threshold; private int maxFulltextMatchesRanked; private int tickLength; private int maxTickCount; private int timeoutResponse; private String cacheType; /** * Construct an optimizer that caches and uses filters for required clauses * whose boost is zero. * * @param cacheSize * the number of QueryFilters to cache * @param threshold * the fraction of documents which must contain a term */ public LuceneQueryOptimizer(Configuration conf) { final int cacheSize = conf.getInt("searcher.filter.cache.size", 16); this.threshold = conf.getFloat("searcher.filter.cache.threshold", 0.05f); this.tickLength = conf.getInt("searcher.max.time.tick_length", 200); this.maxTickCount = conf.getInt("searcher.max.time.tick_count", -1); this.maxFulltextMatchesRanked = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RANKED, -1); this.timeoutResponse = conf.getInt(Global.TIMEOUT_INDEX_SERVERS_RESPONSE, -1); if (timeoutResponse > 0) { this.maxTickCount = timeoutResponse; this.tickLength = 1000; } if (this.maxTickCount > 0) { initTimerThread(this.tickLength); } } public TopDocs optimize(BooleanQuery original, Searcher searcher, int numHits, String sortField, boolean reverse) throws IOException { BooleanQuery query = new BooleanQuery(); Filter filter = null; BooleanClause[] clauses = original.getClauses(); for (int i = 0; i < clauses.length; i++) { BooleanClause c = clauses[i]; if (c.isRequired() && c.getQuery().getBoost() == 0.0f) { // boost is zero if (c.getQuery() instanceof TermQuery // TermQuery && (searcher.docFreq(((TermQuery) c.getQuery()).getTerm()) / (float) searcher.maxDoc()) < threshold) { // beneath threshold query.add(c); } else if (c.getQuery() instanceof RangeQuery) { // RangeQuery query.add(c); } } else { query.add(c); // query it } } query.setFunctions(original.getFunctions()); if (sortField != null) { // to sort result by sortField query.add(new PwaSortQuery(sortField, reverse), BooleanClause.Occur.MUST); } // print query LOG.info("Query:" + query.toString()); // no hit limit if (this.maxFulltextMatchesRanked <= 0 && timerThread == null) { return searcher.search(query, filter, numHits); } // hits limited in time or in count -- use a LimitedCollector LimitedCollector collector = new LimitedCollector(numHits, maxFulltextMatchesRanked, maxTickCount, timerThread, (sortField != null) ? !reverse : reverse); LimitExceeded exceeded = null; TimeExceeded timeExceeded = null; try { searcher.search(query, filter, collector); } catch (LimitExceeded le) { exceeded = le; } catch (TimeExceeded te) { timeExceeded = te; } TopDocs results = collector.topDocs(); if (exceeded != null) { // limit was exceeded results.totalHits = (int) (results.totalHits * (searcher.maxDoc() / (float) exceeded.maxDoc)); // estimate totalHits } else if (timeExceeded != null) { results.totalHits = (int) (results.totalHits * (searcher.maxDoc() / (float) timeExceeded.maxDoc)); } return results; } /** * @param numHits number of top results * @param maxFulltextMatchesRanked number of matched documents for ranking */ public TopDocs optimize(BooleanQuery original, Searcher searcher, int numHits, int maxFulltextMatchesRanked, String sortField, boolean reverse) throws IOException { if (maxFulltextMatchesRanked != NutchBean.MATCHED_DOCS_CONST_IGNORE) { this.maxFulltextMatchesRanked = maxFulltextMatchesRanked; } return optimize(original, searcher, numHits, sortField, reverse); } }