Java tutorial: BaseSearcher from DBpedia Spotlight

This walkthrough looks at org.dbpedia.spotlight.lucene.search.BaseSearcher, the base class of DBpedia Spotlight's Lucene search layer. It wraps a Lucene IndexReader/IndexSearcher pair and maps Lucene documents back to Spotlight model objects such as DBpediaResource and SurfaceForm. The full source follows.
/**
 * Copyright 2011 Pablo Mendes, Max Jakob
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dbpedia.spotlight.lucene.search;

import com.google.common.collect.Ordering;
import com.google.common.primitives.Ints;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.MapFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.dbpedia.spotlight.exceptions.SearchException;
import org.dbpedia.spotlight.exceptions.TimeoutException;
import org.dbpedia.spotlight.lucene.LuceneFeatureVector;
import org.dbpedia.spotlight.lucene.LuceneManager;
import org.dbpedia.spotlight.model.*;
import org.dbpedia.spotlight.model.vsm.FeatureVector;

import java.io.Closeable;
import java.io.IOException;
import java.util.*;

/**
 * This class manages an index from surface form to candidate resources (surrogates).
 *
 * @author pablomendes
 */
public class BaseSearcher implements Closeable {

    protected final Log LOG = LogFactory.getLog(getClass());

    LuceneManager mLucene;
    IndexSearcher mSearcher;
    public IndexReader mReader;

    //TODO create a method that iterates over all documents in the index and computes this.
    //     (if it takes too long, think about storing it somewhere at indexing time)
    private double mNumberOfOccurrences = 69772256;

    protected BaseSearcher() { } // allows subclasses to rewrite the constructor below

    public long objectCreationTime = 0;

    public BaseSearcher(LuceneManager lucene) throws IOException {
        this.mLucene = lucene;
        LOG.info("Using index at: " + this.mLucene.mContextIndexDir);
        LOG.debug("Opening IndexSearcher and IndexReader for Lucene directory " + this.mLucene.mContextIndexDir + " ...");
        this.mReader = IndexReader.open(this.mLucene.mContextIndexDir, true); // read-only=true
        this.mSearcher = new IndexSearcher(this.mReader);
        LOG.debug("Done.");

        // If the DBpediaResourceFactory uses SQL, the mapping from Lucene docID to URI must be cached:
        if (mLucene.getDBpediaResourceFactory() instanceof DBpediaResourceFactorySQL) {
            /* Here the cache for the URIs is touched (but not read) for the first time;
               it will be created from the index and held within the Lucene cache manager (FieldCache). */
            LOG.debug("Caching all URIs");
            FieldCache.DEFAULT.getStrings(mReader, LuceneManager.DBpediaResourceField.URI.toString());
            LOG.debug("Done.");
        }
    }

    public int getNumberOfEntries() {
        return this.mReader.numDocs(); // can use maxDoc?
    }
    public double getNumberOfOccurrences() {
        return this.mNumberOfOccurrences;
    }

    /**
     * Generic search method that is used by the other public methods in this class.
     * @param query
     * @return list of feature vectors, one per hit
     * @throws SearchException
     */
    protected List<FeatureVector> search(Query query) throws SearchException {
        // List of results
        List<FeatureVector> surrogates = new ArrayList<FeatureVector>();

        // Iterate through the results:
        for (ScoreDoc hit : getHits(query)) {
            DBpediaResource resource = getDBpediaResource(hit.doc);
            TermFreqVector vector = getVector(hit.doc);
            surrogates.add(new LuceneFeatureVector(resource, vector));
        }
        //LOG.debug(surrogates.size()+" hits found.");
        return surrogates;
    }

    public boolean isDeleted(int docNo) {
        return mReader.isDeleted(docNo);
    }

    public Document getFullDocument(int docNo) throws SearchException {
        Document document;
        try {
            document = mReader.document(docNo);
        } catch (IOException e) {
            throw new SearchException("Error reading document " + docNo, e);
        }
        return document;
    }

    public Document getDocument(int docNo, FieldSelector selector) throws SearchException {
        Document document;
        try {
            document = mReader.document(docNo, selector);
        } catch (IOException e) {
            throw new SearchException("Error reading document " + docNo, e);
        }
        //LOG.debug("docNo:"+docNo);
        return document;
    }

    public List<Document> getDocuments(DBpediaResource res, FieldSelector fieldSelector) throws SearchException {
        //LOG.trace("Retrieving documents for resource: "+res);

        // search the index for the resource
        List<Document> documents = new ArrayList<Document>();

        // Iterate through the results:
        for (ScoreDoc hit : getHits(mLucene.getQuery(res))) {
            documents.add(getDocument(hit.doc, fieldSelector));
        }
        //LOG.debug(documents.size()+" documents found.");

        // return the set of documents found for this resource
        return documents;
    }

    /**
     * Basic search method used by all searches against the index.
     * @param query
     * @param n maximum number of hits to collect
     * @param timeout number of milliseconds before giving up on this query
     * @param filter optional filter (may be null)
     * @return array of (document id, score) pairs
     * @throws SearchException
     */
    public ScoreDoc[] getHits(Query query, int n, int timeout, Filter filter) throws SearchException {
        ScoreDoc[] hits = null;
        try {
            //LOG.debug("Start search. timeout="+timeout);
            long start = System.nanoTime();
            TopScoreDocCollector collector = TopScoreDocCollector.create(n, false);
            //TimeLimitingCollector collector = new TimeLimitingCollector(tCollector, timeout); //TODO try to bring this back later
            mSearcher.search(query, filter, collector);
            hits = collector.topDocs().scoreDocs;
            long end = System.nanoTime();
            LOG.debug(String.format("Done search in %f ms. hits.length=%d", (end - start) / 1000000.0, hits.length));
        } catch (TimeLimitingCollector.TimeExceededException timedOutException) {
            throw new TimeoutException("Timeout (>" + timeout + "ms) searching for surface form " + query.toString(), timedOutException);
        } catch (Exception e) {
            throw new SearchException("Error searching for surface form " + query.toString(), e);
        }
        //LOG.debug(hits.length+" hits found.");
        return hits;
    }

    /**
     * Search method with the default (null) filter.
     * @param query
     * @param n number of results to return
     * @param timeout number of milliseconds before giving up on this query
     * @return array of (document id, score) pairs
     * @throws SearchException
     */
    public ScoreDoc[] getHits(Query query, int n, int timeout) throws SearchException {
        Filter filter = null; //TODO surfaceForm filter here?
        return getHits(query, n, timeout, filter);
    }
    // Uses the default timeout
    public ScoreDoc[] getHits(Query query, int n) throws SearchException {
        return getHits(query, n, 5000);
    }

    // Uses the default maxHits and timeout
    public ScoreDoc[] getHits(Query query) throws SearchException {
        return getHits(query, mLucene.topResultsLimit());
    }

    /**
     * Retrieves all DBpedia resources in the index that are within a set of allowed URIs and match the input text.
     * Uses Lucene's MoreLikeThis rather than ICF as defined in DBpedia Spotlight's paper.
     * It is faster, but does not take into account the selectional preferences of words with respect to resources.
     *
     * @param context text containing a URI mention
     * @param resources allowed URIs
     * @return array of (document id, score) pairs
     */
    public ScoreDoc[] getHits(Text context, Set<DBpediaResource> resources) throws SearchException {
        try {
            return getHits(mLucene.getQuery(context, resources, this.mReader));
        } catch (IOException e) {
            throw new SearchException("Error while executing query. ", e);
        }
    }

    public TermFreqVector getVector(int docNo) throws SearchException {
        TermFreqVector vector = null;
        try {
            vector = mReader.getTermFreqVector(docNo, LuceneManager.DBpediaResourceField.CONTEXT.toString());
            if (vector == null)
                throw new IllegalStateException("TermFreqVector for document " + docNo + " is null.");
        } catch (IOException e) {
            throw new SearchException("Error reading TermFreqVector for surrogate " + docNo, e);
        }
        //LOG.debug("vector:"+vector);
        return vector;
    }

    @Override
    public void close() throws IOException {
        //LOG.debug("Closing searcher.");
        mSearcher.close();
        //mReader.close();
    }

    /**
     * Creates a DBpediaResource with all fields stored in the index.
     * @deprecated Use getDBpediaResource(int docNo, String[] fieldsToLoad).
     * TODO REMOVE. This is bad because the fields being loaded are opaque to the caller: some callers need only the URI, others need everything.
     * @param docNo
     * @return
     */
    public DBpediaResource getDBpediaResource(int docNo) throws SearchException {
        DBpediaResource r = null;
        String method = "";
        long start = System.nanoTime();

        DBpediaResourceFactory f = mLucene.getDBpediaResourceFactory();
        if (f == null) {
            method = "lucene";
            String[] fields = { LuceneManager.DBpediaResourceField.TYPE.toString(),
                                LuceneManager.DBpediaResourceField.URI.toString(),
                                LuceneManager.DBpediaResourceField.URI_COUNT.toString() };
            r = getDBpediaResource(docNo, fields); // load all available info from Lucene
        } else {
            method = "database";
            //String[] fields = { LuceneManager.DBpediaResourceField.URI.toString() };
            //r = getDBpediaResource(docNo, fields);               // load only the URI from Lucene
            r = getCachedDBpediaResource(docNo);                    // load the URI from Lucene's cache
            r = mLucene.getDBpediaResourceFactory().from(r.uri()); // load the rest of the info from the DB
        }

        long end = System.nanoTime();
        //LOG.debug(String.format("DBpediaResource (%s) creation with %s took %f ms.", r.uri(), method, (end-start) / 1000000.0));
        objectCreationTime += (end - start);
        return r;
    }

    /**
     * This is an experimental method for evaluating the feasibility of caching all URIs in Lucene.
     * @param docNo
     * @return
     * @throws SearchException
     */
    public DBpediaResource getCachedDBpediaResource(int docNo) throws SearchException {
        try {
            String[] uris = FieldCache.DEFAULT.getStrings(mReader, LuceneManager.DBpediaResourceField.URI.toString());
            return new DBpediaResource(uris[docNo]);
        } catch (IOException e) {
            throw new SearchException("Error getting cached DBpediaResource.", e);
        }
    }

    /**
     * Used by the CONTEXT SEARCHER and the SURROGATE SEARCHER.
     * Loads only a few fields (faster).
     * TODO FACTORY move to Factory
     * @param docNo
     * @return
     * @throws SearchException
     */
    public DBpediaResource getDBpediaResource(int docNo, String[] fieldsToLoad) throws SearchException {
        FieldSelector fieldSelector = new MapFieldSelector(fieldsToLoad);
        Document document = getDocument(docNo, fieldSelector);

        Field uriField = document.getField(LuceneManager.DBpediaResourceField.URI.toString());
        if (uriField == null)
            throw new SearchException("Cannot find URI for document " + document);
        String uri = uriField.stringValue();
        if (uri == null)
            throw new SearchException("Cannot find URI for document " + document);

        DBpediaResource resource = new DBpediaResource(uri);
        for (String fieldName : fieldsToLoad) {
            Field field = document.getField(fieldName);
            if (field != null)
                Factory.setField(resource, LuceneManager.DBpediaResourceField.valueOf(fieldName), document);
        }
        if (resource.prior() == 0.0) { // adjust the prior: support / total number of occurrences
            resource.setPrior(resource.support() / this.getNumberOfOccurrences());
        }
        return resource;
    }

    // Returns the first URI that can be found in document number docNo (old, superseded by Factory.setField)
    //TODO move to Factory
    //TODO why is this overriding BaseSearcher? can it be merged?
//    public DBpediaResource getDBpediaResource(int docNo) throws SearchException {
//
//        FieldSelector fieldSelector = new MapFieldSelector(onlyUriAndTypes);
//
//        LOG.trace("Getting document number " + docNo + "...");
//        Document document = createDocument(docNo, fieldSelector);
//        String uri = document.get(LuceneManager.DBpediaResourceField.URI.toString());
//        if (uri == null)
//            throw new SearchException("Cannot find URI for document " + document);
//
//        LOG.trace("Setting URI, types and support...");
//        DBpediaResource resource = new DBpediaResource(uri);
//        resource.setTypes(getDBpediaTypes(document));
//        resource.setSupport(getSupport(document)); //TODO this can be optimized for time performance by adding a support field. (searching for the most likely URI then becomes a bit more complicated)
//
//        //LOG.debug("uri:"+uri);
//        return resource;
//    }

    public SurfaceForm getSurfaceForm(int docNo) throws SearchException {
        String[] onlySurfaceForm = { LuceneManager.DBpediaResourceField.SURFACE_FORM.toString() };
        FieldSelector fieldSelector = new MapFieldSelector(onlySurfaceForm);
        Document document = getDocument(docNo, fieldSelector);

        Field sfField = document.getField(LuceneManager.DBpediaResourceField.SURFACE_FORM.toString());
        if (sfField == null)
            throw new SearchException("Cannot find SurfaceForm for document " + document);
        String sf = sfField.stringValue();
        if (sf == null)
            throw new SearchException("Cannot find SurfaceForm for document " + document);

        //LOG.debug("sf:"+sf);
        return new SurfaceForm(sf);
    }

    public boolean isContainedInIndex(SurfaceForm sf) {
        try {
            if (getHits(mLucene.getQuery(sf), 1).length > 0)
                return true;
        } catch (SearchException e) {
            LOG.info("SearchException in isContainedInIndex(" + sf + "): " + e);
        }
        return false;
    }

//    public Document termDocs(DBpediaResource resource) throws IOException {
//        Term uriTerm = new Term(LuceneManager.DBpediaResourceField.URI.toString(), resource.uri());
//        mReader.terms(uriTerm);
//        TermDocs t = mReader.termDocs();
//        return null;
//    }

    /**
     * Computes a document frequency map for all terms in the given index and returns
     * the entries sorted by frequency in descending order.
     * @param mReader index reader to collect the terms from
     * @return list of (term, document frequency) entries, most frequent first
     * @throws IOException if one is thrown.
     * @author sujitpal (computeTopTermQuery in http://sujitpal.blogspot.com/2009/02/summarization-with-lucene.html)
     * @author pablomendes adapted from sujitpal
     */
    public static List<Map.Entry<Term, Integer>> getTopTerms(IndexReader mReader) throws IOException {
        final Map<Term, Integer> frequencyMap = new HashMap<Term, Integer>();

        TermEnum terms = mReader.terms(); //TODO check what we can do about fields here. should we keep only top terms for the context field?
        while (terms.next()) {
            Term term = terms.term();
            int frequency = mReader.docFreq(term); // DF
            frequencyMap.put(term, frequency);
        }

        // sort the term map by frequency, descending
        Ordering<Map.Entry<Term, Integer>> descOrder = new Ordering<Map.Entry<Term, Integer>>() {
            public int compare(Map.Entry<Term, Integer> left, Map.Entry<Term, Integer> right) {
                return Ints.compare(right.getValue(), left.getValue());
            }
        };
        return descOrder.sortedCopy(frequencyMap.entrySet());
    }

    /**
     * Warms up the index with the n most common terms.
     *
     * TODO warm up properly with surface form + context.
     * Currently this only queries the most common terms (which are probably all from the CONTEXT field), so not much gain is seen for the ICF.
     * Best would be to get a list of the X most common surface forms and execute a query combining each surface form with the top Y most common context terms.
     * @param n number of top terms to query
     */
    public void warmUp(int n) {
        try {
            long start = System.nanoTime();
            List<Map.Entry<Term, Integer>> terms = getTopTerms(mReader);
            LOG.info(String.format("Index has %s terms. Will warm up cache with the %s top terms.", terms.size(), n));
            for (int i = 0; i < n; i++) {
                Term t = terms.get(i).getKey();
                //TODO For a second-level cache warm-up we need surface forms and context together, but this is app dependent.
                //String[] commonSurfaceForms = {"*"}; //get a list of surface forms from somewhere
                //for (String sf : commonSurfaceForms) {
                //    getHits(new CandidateResourceQuery(new Term(LuceneManager.DBpediaResourceField.SURFACE_FORM.toString(), sf), t));
                //}
                //int count = terms.get(i).getValue();
                //LOG.trace(String.format("Text: %s, Count: %s", t.text(), count));
                getHits(new TermQuery(t), 3, 1000); // warm up the first-level cache (Lucene's own)
            }
            long seconds = (System.nanoTime() - start) / 1000000000;
            //Files.write(Joiner.on("\n").join(terms), new File("/home/pablo/workspace/dbpa/trunk/src/web/topterms.tsv"), Charset.defaultCharset()); //TODO use one charset consistently throughout
            LOG.info(String.format("Warm up took %s s.", seconds));
        } catch (Exception e) {
            LOG.error("Error warming up the cache. Ignoring."); //TODO throw SetupException
            e.printStackTrace();
        }
    }

    /**
     * For testing QueryAutoStopWordsAnalyzer.
     * @return the underlying IndexReader
     */
    public IndexReader getIndexReader() {
        return mReader;
    }
}
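To see how the pieces fit together, here is a minimal usage sketch (not part of the original file). It assumes an existing Spotlight index on disk and that LuceneManager can be constructed from a Lucene Directory, as elsewhere in the Spotlight codebase; the index path and the example class name are placeholders, so check the constructors in your checkout before running it.

import org.apache.lucene.store.FSDirectory;
import org.dbpedia.spotlight.lucene.LuceneManager;
import org.dbpedia.spotlight.lucene.search.BaseSearcher;
import org.dbpedia.spotlight.model.SurfaceForm;

import java.io.File;

public class BaseSearcherExample {
    public static void main(String[] args) throws Exception {
        // "/data/spotlight/index" is a placeholder path, and the LuceneManager
        // constructor taking a Directory is an assumption -- verify it against your checkout.
        LuceneManager lucene = new LuceneManager(FSDirectory.open(new File("/data/spotlight/index")));
        BaseSearcher searcher = new BaseSearcher(lucene);
        try {
            System.out.println("Documents in index: " + searcher.getNumberOfEntries());

            // Check whether a surface form occurs in the index at all:
            SurfaceForm sf = new SurfaceForm("Berlin");
            System.out.println("'" + sf + "' in index? " + searcher.isContainedInIndex(sf));

            // Optionally pre-populate Lucene's caches with the 100 most frequent terms:
            searcher.warmUp(100);
        } finally {
            searcher.close(); // closes the underlying IndexSearcher
        }
    }
}

Note that close() only closes the IndexSearcher; the IndexReader is left open (see the commented-out mReader.close() above), so callers that open many BaseSearcher instances over the same reader should manage the reader's lifecycle themselves.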