Java tutorial
/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 *
 * Copyright (c) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 *
 * This file is part of Trombone.
 *
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Trombone. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.index;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.lucene.LucenePackage;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.InputStreamInputSource;
import org.voyanttools.trombone.lucene.LuceneManager;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;

/**
 * @author sgs
 */
public class LuceneIndexer implements Indexer {

    private Storage storage;

    private FlexibleParameters parameters;

    public LuceneIndexer(Storage storage, FlexibleParameters parameters) {
        this.storage = storage;
        this.parameters = parameters;
    }

    public String index(List<StoredDocumentSource> storedDocumentSources) throws IOException {

        // let's check if we need to create new sources because of tokenization parameters
        if (parameters.getParameterValue("tokenization", "").isEmpty() == false) {
            StoredDocumentSourceStorage sourceDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
            String params = parameters.getParameterValue("tokenization");
            for (int i = 0, len = storedDocumentSources.size(); i < len; i++) {
                StoredDocumentSource storedDocumentSource = storedDocumentSources.get(i);
                String id = storedDocumentSource.getId();
                String newId = DigestUtils.md5Hex(id + params);
                InputStream inputStream = sourceDocumentSourceStorage.getStoredDocumentSourceInputStream(id);
                DocumentMetadata metadata = storedDocumentSource.getMetadata();
                metadata.setLastTokenPositionIndex(TokenType.lexical, 0); // this is crucial to ensure that the document is re-analyzed and the metadata re-written
                InputSource inputSource = new InputStreamInputSource(newId, metadata, inputStream);
                storedDocumentSources.set(i, sourceDocumentSourceStorage.getStoredDocumentSource(inputSource));
                inputStream.close();
            }
        }

        List<String> ids = new ArrayList<String>();
        for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
            ids.add(storedDocumentSource.getId());
        }
        String corpusId = storage.storeStrings(ids);

        // determine if we need to modify the Lucene index
        Collection<StoredDocumentSource> storedDocumentSourceForLucene = new ArrayList<StoredDocumentSource>();
        if (storage.getLuceneManager().directoryExists()) {
            LeafReader reader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
            Terms terms = reader.terms("id");
            if (terms == null) {
                storedDocumentSourceForLucene.addAll(storedDocumentSources);
            } else {
                TermsEnum termsEnum = terms.iterator();
                for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
                    String id = storedDocumentSource.getId();
                    if (!termsEnum.seekExact(new BytesRef(id))) {
                        storedDocumentSourceForLucene.add(storedDocumentSource);
                    }
                }
            }
        } else {
            storedDocumentSourceForLucene.addAll(storedDocumentSources);
        }

        if (storedDocumentSourceForLucene.isEmpty() == false) {

            // index documents (or at least add corpus to document if not already there), we need to get a new writer
            IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter();
            DirectoryReader indexReader = DirectoryReader.open(indexWriter, true);
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            boolean verbose = parameters.getParameterBooleanValue("verbose");
            int processors = Runtime.getRuntime().availableProcessors();
            ExecutorService executor;

            // index
            executor = Executors.newFixedThreadPool(processors);
            for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
                Runnable worker = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher, storedDocumentSource, corpusId, verbose);
                executor.execute(worker);
            }
            executor.shutdown();
            try {
                if (!executor.awaitTermination(parameters.getParameterIntValue("luceneIndexingTimeout", 60 * 10), TimeUnit.SECONDS)) { // default 10 minutes
                    throw new InterruptedException("Lucene indexing has run out of time.");
                }
            } catch (InterruptedException e) {
                throw new RuntimeException("Lucene indexing has been interrupted.", e);
            } finally {
                try {
                    indexWriter.commit();
                } catch (IOException e) {
                    indexWriter.close(); // this may also throw an exception, but docs say to close on commit error
                    throw e;
                }
            }

            // this should almost never be called
            if (parameters.containsKey("forceMerge")) {
                indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
            }

            indexReader = DirectoryReader.open(indexWriter, true);
            storage.getLuceneManager().setDirectoryReader(indexReader); // make sure it's available afterwards

            // now determine which documents need to be analyzed
            Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
            for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
                if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                    storedDocumentSourceForAnalysis.add(storedDocumentSource);
                }
            }

            if (storedDocumentSourceForAnalysis.isEmpty() == false) {
                indexSearcher = new IndexSearcher(indexReader);
                executor = Executors.newFixedThreadPool(processors);
                for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForAnalysis) {
                    if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                        Runnable worker = new IndexedDocumentAnalyzer(storage, indexSearcher, storedDocumentSource, corpusId, verbose);
                        executor.execute(worker);
                    }
                }
                executor.shutdown();
                try {
                    if (!executor.awaitTermination(parameters.getParameterIntValue("luceneAnalysisTimeout", 60 * 10), TimeUnit.SECONDS)) { // default 10 minutes
                        throw new InterruptedException("Lucene analysis has run out of time.");
                    }
                } catch (InterruptedException e) {
                    throw new RuntimeException("Lucene document analysis has been interrupted.", e);
                }
            }
        }

        return corpusId;
    }

    private class IndexedDocumentAnalyzer implements Runnable {

        private Storage storage;
        private StoredDocumentSource storedDocumentSource;
        private IndexReader indexReader;
        private IndexSearcher indexSearcher;
        private String corpusId;
        private String id;
        private boolean verbose;

        public IndexedDocumentAnalyzer(Storage storage, IndexSearcher indexSearcher,
                StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
            this.storage = storage;
            this.indexReader = indexSearcher.getIndexReader();
            this.indexSearcher = indexSearcher;
            this.storedDocumentSource = storedDocumentSource;
            this.corpusId = corpusId;
            this.id = storedDocumentSource.getId();
            this.verbose = verbose;
        }

        @Override
        public void run() {
            if (verbose) {
                // System.out.println("analyzing indexed document "+storedDocumentSource.getMetadata());
            }
            Query query = new TermQuery(new Term("id", id));
            TopDocs topDocs;
            try {
                topDocs = indexSearcher.search(query, 1); // there may be multiple documents in the index but they should have the same text
                int docId = topDocs.scoreDocs[0].doc;
                Terms terms = indexReader.getTermVector(docId, "lexical");
                int totalTokens = 0;
                int totalTypes = 0;
                int lastOffset = 0;
                int lastPosition = 0;
                DescriptiveStatistics stats = new DescriptiveStatistics();
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator();
                    DocsAndPositionsEnum docsAndPositionsEnum = null;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term != null) {
                            totalTypes++;
                            docsAndPositionsEnum = termsEnum.docsAndPositions(new Bits.MatchAllBits(indexReader.maxDoc()), docsAndPositionsEnum, DocsAndPositionsEnum.FLAG_OFFSETS);
                            while (true) {
                                int doc = docsAndPositionsEnum.nextDoc();
                                if (doc != DocsAndPositionsEnum.NO_MORE_DOCS) {
                                    int freq = docsAndPositionsEnum.freq();
                                    stats.addValue(freq);
                                    totalTokens += freq;
                                    for (int i = 0; i < freq; i++) {
                                        int pos = docsAndPositionsEnum.nextPosition();
                                        if (pos > lastPosition) {
                                            lastPosition = pos;
                                        }
                                        int offset = docsAndPositionsEnum.startOffset();
                                        if (offset > lastOffset) {
                                            lastOffset = offset;
                                        }
                                    }
                                } else {
                                    break;
                                }
                            }
                        } else {
                            break;
                        }
                    }
                }
                DocumentMetadata metadata = storedDocumentSource.getMetadata();
                metadata.setTypesCount(TokenType.lexical, totalTypes);
                metadata.setTokensCount(TokenType.lexical, totalTokens);
                metadata.setTypesCountMean(TokenType.lexical, (float) stats.getMean());
                metadata.setTypesCountStdDev(TokenType.lexical, (float) stats.getStandardDeviation());
                metadata.setLastTokenPositionIndex(TokenType.lexical, lastPosition);
                metadata.setLastTokenOffsetIndex(TokenType.lexical, lastOffset);
                storage.getStoredDocumentSourceStorage().updateStoredDocumentSourceMetadata(id, metadata);
            } catch (IOException e) {
                throw new RuntimeException("Unable to query document during index analysis.", e);
            }
        }
    }

    private class StoredDocumentSourceIndexer implements Runnable {

        private Storage storage;
        private StoredDocumentSource storedDocumentSource;
        private IndexWriter indexWriter;
        private IndexSearcher indexSearcher;
        private LuceneManager luceneManager;
        private String corpusId;
        private String id;
        // private String string = null;
        private boolean verbose;

        public StoredDocumentSourceIndexer(Storage storage, IndexWriter indexWriter, IndexSearcher indexSearcher,
                StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
            this.storage = storage;
            this.indexWriter = indexWriter;
            this.indexSearcher = indexSearcher;
            this.storedDocumentSource = storedDocumentSource;
            this.luceneManager = storage.getLuceneManager();
            this.corpusId = corpusId;
            this.id = storedDocumentSource.getId();
            this.verbose = verbose;
        }

        private String getString() throws IOException {
            String string = "";
            // if (this.string == null) {
            InputStream is = null;
            try {
                is = storage.getStoredDocumentSourceStorage().getStoredDocumentSourceInputStream(id);
                StringWriter sw = new StringWriter();
                IOUtils.copy(is, sw);
                string = sw.toString();
            } finally {
                if (is != null) is.close();
            }
            // }
            return string;
        }

        @Override
        public void run() {
            if (verbose) {
                // System.out.println("indexing "+storedDocumentSource.getMetadata());
            }
            try {
                TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("id", id)), 1);
                if (topDocs.totalHits > 0) { // already indexed
                    return;
                }

                // this is used by lexical and the metadata (expecting term vectors to be present)
                FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
                ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                ft.setStoreTermVectors(true);
                ft.setStoreTermVectorOffsets(true);
                ft.setStoreTermVectorPositions(true);

                // create lexical document
                Document document = new Document();
                document.add(new StringField("id", id, Field.Store.NO));
                // document.add(new StringField("corpus", corpusId, Field.Store.NO));
                document.add(new StringField("version", LucenePackage.get().getImplementationVersion(), Field.Store.YES));
                FlexibleParameters p = new FlexibleParameters();
                p.setParameter("language", storedDocumentSource.getMetadata().getLanguageCode());
                if (parameters.getParameterValue("tokenization", "").isEmpty() == false) {
                    p.setParameter("tokenization", parameters.getParameterValue("tokenization"));
                }
                document.add(new Field("lexical", getString() + "<!-- " + p.getAsQueryString() + " -->", ft));
                // System.err.println(id+": "+getString());
                FlexibleParameters params = storedDocumentSource.getMetadata().getFlexibleParameters();
                for (String key : params.getKeys()) {
                    // store term vector so that we can build term DB
                    String v = params.getParameterValue(key);
                    if (v != null && v.trim().isEmpty() == false) {
                        document.add(new Field(key, v, ft));
                    }
                    for (String value : params.getParameterValues(key)) {
                        if (value.trim().isEmpty() == false) {
                            // store as facet field
                            document.add(new SortedSetDocValuesField("facet." + key, new BytesRef(value)));
                        }
                    }
                }

                // TODO: add lemmatization
                /*
                if (storedDocumentSource.getMetadata().getLanguageCode().equals("en")) { // FIXME: deal with other lemmatization languages
                    document.add(new Field("lemmatized-en", getString(), ft));
                } else { // next look for stemmed index if needed
                    String lang = storedDocumentSource.getMetadata().getLanguageCode();
                    StemmableLanguage stemmableLanguage = StemmableLanguage.fromCode(lang);
                    if (stemmableLanguage != null) {
                        document.add(new Field("stemmed-" + lang, getString(), ft));
                    }
                }
                */

                indexWriter.addDocument(document);
            } catch (IOException e) {
                throw new RuntimeException("Unable to index stored document: " + storedDocumentSource, e);
            }
        }
    }
}
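From a caller's perspective, the class above is driven entirely by its two-argument constructor and the index() method: given a Storage implementation, a FlexibleParameters instance, and the stored document sources to index, index() builds (or reuses) the Lucene index and returns a corpus id. The sketch below is not part of Trombone itself; it only shows how those pieces fit together. The class name LuceneIndexerExample and the method indexSources are hypothetical, the storage and storedDocumentSources arguments are assumed to be produced elsewhere (for example by Trombone's document extraction steps), and the parameter values shown are illustrative rather than required.

import java.io.IOException;
import java.util.List;

import org.voyanttools.trombone.input.index.LuceneIndexer;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.util.FlexibleParameters;

public class LuceneIndexerExample {

    // Hypothetical helper: storage and storedDocumentSources are assumed to be
    // created elsewhere; this only wires them into the indexer shown above.
    public static String indexSources(Storage storage, List<StoredDocumentSource> storedDocumentSources)
            throws IOException {
        FlexibleParameters parameters = new FlexibleParameters();
        // read by index() via getParameterBooleanValue("verbose")
        parameters.setParameter("verbose", "false");
        // optionally force re-tokenization; any non-empty value triggers the
        // "tokenization" branch at the top of index() (value here is illustrative only)
        // parameters.setParameter("tokenization", "...");

        LuceneIndexer indexer = new LuceneIndexer(storage, parameters);
        return indexer.index(storedDocumentSources); // returns the stored corpus id
    }
}

The returned corpus id is the value of storage.storeStrings(ids) computed inside index(), so it can be handed to whatever downstream component expects a corpus identifier.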