// NOTE(review): stray artifact line ("Java tutorial") commented out — it is not valid Java and broke compilation.
/*******************************************************************************
 * Copyright (c) 2010-2014 SAP AG and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     SAP AG - initial API and implementation
 *******************************************************************************/
package org.eclipse.skalli.core.search;

import java.io.Closeable;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.UUID;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.text.StrBuilder;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.eclipse.skalli.model.EntityBase;
import org.eclipse.skalli.model.ExtensibleEntityBase;
import org.eclipse.skalli.services.entity.EntityService;
import org.eclipse.skalli.services.extension.ExtensionService;
import org.eclipse.skalli.services.extension.ExtensionServices;
import org.eclipse.skalli.services.extension.Indexer;
import org.eclipse.skalli.services.search.FacetedSearchResult;
import org.eclipse.skalli.services.search.IndexEntry;
import org.eclipse.skalli.services.search.PagingInfo;
import org.eclipse.skalli.services.search.QueryParseException;
import org.eclipse.skalli.services.search.SearchHit;
import org.eclipse.skalli.services.search.SearchResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * In-memory Lucene full-text index over the entities of a single {@link EntityService}.
 * <p>
 * The index lives in a {@link RAMDirectory} and is rebuilt from scratch by
 * {@link #reindexAll()} / {@link #reindex(Collection)}; incremental changes are applied
 * with {@link #update(Collection)} and {@link #remove(Collection)}. Until one of the
 * reindex methods has run, search-related methods operate on an empty/uninitialized index.
 * <p>
 * All public methods are {@code synchronized}, so concurrent callers are serialized on
 * this instance.
 *
 * @param <T> the entity type managed by the backing {@link EntityService}
 */
public class LuceneIndex<T extends EntityBase> {

    private static final Version LUCENE_VERSION = Version.LUCENE_30;
    private static final Logger LOG = LoggerFactory.getLogger(LuceneIndex.class);

    /** Formatter used to wrap highlighted query matches in search results. */
    private static final SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>"); //$NON-NLS-1$//$NON-NLS-2$

    /** Internal document field storing the entity's UUID (used to map hits back to entities). */
    private static final String FIELD_UUID = "_uuid"; //$NON-NLS-1$

    private static final int NUMBER_BEST_FRAGMENTS = 3; //TODO this is a candidate for configuration

    // Replaced wholesale on each full reindex; never persisted to disk.
    private Directory directory = new RAMDirectory();

    private Analyzer analyzer = new LimitTokenCountAnalyzer(new StandardAnalyzer(LUCENE_VERSION), Integer.MAX_VALUE);

    /** True once an initial (re)index has been performed; searches/removals are no-ops before that. */
    private boolean initialized;

    private final EntityService<T> entityService;

    /**
     * Creates an index bound to the given entity service. No indexing happens here;
     * call {@link #reindexAll()} or {@link #reindex(Collection)} first.
     */
    public LuceneIndex(EntityService<T> entityService) {
        this.entityService = entityService;
    }

    /**
     * Discards the current index and rebuilds it from all entities of the entity service.
     */
    public synchronized void reindexAll() {
        directory = new RAMDirectory();
        addEntitiesToIndex(entityService.getAll());
        initialized = true;
    }

    /**
     * Discards the current index and rebuilds it from the given entities only.
     *
     * @param entities the entities to index
     */
    public synchronized void reindex(Collection<T> entities) {
        directory = new RAMDirectory();
        addEntitiesToIndex(entities);
        initialized = true;
    }

    /**
     * Collects the index entries for an entity by asking the {@link Indexer} of each
     * registered extension service. The entity and all of its extensions (breadth-first)
     * are offered to every extension service whose extension class matches exactly.
     *
     * @return the collected index entries (possibly empty, never {@code null})
     */
    private List<IndexEntry> indexEntity(T entity) {
        List<IndexEntry> fields = new LinkedList<IndexEntry>();
        Queue<EntityBase> queue = new LinkedList<EntityBase>();
        queue.add(entity);
        while (!queue.isEmpty()) {
            EntityBase currentEntity = queue.poll();
            for (ExtensionService<?> extensionService : ExtensionServices.getAll()) {
                if (currentEntity.getClass().equals(extensionService.getExtensionClass())) {
                    Indexer<?> indexer = extensionService.getIndexer();
                    if (indexer != null) {
                        indexer.indexEntity(fields, currentEntity);
                    }
                }
            }
            if (currentEntity instanceof ExtensibleEntityBase) {
                queue.addAll(((ExtensibleEntityBase) currentEntity).getAllExtensions());
            }
        }
        return fields;
    }

    /**
     * Converts the entity to a Lucene document (including the {@link #FIELD_UUID} key field)
     * and adds it to the given writer.
     */
    private void addEntityToIndex(IndexWriter writer, T entity) throws IOException {
        List<IndexEntry> fields = indexEntity(entity);
        Document doc = LuceneUtil.fieldsToDocument(fields);
        doc.add(new Field(FIELD_UUID, entity.getUuid().toString(), Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
    }

    /**
     * Converts entities to search hits without touching the Lucene index
     * (used for the "match all" query shortcut).
     */
    List<SearchHit<T>> entitiesToHit(Collection<T> entities) {
        List<SearchHit<T>> ret = new LinkedList<SearchHit<T>>();
        for (T entity : entities) {
            ret.add(entityToHit(entity));
        }
        return ret;
    }

    /**
     * Converts a single entity to a search hit by running the indexers and grouping the
     * resulting values by field name. Stored and "highlighted" values are identical here,
     * since no query is involved.
     *
     * @return the hit, or {@code null} if {@code entity} is {@code null}
     */
    SearchHit<T> entityToHit(T entity) {
        if (entity == null) {
            return null;
        }
        List<IndexEntry> fields = indexEntity(entity);
        Map<String, List<String>> storedValues = new HashMap<String, List<String>>();
        for (IndexEntry entry : fields) {
            List<String> list = storedValues.get(entry.getFieldName());
            if (list == null) {
                list = new LinkedList<String>();
                storedValues.put(entry.getFieldName(), list);
            }
            list.add(entry.getValue());
        }
        SearchHit<T> ret = new SearchHit<T>(entity, storedValues, storedValues);
        return ret;
    }

    /**
     * Adds all non-deleted entities to the index. Failures are logged, never thrown;
     * the writer is always closed.
     */
    private void addEntitiesToIndex(Collection<T> entities) {
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, config);
            for (T entity : entities) {
                if (!entity.isDeleted()) {
                    addEntityToIndex(writer, entity);
                }
            }
        } catch (LockObtainFailedException e) {
            LOG.error("Failed to add index entries due to Lucene lock", e);
        } catch (Exception e) {
            LOG.error("Failed to add index entries", e);
        } finally {
            closeQuietly(writer);
        }
    }

    /**
     * Highlights query matches in a field's content, if the field is among the searched fields.
     * Falls back to the unmodified content on any highlighting failure (logged).
     *
     * @return the (possibly) highlighted content; never {@code null} unless the input was
     */
    private String doHighlight(final Highlighter highlighter, final List<String> fields, final String fieldName,
            String fieldContents) throws IOException {
        String highlighted = fieldContents;
        if (fieldContents != null && fields.contains(fieldName)) {
            try {
                String[] fragments = highlighter.getBestFragments(analyzer, fieldName, fieldContents,
                        NUMBER_BEST_FRAGMENTS);
                if (fragments != null && fragments.length > 0) {
                    highlighted = LuceneUtil.withEllipsis(fragments, fieldContents);
                }
            } catch (Exception e) {
                LOG.error(MessageFormat.format("Failed to highlight search result ''{0}''", fieldContents), e);
            }
        }
        return highlighted;
    }

    /**
     * Looks up the single index document with the given UUID.
     *
     * @return the matching score doc, or {@code null} if there is no match, more than
     *         one match (which indicates index corruption and is logged), or the UUID
     *         query could not be parsed
     */
    private ScoreDoc getDocByUUID(IndexSearcher searcher, UUID uuid) throws IOException {
        Query query = null;
        try {
            QueryParser parser = new QueryParser(LUCENE_VERSION, FIELD_UUID, analyzer);
            // lowercase to match what the analyzer produced at indexing time
            query = parser.parse(StringUtils.lowerCase(uuid.toString()));
        } catch (ParseException e) {
            LOG.error(MessageFormat.format("Failed to create query from UUID {0}", uuid.toString()), e);
            return null;
        }
        // collect up to 2 hits so that duplicates can be detected
        TopScoreDocCollector collector = TopScoreDocCollector.create(2, false);
        searcher.search(query, collector);
        if (collector.getTotalHits() < 1) {
            return null;
        }
        if (collector.getTotalHits() > 1) {
            LOG.error(MessageFormat.format("Too many documents found with UUID {0}", uuid.toString()));
            return null;
        }
        ScoreDoc hit = collector.topDocs().scoreDocs[0];
        return hit;
    }

    /**
     * Removes the index documents of the given entities. A no-op before the first reindex.
     * Failures are logged, never thrown.
     */
    public synchronized void remove(final Collection<T> entities) {
        if (!initialized) {
            return;
        }
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            // open read-write so documents can be deleted
            reader = IndexReader.open(directory, false);
            searcher = new IndexSearcher(reader);
            for (EntityBase entity : entities) {
                ScoreDoc hit = getDocByUUID(searcher, entity.getUuid());
                if (hit != null) {
                    searcher.getIndexReader().deleteDocument(hit.doc);
                }
            }
        } catch (LockObtainFailedException e) {
            LOG.error("Failed to remove index entries due to Lucene lock", e);
        } catch (Exception e) {
            LOG.error("Failed to remove index entries", e);
        } finally {
            closeQuietly(searcher);
            closeQuietly(reader);
        }
    }

    /**
     * Re-indexes the given entities (remove + add). A no-op before the first reindex.
     */
    public synchronized void update(final Collection<T> entities) {
        if (!initialized) {
            return;
        }
        remove(entities);
        addEntitiesToIndex(entities);
    }

    /** Resolves the entity referenced by a document's {@link #FIELD_UUID} field. */
    private T getEntity(Document doc) {
        T ret = entityService.getByUUID(UUID.fromString(doc.get(FIELD_UUID)));
        return ret;
    }

    /**
     * Builds a {@link SearchHit} from an index document: collects all stored field values
     * and, for fields that were part of the query, a highlighted copy of those values.
     */
    private SearchHit<T> getSearchHit(final Document doc, final List<String> fields, float score,
            final Highlighter highlighter) throws IOException {
        T entity = getEntity(doc);
        Map<String, List<String>> storedValues = new HashMap<String, List<String>>();
        Map<String, List<String>> highlightedValues = new HashMap<String, List<String>>();
        for (Fieldable f : doc.getFields()) {
            if (!f.isStored()) {
                continue;
            }
            String[] values = doc.getValues(f.name());
            List<String> fieldContents = Arrays.asList(values);
            // clone so that highlighting does not mutate the stored values
            List<String> highlightedFieldContents = Arrays.asList(values.clone());
            if (fields.contains(f.name())) {
                for (int i = 0; i < highlightedFieldContents.size(); i++) {
                    highlightedFieldContents.set(i,
                            doHighlight(highlighter, fields, f.name(), highlightedFieldContents.get(i)));
                }
            }
            storedValues.put(f.name(), fieldContents);
            highlightedValues.put(f.name(), highlightedFieldContents);
        }
        SearchHit<T> ret = new SearchHit<T>(entity, storedValues, score, highlightedValues);
        return ret;
    }

    /**
     * Finds entities similar to the given one using Lucene's {@link MoreLikeThis}.
     * The base entity itself is excluded from the result.
     *
     * @param entity the entity to find similar entities for
     * @param fields the index fields to compare on
     * @param count  the maximum number of similar entities to return
     * @return the search result (empty if the index is uninitialized or the search failed)
     */
    public synchronized SearchResult<T> moreLikeThis(T entity, String[] fields, int count) {
        long start = System.nanoTime();
        SearchResult<T> moreLikeThis = new SearchResult<T>();
        List<SearchHit<T>> searchHits = new LinkedList<SearchHit<T>>();
        PagingInfo pagingInfo = new PagingInfo(0, 0);
        int totalHitCount = 0;
        if (initialized) {
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try {
                reader = IndexReader.open(directory);
                searcher = new IndexSearcher(reader);
                ScoreDoc baseDoc = getDocByUUID(searcher, entity.getUuid());
                if (baseDoc != null) {
                    MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
                    mlt.setFieldNames(fields);
                    mlt.setMinWordLen(2);
                    mlt.setBoost(true);
                    mlt.setMinDocFreq(0);
                    mlt.setMinTermFreq(0);
                    mlt.setAnalyzer(analyzer);
                    Query query = mlt.like(baseDoc.doc);
                    int numHits = Math.min(count + 1, entityService.size()); // count + 1: baseDoc will be one of the hits
                    TopScoreDocCollector collector = TopScoreDocCollector.create(numHits, false);
                    searcher.search(query, collector);
                    List<String> fieldList = Arrays.asList(fields);
                    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                    for (ScoreDoc hit : collector.topDocs().scoreDocs) {
                        if (hit.doc != baseDoc.doc) { // skip the entity we started from
                            Document doc = searcher.doc(hit.doc);
                            SearchHit<T> searchHit = getSearchHit(doc, fieldList, hit.score, highlighter);
                            searchHits.add(searchHit);
                        }
                    }
                    pagingInfo = new PagingInfo(0, count);
                    totalHitCount = collector.getTotalHits() - 1; // exclude baseDoc from the count
                }
            } catch (Exception e) {
                LOG.error(MessageFormat.format("Searching for entities similar to ''{0}'' failed", entity.getUuid()),
                        e);
            } finally {
                closeQuietly(searcher);
                closeQuietly(reader);
            }
        }
        long nanoDuration = System.nanoTime() - start;
        long milliDuration = Math.round(nanoDuration / 1000000d);
        moreLikeThis.setPagingInfo(pagingInfo);
        moreLikeThis.setResultCount(totalHitCount);
        moreLikeThis.setResult(searchHits);
        moreLikeThis.setDuration(milliDuration);
        return moreLikeThis;
    }

    /**
     * Searches the given fields with a free-form query string.
     *
     * @throws QueryParseException if the query string cannot be parsed even after escaping
     */
    public synchronized SearchResult<T> search(String[] fields, String queryString, PagingInfo pagingInfo)
            throws QueryParseException {
        SearchResult<T> ret = new SearchResult<T>();
        search(fields, null, queryString, pagingInfo, ret);
        return ret;
    }

    /**
     * Searches the given fields treating the query string as an exact phrase.
     *
     * @throws QueryParseException if the query string cannot be parsed even after escaping
     */
    public synchronized SearchResult<T> searchPhrase(String[] fields, String queryString, PagingInfo pagingInfo)
            throws QueryParseException {
        return search(fields, "\"" + queryString + "\"", pagingInfo); //$NON-NLS-1$ //$NON-NLS-2$
    }

    /**
     * Searches the given fields and additionally computes facet counts for the given facet fields.
     *
     * @throws QueryParseException if the query string cannot be parsed even after escaping
     */
    public synchronized FacetedSearchResult<T> facetedSearch(String[] fields, String[] facetFields,
            String queryString, PagingInfo pagingInfo) throws QueryParseException {
        FacetedSearchResult<T> ret = new FacetedSearchResult<T>();
        search(fields, facetFields, queryString, pagingInfo, ret);
        return ret;
    }

    /**
     * Common search implementation filling the given result object.
     * <p>
     * A {@code null} paging info defaults to the first 10 hits. A blank or {@code "*"}
     * query short-circuits to "all entities" without touching the index. Otherwise the
     * query is parsed (see {@link #getQuery(QueryParser, String)}), executed, and the
     * requested page of hits is materialized with highlighting. Lucene failures are
     * logged and produce an empty result.
     *
     * @throws QueryParseException if the query string cannot be parsed even after escaping
     */
    private <R extends SearchResult<T>> R search(final String[] fields, String[] facetFields,
            final String queryString, PagingInfo pagingInfo, R ret) throws QueryParseException {
        long start = System.nanoTime();
        List<SearchHit<T>> resultList = new LinkedList<SearchHit<T>>();
        int totalHitCount = 0;
        if (pagingInfo == null) {
            pagingInfo = new PagingInfo(0, 10);
        }
        if (StringUtils.equals("*", queryString) || StringUtils.isEmpty(queryString)) { //$NON-NLS-1$
            // "match all" shortcut: page directly over the entity service
            List<T> allEntities = entityService.getAll();
            List<T> sublist = allEntities.subList(Math.min(pagingInfo.getStart(), allEntities.size()),
                    Math.min(pagingInfo.getStart() + pagingInfo.getCount(), allEntities.size()));
            resultList.addAll(entitiesToHit(sublist));
            totalHitCount = allEntities.size();
        } else if (initialized) {
            List<String> fieldList = Arrays.asList(fields);
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try {
                reader = IndexReader.open(directory);
                searcher = new IndexSearcher(reader);
                QueryParser parser = new MultiFieldQueryParser(LUCENE_VERSION, fields, analyzer);
                Query query = getQuery(parser, queryString);
                // it is not possible that we have more hits than projects!
                int maxHits = entityService.size();
                int numHits = pagingInfo.getStart() + pagingInfo.getCount();
                if (numHits < 0 || numHits > maxHits) { // numHits < 0: integer overflow of start + count
                    numHits = maxHits;
                }
                if (numHits > 0) {
                    TopDocsCollector<ScoreDoc> collector;
                    if (facetFields == null) {
                        collector = TopScoreDocCollector.create(numHits, false);
                    } else {
                        collector = new FacetedCollector(facetFields, searcher.getIndexReader(), numHits);
                    }
                    searcher.search(query, collector);
                    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                    TopDocs topDocs = collector.topDocs(pagingInfo.getStart(), pagingInfo.getCount());
                    for (ScoreDoc hit : topDocs.scoreDocs) {
                        Document doc = searcher.doc(hit.doc);
                        SearchHit<T> searchHit = getSearchHit(doc, fieldList, hit.score, highlighter);
                        resultList.add(searchHit);
                    }
                    totalHitCount = collector.getTotalHits();
                    if (collector instanceof FacetedCollector && ret instanceof FacetedSearchResult) {
                        ((FacetedSearchResult<T>) ret).setFacetInfo(((FacetedCollector) collector).getFacetsMap());
                    }
                }
            } catch (Exception e) {
                LOG.error(MessageFormat.format("Searching with query ''{0}'' failed", queryString), e);
            } finally {
                closeQuietly(searcher);
                closeQuietly(reader);
            }
        }
        long nanoDuration = System.nanoTime() - start;
        long milliDuration = Math.round(nanoDuration / 1000000d);
        ret.setPagingInfo(pagingInfo);
        ret.setQueryString(queryString);
        ret.setResultCount(totalHitCount);
        ret.setResult(resultList);
        ret.setDuration(milliDuration);
        return ret;
    }

    /**
     * Parses the (extended, see {@link #getExtendedQuery(String)}) query string.
     * If parsing fails, retries with all Lucene special characters escaped; if that
     * fails too, gives up.
     *
     * @throws QueryParseException if even the escaped query cannot be parsed
     */
    private Query getQuery(QueryParser parser, String queryString) throws QueryParseException {
        Query query = null;
        String extendedQuery = getExtendedQuery(queryString);
        try {
            query = parser.parse(extendedQuery);
        } catch (ParseException e1) {
            // if the parsing fails escape the query string and try again
            String escapedQueryString = QueryParser.escape(queryString);
            try {
                query = parser.parse(escapedQueryString);
            } catch (ParseException ex) {
                // if that fails, too, give up
                throw new QueryParseException(ex);
            }
        }
        return query;
    }

    /**
     * Rewrites a user query so that each "simple" whitespace-separated term matches
     * exactly, as prefix, and fuzzily: {@code term} becomes {@code ("term" term* term~)}.
     * Terms containing Lucene query syntax (quotes, brackets, wildcards, boolean
     * operators such as AND/OR/NOT/TO, etc.) are passed through unchanged, as are
     * quoted phrases and bracketed range expressions.
     */
    static String getExtendedQuery(String queryString) {
        StrBuilder extendedQuery = new StrBuilder();
        if (StringUtils.isNotBlank(queryString)) {
            StrBuilder term = new StrBuilder();
            boolean isSimpleTerm = true;
            boolean insideQuotes = false;
            boolean insideBrackets = false;
            char openedBracket = '\0';
            int pos = 0;
            int len = queryString.length();
            while (pos < len) {
                char c = queryString.charAt(pos++);
                if (c == '"') {
                    isSimpleTerm = false;
                    insideQuotes = !insideQuotes;
                    term.append(c);
                } else if (c == '(' || c == '[' || c == '{') {
                    isSimpleTerm = false;
                    insideBrackets = true;
                    openedBracket = c;
                    term.append(c);
                } else if (c == ')' || c == ']' || c == '}') {
                    isSimpleTerm = false;
                    // only a matching closing bracket ends the bracketed section
                    if (c == ')' && openedBracket == '('
                            || c == ']' && openedBracket == '['
                            || c == '}' && openedBracket == '{') {
                        insideBrackets = false;
                        openedBracket = '\0';
                    }
                    term.append(c);
                } else if (insideQuotes || insideBrackets) {
                    term.append(c);
                } else if (c == '*' || c == '?' || c == '~' || c == '+' || c == '-' || c == '!' || c == ':'
                        || c == '^' || c == '|' || c == '&' || c == '\\') {
                    // Lucene special character: the term is no longer "simple"
                    isSimpleTerm = false;
                    term.append(c);
                } else if (Character.isWhitespace(c)) {
                    // term boundary: flush and reset the per-term state
                    addTerm(extendedQuery, term, isSimpleTerm);
                    isSimpleTerm = true;
                    insideQuotes = false;
                    insideBrackets = false;
                    openedBracket = '\0';
                    term.setLength(0);
                } else {
                    term.append(c);
                }
            }
            addTerm(extendedQuery, term, isSimpleTerm);
        }
        return extendedQuery.toString();
    }

    // Boolean/range operators that must never be expanded like simple terms
    private static final StrBuilder AND = new StrBuilder("AND"); //$NON-NLS-1$
    private static final StrBuilder OR = new StrBuilder("OR"); //$NON-NLS-1$
    private static final StrBuilder NOT = new StrBuilder("NOT"); //$NON-NLS-1$
    private static final StrBuilder TO = new StrBuilder("TO"); //$NON-NLS-1$

    /**
     * Appends a term to the query: simple terms are expanded to
     * {@code ("term" term* term~)} (exact OR prefix OR fuzzy), everything else
     * (including the reserved operators above) is appended verbatim.
     */
    private static void addTerm(StrBuilder query, StrBuilder term, boolean isSimpleTerm) {
        term.trim();
        if (term.length() > 0) {
            if (query.length() > 0) {
                query.append(' ');
            }
            if (term.equals(AND) || term.equals(OR) || term.equals(NOT) || term.equals(TO)) {
                isSimpleTerm = false;
            }
            if (isSimpleTerm) {
                query.append('(');
                query.append('"').append(term).append('"');
                query.append(' ').append(term).append('*');
                query.append(' ').append(term).append('~');
                query.append(')');
            } else {
                query.append(term);
            }
        }
    }

    /** Closes the given resource, logging (but not propagating) any {@link IOException}. */
    private void closeQuietly(Closeable closable) {
        try {
            if (closable != null) {
                closable.close();
            }
        } catch (IOException e) {
            LOG.error(MessageFormat.format("Failed to close {0}", closable.getClass().getName()), e);
        }
    }
}