Java tutorial
/* * Copyright (c) 2013. EMBL, European Bioinformatics Institute * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package uk.ac.ebi.mdk.service.query; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Version; import uk.ac.ebi.mdk.domain.identifier.Identifier; import uk.ac.ebi.mdk.service.AbstractService; import uk.ac.ebi.mdk.service.index.LuceneIndex; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** * AbstractLuceneService - 23.02.2012 <br/> * <p/> * Provides a base for which other lucene query services can build upon. This * class provides a lot of utility methods for building queries, accessing score * docs and field values. * * @author johnmay * @author $Author$ (this version) * @version $Rev$ */ public abstract class AbstractLuceneService<I extends Identifier> extends AbstractService<I> implements QueryService<I> { private static final Logger LOGGER = Logger.getLogger(AbstractLuceneService.class); private final Document EMPTY_DOCUMENT = new Document(); private static final int DEFAULT_CACHE_SIZE = 200; private int cacheSize; private Map<Integer, Document> documents = new LinkedHashMap<Integer, Document>() { @Override protected boolean removeEldestEntry(Map.Entry<Integer, Document> eldest) { return size() > cacheSize; } }; private Directory directory; private Analyzer analyzer; private IndexReader reader; private IndexSearcher searcher; private LuceneIndex index; private Map<String, QueryParser> parserMap = new HashMap<String, QueryParser>(); public AbstractLuceneService(LuceneIndex index) { this(index, DEFAULT_CACHE_SIZE); } public AbstractLuceneService(LuceneIndex index, int cacheSize) { this.index = index; this.cacheSize = cacheSize; } /** * Access the search used by the service * * @return */ public IndexSearcher getSearcher() { return searcher; } @Override public ServiceType getServiceType() { return ServiceType.LUCENE_INDEX; } /** * Set the directory for the index, this will also open the index reader * * @param directory * * @throws IOException */ public void setDirectory(Directory directory) throws IOException { if (reader != null) { reader.close(); } this.directory = directory; reader = IndexReader.open(directory, true); } /** * Set the anlayzer for the index * * @param analyzer */ public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } /** * @inheritDoc */ @Override public boolean startup() { if (index == null || !index.isAvailable()) { return false; } if (analyzer != null && directory != null && searcher != null) { return true; } try { analyzer = index.getAnalyzer(); setDirectory(index.getDirectory()); searcher = new IndexSearcher(directory, true); } catch (IOException ex) { LOGGER.error("startup() failed: " + ex.getMessage()); } return directory != null && analyzer != null && index != null && searcher != null; } /** * Access the analyzer for this service * * @return instance of the analyzer (or null if initialization failed) */ public Analyzer getAnalyzer() { return analyzer; } /** * Convenience method to access the first score document for a given query. * If multiple documents are found then an warning is logged. * * @param query search-able query * * @return the first score document for the query */ public ScoreDoc first(Query query) { ScoreDoc[] scoreDocs = search(query, TopScoreDocCollector.create(5, true)); if (scoreDocs.length > 1) { LOGGER.warn("Expected a single hit for " + query); } return scoreDocs.length > 0 ? scoreDocs[0] : null; } /** * Search the index with the provided query. A new TopScoreDocCollector is * created using and constrained using the value of {@see getMaxResults()}. * If an exception occurs an empty array of ScoreDoc's is returned. * * @param query search-able query * * @return the score documents for the query */ public ScoreDoc[] search(Query query) { return search(query, TopScoreDocCollector.create(getMaxResults(), true)); } /** * Search the index with the provided query and TopScoreDocCollector. If an * exception occurs an empty array of ScoreDoc's is returned * * @param query search-able query * @param collector the TopScoreDocCollector to use * * @return the score documents */ public ScoreDoc[] search(Query query, TopScoreDocCollector collector) { try { searcher.search(query, collector); return collector.topDocs().scoreDocs; } catch (IOException ex) { LOGGER.warn("Unable to search"); } return new ScoreDoc[0]; } /** * Access the binary value of the field with the given name. If multiple * value's exist this method will return the first value added. * * @param scoreDoc scored document to access the value for * @param field name of the field to retrieve the value from * * @return binary value of the field * * @throws IOException */ public byte[] binaryValue(ScoreDoc scoreDoc, String field) throws IOException { return getDocument(scoreDoc).getBinaryValue(field); } /** * Access the string values of the field with the given name. If multiple * value's exist this method will return the first value added. * * @param scoreDoc scored document to access the value for * @param field name of the field to retrieve the value from * * @return string values of the field * * @throws IOException */ public byte[][] binaryValues(ScoreDoc scoreDoc, String field) throws IOException { return getDocument(scoreDoc).getBinaryValues(field); } /** * Access the string value of the field with the given name. If multiple * value's exist this method will return the first value added. * * @param scoreDoc scored document to access the value for * @param field name of the field to retrieve the value from * * @return string value of the field * * @throws IOException */ public String value(ScoreDoc scoreDoc, String field) throws IOException { return getDocument(scoreDoc).get(field); } /** * Access the string values of the field with the given name. * * @param scoreDoc scored document to access the value for * @param field name of the field to retrieve the value from * * @return string values of the field * * @throws IOException */ public String[] values(ScoreDoc scoreDoc, String field) throws IOException { return getDocument(scoreDoc).getValues(field); } /** * Access the document corresponding to the provided score doc. this method * buffers access to the index reader allowing simple document retrieval * * @param document the score to to retrieve the {@see Document} for * * @return instance of the Document for the provided ScoreDoc * * @throws IOException thrown if no document was found */ public Document getDocument(ScoreDoc document) throws IOException { // return an empty document for null score docs if (document == null) return EMPTY_DOCUMENT; Integer index = document.doc; if (!documents.containsKey(index)) { documents.put(index, reader.document(index)); } return documents.get(index); } /** * Access the query parse for the specified term * * @param term the field to get the parser for * * @return query parser for the given term */ public QueryParser getParser(Term term) { return getParser(term.field()); } /** * Access the query parse for the specified field * * @param field the field to get the parser for * * @return query parser for the given field */ public QueryParser getParser(String field) { if (!parserMap.containsKey(field)) { parserMap.put(field, new QueryParser(Version.LUCENE_34, field, analyzer)); } return parserMap.get(field); } /** * Parse the query for the given term/field. This method will use the * appropriate query parser to construct a query from the 'query' parameter. * This allows flexibility when creating a query and parsing of more complex * searches: * <p/> * <pre>{@code * // variable length query * parse("start of a name*", NameService.NAME); * }</pre> * * @param query string query * @param term the field to search * * @return searchable query */ public Query parse(String query, Term term) { QueryParser parser = getParser(term); try { return parser.parse(query); } catch (ParseException ex) { LOGGER.error("Could not parse query " + query, ex); } return new TermQuery(term.createTerm(query)); } /** * Construct a non-approximate query without using the QueryParser. This is * useful when you want to search an field that is analyzed and maintain * space's. The token stream is converted into a boolean 'Must Occur' query. * For most simple queries this method can be used. * * @param text text to construct the query for * @param term the field to search the text in * * @return searchable query */ public Query construct(String text, Term term) { return construct(text, term, false); } /** * Construct a query without using the QueryParser. This is useful when you * want to search an field that is analyzed and maintain space's. The token * stream is converted into a boolean 'Must Occur' query. For most simple * queries this method can be used. The approximate flag allows construction * of approximate {@see FuzzyMatch} queries for each token. The similarity * for the fuzzy match can be set via the {@see setMinSimilarity(float)} * method. * * @param text text to construct the query for * @param term the field to search the text in * @param approximate whether to use approximate search * * @return searchable query */ public Query construct(String text, Term term, boolean approximate) { StringReader reader = new StringReader(text); TokenStream stream = analyzer.tokenStream(term.field(), reader); BooleanQuery query = new BooleanQuery(); CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); try { while (stream.incrementToken()) { Term termToken = term.createTerm(termAttribute.toString()); Query subQuery = approximate ? new FuzzyQuery(termToken, getMinSimilarity()) : new TermQuery(termToken); query.add(subQuery, BooleanClause.Occur.MUST); } } catch (IOException ex) { LOGGER.error("Could not constructing query ", ex); } return query; } public Query construct(String text, boolean approximate, Term... terms) { return construct(text, terms, approximate); } public Query construct(String text, Term[] terms, boolean approximate) { BooleanQuery query = new BooleanQuery(false); for (Term term : terms) { StringReader reader = new StringReader(text); TokenStream stream = analyzer.tokenStream(term.field(), reader); CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); BooleanQuery fieldQuery = new BooleanQuery(false); try { while (stream.incrementToken()) { Term termToken = term.createTerm(termAttribute.toString()); Query subQuery = approximate ? new FuzzyQuery(termToken, getMinSimilarity()) : new TermQuery(termToken); fieldQuery.add(subQuery, BooleanClause.Occur.MUST); } } catch (IOException ex) { LOGGER.error("Could not constructing query ", ex); } query.add(fieldQuery, BooleanClause.Occur.SHOULD); } return query; } /** * Convenience method that allows retrieval of the a value in the given * term/field for the provided identifier. An example would be accessing the * preferred name for an identifier. * <p/> * <pre>{@code * return firstValue(identifier, PreferredNameService.PREFERRED_NAME); * }</pre> * * @param identifier identifier to search for * @param term the term/field to access * * @return value stored in the field for the identifier (empty string it not * found) */ public String firstValue(Identifier identifier, Term term) { return firstValue(construct(identifier.getAccession(), IDENTIFIER), term.field()); } /** * Convenience method that allows retrieval of the all values in the given * term for the provided identifier. An example would be accessing all * synonyms for an identifier. * <p/> * <pre>{@code * return firstValues(identifier, SynonymService.SYNONYM); * }</pre> * * @param identifier identifier to search for * @param term the term/field to access * * @return collection of values */ public Collection<String> firstValues(Identifier identifier, Term term) { return firstValues(construct(identifier.getAccession(), IDENTIFIER), term.field()); } /** * Convenience method to access the value for the specified field in the * first document returned by the query. If no values are found an empty * array is returned. If multiple are found the first is returned. * * @param query the search query * @param term field to access * * @return binary value for specified field in the first document */ public byte[] firstBinaryValue(Query query, Term term) { return firstBinaryValue(query, term.field()); } /** * Access the value for the specified field in the first document returned * by the query. If no values are found an empty array is returned. If * multiple are found the first is returned. * * @param query the search query * @param field field to access * * @return binary value for specified field in the first document */ public byte[] firstBinaryValue(Query query, String field) { ScoreDoc scoreDoc = first(query); try { byte[] value = binaryValue(scoreDoc, field); if (value != null) return value; } catch (Exception ex) { LOGGER.error("Could not access field value " + field + " in service " + getClass() + " cause: " + ex.getCause() + " message: " + ex.getMessage()); } return new byte[0]; } /** * Access the value for the specified field in the first document returned * by the query. If no values are found an empty string is returned. If * multiple are found the first is returned. * * @param query the search query * @param term term to access * * @return value for specified field in the first document */ public String firstValue(Query query, Term term) { return firstValue(query, term.field()); } /** * Access the value for the specified field in the first document returned * by the query. If no values are found an empty string is returned. If * multiple are found the first is returned. * * @param query the search query * @param field field to access * * @return value for specified field in the first document */ public String firstValue(Query query, String field) { ScoreDoc scoreDoc = first(query); try { String value = value(scoreDoc, field); if (value != null) return value; } catch (Exception ex) { LOGGER.error("Could not access field value " + field + " in service " + getClass(), ex); } return ""; } /** * Convenience method to access the all-values for the specified field in * the first document returned by the query. If no values are found an empty * collection is returned. * * @param query the search query * @param term term to access * * @return all values for specified field in the first document */ public Collection<String> firstValues(Query query, Term term) { return firstValues(query, term.field()); } /** * Access the all-values for the specified field in the first document * returned by the query. If no values are found an empty collection is * returned. * * @param query the search query * @param field field to access * * @return all values for specified field in the first document */ public Collection<String> firstValues(Query query, String field) { ScoreDoc scoreDoc = first(query); List<String> values = new NonNullList<String>(5); if (scoreDoc == null) return values; try { for (String value : values(scoreDoc, field)) { values.add(value); } } catch (Exception ex) { LOGGER.error("Could not access field value " + field + " in service " + getClass(), ex); } return values; } /** * Convenience method to access the values for the specified field for all * score documents returned by the query. If multiple values exists for the * term only the first value will be returned. To access multi-value fields * for each document please use {@see allValues(Query, Term)}.Null values * are suppressed and not returned, if no values match and empty collection * is returned. * * @param query search query * @param term the term to access the value for * * @return ranked list of the first value of the specified field for each * document returned by the query */ public Collection<String> values(Query query, Term term) { return values(query, term.field()); } /** * Access the values for the specified field for all score documents * returned by the query. If multiple values exists for the field only the * first value will be returned. To access multi value fields for each * document please use {@see allValues(Query, Term)}. Null values are * suppressed and not returned, if no values match and empty collection is * returned. * * @param query search query * @param field the field to access the value for * * @return ranked list of the first value of the specified field for each * document returned by the query */ public Collection<String> values(Query query, String field) { ScoreDoc[] scoreDocs = search(query); List<String> values = new NonNullList<String>(scoreDocs.length); try { for (ScoreDoc scoreDoc : scoreDocs) { values.add(value(scoreDoc, field)); } } catch (Exception ex) { LOGGER.error("Could not access field value " + field + " in service " + getClass(), ex); } return values; } /** * Convenience method to access using a given term. This method allows * access to the values for the specified term. If multiple values exists * all values for that term will be added to the returned collection. Null * values are suppressed and not returned, if no values match and empty * collection is returned. * * @param query search query * @param term the field term to retrieve the values for * * @return Aggregated collection of values from single or multi-value * fields */ public Collection<String> allValues(Query query, Term term) { return allValues(query, term.field()); } /** * Access the values for the specified field. If multiple values exists all * values for that field will be added to the returned collection. Null * values are suppressed and not returned, if no values match and empty * collection is returned. * * @param query search query * @param field the field to retrieve the values for * * @return Aggregated collection of values from single or multi-value * fields */ public Collection<String> allValues(Query query, String field) { ScoreDoc[] scoreDocs = search(query); List<String> values = new NonNullList<String>(scoreDocs.length); try { for (ScoreDoc scoreDoc : scoreDocs) { for (String value : values(scoreDoc, field)) { values.add(value); } } } catch (Exception ex) { LOGGER.error("Could not access field value " + field + " in service " + getClass(), ex); } return values; } /** * Access a collection of identifiers for the given query. If you have * duplicate identifier fields in your index this method's return may * contain duplicate also. The identifier's are constructed from the {@see * QueryService#IDENTIFIER} field an subsequently if that field is missing * an empty set is returned. * * @param query the query to search for * * @return Ranked list of identifiers for the given query */ public Collection<I> getIdentifiers(Query query) { Collection<I> identifiers = new ArrayList<I>(); I base = getIdentifier(); for (String value : values(query, IDENTIFIER)) { I identifier = (I) base.newInstance(); identifier.setAccession(value); identifiers.add(identifier); } return identifiers; } /** * Simple utility class that will suppress null elements. An alternative * would be the Constraints available in Guava but that implementation * throws an NullPointerException if an null is attempted to be added * * @param <O> */ class NonNullList<O> extends ArrayList<O> { public NonNullList() { super(); } public NonNullList(int capacity) { super(capacity); } @Override public boolean add(O o) { if (o != null) return super.add(o); return false; } @Override public boolean addAll(Collection<? extends O> c) { boolean changed = false; for (O o : c) changed = add(o) || changed; return changed; } } /** * Emties document cached */ @Override public void renew() { super.renew(); documents.clear(); } }