List of usage examples for org.apache.lucene.search.highlight Highlighter getBestFragments
public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments, String separator) throws IOException, InvalidTokenOffsetsException
From source file:org.jboss.seam.wiki.core.search.metamodel.SearchSupport.java
License:LGPL
/** * Returns the hits of the given query as fragments, highlighted, concatenated, and separated. * <p>//from ww w . ja v a2 s . co m * Pass in a <tt>NullFragmenter</tt> if you don't want any fragmentation by terms but * simply the hits highlighted. Otherwise, you will most likely use <tt>SimpleFragmenter</tt>. * The text you supply must be the same that was indexed, it will go through the same * analysis procedure to find the hits. Do not pass a different String than the one indexed * by Hibernate Search! If you use transparent string bridge with Hibernate Search, run the * bridge before passing the string into this method. * <p> * This method escapes any dangerous HTML characters in the indexed text and fragments by * replacing it with HTML entities. You can use the returned string directly to build a * <tt>SearchHit</tt>. * * @param query the query that produced hits * @param fragmenter a fragmenter that can split the indexed text * @param indexedText the original text that was analyzed and indexed by Hibernate Search (after any bridges!) * @param numOfFragments the number of fragments to include in the returned result * @param alternativeLength if there are no hits to highlight, how many characters of the original text to return * @return the fragmented, highglighted, and then concatenated substring of the indexed text */ protected String escapeBestFragments(Query query, Fragmenter fragmenter, String indexedText, int numOfFragments, int alternativeLength) { // The HTML escaping forces us to first fragment with internal placeholders... Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(INTERNAL_BEGIN_HIT, INTERNAL_END_HIT), new QueryScorer(query)); highlighter.setTextFragmenter(fragmenter); try { // Use the same analyzer as the indexer! 
TokenStream tokenStream = new StandardAnalyzer().tokenStream(null, new StringReader(indexedText)); String unescapedFragements = highlighter.getBestFragments(tokenStream, indexedText, numOfFragments, getFragmentSeparator()); String escapedFragments = WikiUtil.escapeHtml(WikiUtil.removeMacros(unescapedFragements), false, false); // .. and then replace the internal placeholders with real tags after HTML has been escaped escapedFragments = escapedFragments.replaceAll(INTERNAL_BEGIN_HIT, getBeginHitTag()); escapedFragments = escapedFragments.replaceAll(INTERNAL_END_HIT, getEndHitTag()); // Strip out macros // If no fragments were produced (no hits), return the original text as an alternative if (escapedFragments.length() == 0 && alternativeLength != 0) { return WikiUtil.escapeHtml(WikiUtil.removeMacros(indexedText.substring(0, indexedText.length() > alternativeLength ? alternativeLength : indexedText.length())), false, false); } else if (escapedFragments.length() == 0 && alternativeLength == 0) { return WikiUtil.escapeHtml(WikiUtil.removeMacros(indexedText), false, false); } return escapedFragments; } catch (Exception ex) { throw new RuntimeException(ex); } }
From source file:org.mskcc.pathdb.lucene.LuceneResults.java
License:Open Source License
/** * Grabs fragment of lucene field that matches query term & highlights term. * Method traverses the lucene fields indexed for match. If match is not found * null is returned./* w ww. jav a 2s .c o m*/ * * @param doc Lucene Document * @param highLighter QueryHighlightExtractor * @return String * @throws IOException */ private String getFragment(Document doc, Highlighter highLighter, String term) throws IOException { String[] fields = { LuceneConfig.FIELD_ALL, LuceneConfig.FIELD_SYNONYMS, LuceneConfig.FIELD_EXTERNAL_REFS }; for (String fieldName : fields) { // Get the Field of Interest Field field = doc.getField(fieldName); // Create the Token Stream TokenStream tokenStream = new StandardAnalyzer().tokenStream(LuceneConfig.FIELD_ALL, new StringReader(field.stringValue())); // Get the Best Fragment String formattedText = highLighter.getBestFragments(tokenStream, field.stringValue(), 5, "..."); if (formattedText != null && formattedText.length() > 0) { return formattedText; } } // made it here, assume descendent ? return null; }
From source file:org.mskcc.pathdb.tool.QueryFullText.java
License:Open Source License
/** * Executes Full Text Query.//from ww w. j a v a 2s. c o m * * @param term Search Term * @throws QueryException Lucene Query Error * @throws IOException I/O Error * @throws ParseException Lucene Parsing Error */ public static void queryFullText(String term) throws QueryException, IOException, ParseException { System.out.println("Using search term: " + term); LuceneReader luceneReader = new LuceneReader(); Hits hits = luceneReader.executeQuery(term); int num = Math.min(10, hits.length()); System.out.println("Total Number of Hits: " + hits.length()); if (hits.length() > 0) { // Standard Analyzer to extract words using a list of English stop words. StandardAnalyzer analyzer = new StandardAnalyzer(); // Standard Query Parser QueryParser queryParser = new QueryParser(LuceneConfig.FIELD_ALL, analyzer); // for the usage of highlighting with wildcards // Necessary to expand search terms IndexReader reader = IndexReader.open(new File(LuceneConfig.getLuceneDirectory())); Query luceneQuery = queryParser.parse(term); luceneQuery = luceneQuery.rewrite(reader); // Scorer implementation which scores text fragments by the number of // unique query terms found. QueryScorer queryScorer = new QueryScorer(luceneQuery); // HTML Formatted surrounds matching text with <B></B> tags. 
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); // Highligher Class Highlighter highLighter = new Highlighter(htmlFormatter, queryScorer); // XXX Characters Max in Each Fragment Fragmenter fragmenter = new SimpleFragmenter(100); highLighter.setTextFragmenter(fragmenter); System.out.println("Showing hits: 0-" + (num - 1)); for (int i = 0; i < num; i++) { System.out.print("Hit " + i + ": "); // Get the Matching Hit Document doc = hits.doc(i); // Get the Field of Interest Field field = doc.getField(LuceneConfig.FIELD_ALL); // Create the Token Stream TokenStream tokenStream = new StandardAnalyzer().tokenStream(LuceneConfig.FIELD_ALL, new StringReader(field.stringValue())); // Get the Best Fragment String formattedText = highLighter.getBestFragments(tokenStream, field.stringValue(), 5, "..."); System.out.println(formattedText); } } }
From source file:org.olat.search.service.searcher.SearchResultsImpl.java
License:Apache License
/** * Highlight (bold,color) query words in result-document. Set HighlightResult for content or description. * /*from www.j a va 2 s . c o m*/ * @param query * @param analyzer * @param doc * @param resultDocument * @throws IOException */ private void doHighlight(final Query query, final Analyzer analyzer, final Document doc, final ResultDocument resultDocument) throws IOException { final Highlighter highlighter = new Highlighter( new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG), new QueryScorer(query)); // Get 3 best fragments of content and seperate with a "..." try { // highlight content final String content = doc.get(AbstractOlatDocument.CONTENT_FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(AbstractOlatDocument.CONTENT_FIELD_NAME, new StringReader(content)); String highlightResult = highlighter.getBestFragments(tokenStream, content, 3, HIGHLIGHT_SEPARATOR); // if no highlightResult is in content => look in description if (highlightResult.length() == 0) { final String description = doc.get(AbstractOlatDocument.DESCRIPTION_FIELD_NAME); tokenStream = analyzer.tokenStream(AbstractOlatDocument.DESCRIPTION_FIELD_NAME, new StringReader(description)); highlightResult = highlighter.getBestFragments(tokenStream, description, 3, HIGHLIGHT_SEPARATOR); resultDocument.setHighlightingDescription(true); } resultDocument.setHighlightResult(highlightResult); // highlight title final String title = doc.get(AbstractOlatDocument.TITLE_FIELD_NAME); tokenStream = analyzer.tokenStream(AbstractOlatDocument.TITLE_FIELD_NAME, new StringReader(title)); final String highlightTitle = highlighter.getBestFragments(tokenStream, title, 3, " "); resultDocument.setHighlightTitle(highlightTitle); } catch (final InvalidTokenOffsetsException e) { log.warn("", e); } }
From source file:org.opencms.search.documents.CmsTermHighlighterHtml.java
License:Open Source License
/** * @see org.opencms.search.documents.I_CmsTermHighlighter#getExcerpt(org.apache.lucene.document.Document, org.opencms.search.CmsSearchIndex, org.opencms.search.CmsSearchParameters, org.apache.lucene.search.Query, org.apache.lucene.analysis.Analyzer) *//*w ww .j a v a 2 s . co m*/ public String getExcerpt(Document doc, CmsSearchIndex index, CmsSearchParameters params, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException { if ((doc == null) || (index == null) || (params == null) || (analyzer == null) || (query == null)) { return null; } Highlighter highlighter = null; Iterator<String> excerptFieldNames = index.getFieldConfiguration().getExcerptFieldNames().iterator(); StringBuffer excerptBuffer = new StringBuffer(); while (excerptFieldNames.hasNext()) { String fieldName = excerptFieldNames.next(); boolean createExcerpt = !params.isExcerptOnlySearchedFields() || params.getFields().contains(fieldName); if (createExcerpt && (doc.getFieldable(fieldName) != null)) { // only generate field excerpt if the field is available in the document String text = doc.getFieldable(fieldName).stringValue(); // make sure all XML in the text is escaped, otherwise excerpt HTML output may be garbled text = CmsEncoder.escapeXml(text); TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text)); if (params.isExcerptOnlySearchedFields()) { // highlight the search query only in the matching fields highlighter = new Highlighter(new QueryScorer(query, fieldName)); } else { // highlight search query in all fields if (highlighter == null) { highlighter = new Highlighter(new QueryScorer(query)); } } String fragment = highlighter.getBestFragments(stream, text, EXCERPT_REQUIRED_FRAGMENTS, EXCERPT_FRAGMENT_SEPARATOR); // kill all unwanted chars in the excerpt fragment = fragment.replace('\t', ' '); fragment = fragment.replace('\n', ' '); fragment = fragment.replace('\r', ' '); fragment = fragment.replace('\f', ' '); if (excerptBuffer.length() > 0) { // 
this is not the first fragment excerptBuffer.append(EXCERPT_FRAGMENT_SEPARATOR); } excerptBuffer.append(fragment); } } String result = null; if (excerptBuffer.length() > 0) { result = excerptBuffer.toString(); } int maxLength = OpenCms.getSearchManager().getMaxExcerptLength(); if ((result != null) && (result.length() > maxLength)) { result = result.substring(0, maxLength); } return result; }
From source file:org.openrdf.sail.lucene.LuceneQueryIterator.java
License:BSD License
/**
 * Tries to find the next bindings set (results iterator) if there is none currently.
 * It prepares the next permutation of hits, binds the respective variables and
 * evaluates the query tree against the underlying sail. The results iterator is then
 * stored as this.nextBindingSets. If this method fails to provide a next bindings set,
 * it returns false.
 *
 * @return true if it succeeded, false otherwise
 */
private boolean findNextBindingSets() {
    // if there is still a next bindings set, we can safely return
    if (this.nextBindingSets != null)
        return true;

    // check if more permutations are available
    if (this.permutations.isInvalid())
        return false;

    // get the current permutation and the queries
    Vector<Integer> permutation = this.permutations.val();
    Iterator<QuerySpec> queries = this.queries.iterator();

    // this takes the new bindings (field, not a local — reset for this permutation)
    derivedBindings = new QueryBindingSet();

    // for each digit of the permutation: each digit selects one hit of the query
    // spec at the same position
    for (Integer id : permutation) {
        // get the respective query (the query this digit stands for)
        if (!queries.hasNext()) {
            log.warn("There are more permutation digits then there are query specs!");
            return false; // TODO: do we want to return true or false here?
        }
        QuerySpec query = queries.next();

        // if no hits are available, this binding set failed (digit 0 == "no hit")
        if (id <= 0)
            return false;

        // get the hit indicated by the digit value (digits are 1-based)
        Document doc = getDoc(query, id - 1);
        if (doc == null)
            return false; // TODO: do we want to return true or false here?

        // get the score of the hit
        float score = getScore(query, id - 1);

        // bind the respective variables
        String matchVar = query.getMatchesVariableName();
        if (matchVar != null) {
            Resource resource = this.index.getResource(doc);
            Value existing = derivedBindings.getValue(matchVar);
            // if the existing binding contradicts the current binding, than we can
            // safely skip this permutation
            if ((existing != null) && (!existing.stringValue().equals(resource.stringValue()))) {
                // invalidate the binding
                derivedBindings = null;
                // and exit the loop
                // NOTE(review): after this break, derivedBindings is null, yet
                // evaluateBindings.addAll(derivedBindings) below is still executed —
                // looks like a latent NPE path; verify against the full class.
                break;
            }
            derivedBindings.addBinding(matchVar, resource);
        }

        if ((query.getScoreVariableName() != null) && (score > 0.0f))
            derivedBindings.addBinding(query.getScoreVariableName(), scoreToLiteral(score));

        if (query.getSnippetVariableName() != null) {
            // get the highlighter of this query
            Highlighter highlighter = this.highlighters.get(query);
            if (highlighter != null) {
                // extract snippets from Lucene's query results
                StringBuffer result = new StringBuffer();

                // limit to the queried field, if there was one
                String fieldname = LuceneIndex.TEXT_FIELD_NAME;
                if (query.getPropertyURI() != null)
                    fieldname = query.getPropertyURI().toString();
                Field[] fields = doc.getFields(fieldname);

                int lastLen = 0;
                for (Field field : fields) {
                    String text = field.stringValue();
                    // NOTE(review): fields are fetched by 'fieldname', but the token
                    // stream is created under TEXT_FIELD_NAME — possibly intentional
                    // (single analyzer), possibly a copy/paste slip; confirm.
                    TokenStream tokenStream = this.index.getAnalyzer()
                            .tokenStream(LuceneIndex.TEXT_FIELD_NAME, new StringReader(text));
                    String next = "";
                    try {
                        next = highlighter.getBestFragments(tokenStream, text, 2, "...");
                    } catch (IOException e) {
                        log.error("IOException while getting snippet for filed " + field.name()
                                + " for query\n" + query, e);
                        continue;
                    } catch (InvalidTokenOffsetsException e) {
                        log.error("InvalidTokenOffsetsException while getting snippet for filed "
                                + field.name() + " for query\n" + query, e);
                        continue;
                    }
                    // concatenate non-empty snippets of different fields with "..."
                    if (next.length() > 0) {
                        if (lastLen > 0) {
                            result.append("...");
                        }
                        lastLen = next.length();
                        result.append(next);
                    }
                }
                derivedBindings.addBinding(query.getSnippetVariableName(), new LiteralImpl(result.toString()));
            } else {
                log.warn(
                        "Lucene Query requests snippet, but no highlighter was generated for it, no snippets will be generated!\n{}",
                        query);
            }
        }
    }

    // the derived bindings are used to extend the results of the following evaluation
    // (the results do not contain the given bindings). The bindings given to the
    // LuceneSail shall not be included in its results, so we add them here, but won't
    // include them in the results.
    // NOTE(review): evaluateBindings is built but 'derivedBindings' (not
    // evaluateBindings) is what gets passed to evaluate() below — confirm intent.
    QueryBindingSet evaluateBindings = new QueryBindingSet(this.bindings);
    evaluateBindings.addAll(derivedBindings);

    // finally, evaluate the bindings against the underlying store
    try {
        if (derivedBindings != null) {
            // 'query' here refers to a member of the enclosing class (the loop-local
            // QuerySpec is out of scope at this point)
            this.nextBindingSets = this.sailConn.evaluate(query, derivedBindings, includeInferred);
        }
    } catch (Exception e) {
        log.error("Provided sail connection could not evaluate tuple expression!", e);
        return false; // TODO: do we want to return true or false here?
    }

    // go to the next permutation; if this was the last one, invalidate the permutation
    // instance, which will be checked at the beginning of the next call of the
    // findNextBindingSets method
    if (this.permutations.next()) {
        this.permutations.invalidate();
    }

    // we succeeded
    return true;
}
From source file:org.openrdf.sail.lucene.LuceneSailConnection.java
License:BSD License
/**
 * This method generates bindings from the given result of a Lucene query.
 *
 * @param query
 *        the Lucene query
 * @param hits
 *        the query result
 * @param highlighter
 *        a Highlighter for the query
 * @return a LinkedHashSet containing generated bindings
 * @throws SailException
 */
private LinkedHashSet<BindingSet> generateBindingSets(QuerySpec query, TopDocs hits, Highlighter highlighter)
        throws SailException {
    // Since one resource can be returned many times, it can lead now to multiple
    // occurrences of the same binding tuple in the BINDINGS clause. This in turn leads
    // to duplicate answers in the original SPARQL query. We want to avoid this, so
    // BindingSets added to the result must be unique.
    LinkedHashSet<BindingSet> bindingSets = new LinkedHashSet<BindingSet>();

    // for each hit ...
    ScoreDoc[] docs = hits.scoreDocs;
    for (int i = 0; i < docs.length; i++) {
        // this takes the new bindings
        QueryBindingSet derivedBindings = new QueryBindingSet();

        // get the current hit
        int docId = docs[i].doc;
        Document doc = getDoc(docId);
        if (doc == null)
            continue;

        // get the score of the hit
        float score = docs[i].score;

        // bind the respective variables
        String matchVar = query.getMatchesVariableName();
        if (matchVar != null) {
            try {
                Resource resource = this.luceneIndex.getResource(doc);
                Value existing = derivedBindings.getValue(matchVar);
                // if the existing binding contradicts the current binding, than
                // we can safely skip this permutation
                // NOTE(review): derivedBindings is freshly created per hit, so
                // 'existing' is always null here — this guard looks vestigial.
                if ((existing != null) && (!existing.stringValue().equals(resource.stringValue()))) {
                    // invalidate the binding
                    derivedBindings = null;
                    // and exit the loop
                    break;
                }
                derivedBindings.addBinding(matchVar, resource);
            } catch (NullPointerException e) {
                SailException e1 = new SailException(
                        "NullPointerException when retrieving a resource from LuceneSail. Possible cause is the obsolete index structure. Re-creating the index can help",
                        e);
                logger.error(e1.getMessage());
                logger.debug("Details: ", e);
                throw e1;
            }
        }

        if ((query.getScoreVariableName() != null) && (score > 0.0f))
            derivedBindings.addBinding(query.getScoreVariableName(), scoreToLiteral(score));

        if (query.getSnippetVariableName() != null) {
            if (highlighter != null) {
                // limit to the queried field, if there was one
                Fieldable[] fields;
                if (query.getPropertyURI() != null) {
                    String fieldname = query.getPropertyURI().toString();
                    fields = doc.getFieldables(fieldname);
                } else {
                    fields = this.luceneIndex.getPropertyFields(doc.getFields());
                }

                // extract snippets from Lucene's query results
                for (Fieldable field : fields) {
                    // create an individual binding set for each snippet
                    QueryBindingSet snippetBindings = new QueryBindingSet(derivedBindings);

                    String text = field.stringValue();
                    TokenStream tokenStream = this.luceneIndex.getAnalyzer().tokenStream(field.name(),
                            new StringReader(text));

                    String fragments = null;
                    try {
                        fragments = highlighter.getBestFragments(tokenStream, text, 2, "...");
                    } catch (Exception e) {
                        logger.error("Exception while getting snippet for filed " + field.name()
                                + " for query\n" + query, e);
                        continue;
                    }

                    if (fragments != null && !fragments.isEmpty()) {
                        snippetBindings.addBinding(query.getSnippetVariableName(), new SimpleLiteral(fragments));
                        // additionally bind the matched property name when requested
                        // and no explicit property was queried
                        if (query.getPropertyVariableName() != null && query.getPropertyURI() == null) {
                            snippetBindings.addBinding(query.getPropertyVariableName(), new SimpleIRI(field.name()));
                        }
                        bindingSets.add(snippetBindings);
                    }
                }
            } else {
                logger.warn(
                        "Lucene Query requests snippet, but no highlighter was generated for it, no snippets will be generated!\n{}",
                        query);
                bindingSets.add(derivedBindings);
            }
        } else {
            bindingSets.add(derivedBindings);
        }
    }

    // we succeeded
    return bindingSets;
}
From source file:org.openrdf.sail.lucene3.LuceneIndex.java
License:BSD License
public String getSnippet(String fieldName, String text, Highlighter highlighter) { String snippet;// w w w . jav a2s . co m try { TokenStream tokenStream = getAnalyzer().tokenStream(fieldName, new StringReader(text)); snippet = highlighter.getBestFragments(tokenStream, text, 2, "..."); } catch (Exception e) { logger.error("Exception while getting snippet for field " + fieldName, e); snippet = null; } return snippet; }
From source file:org.paxle.se.index.lucene.impl.SnippetFetcher.java
License:Open Source License
public String getSnippet(Query query, String locationStr) { Reader textReader = null;/*from ww w. j a v a 2 s . c om*/ try { // creating a dummy command URI locationURI = URI.create(locationStr); ICommand cmd = this.docFactory.createDocument(ICommand.class); cmd.setLocation(locationURI); // crawling the resource this.crawler.process(cmd); if (cmd.getResult() != Result.Passed) return null; // parsing the resource this.parser.process(cmd); if (cmd.getResult() != Result.Passed) return null; // trying to get the parsed content IParserDocument pdoc = cmd.getParserDocument(); if (pdoc == null) return null; else if (pdoc.getStatus() != Status.OK) return null; // getting the document content textReader = pdoc.getTextAsReader(); if (textReader == null) return null; // reading some text StringBuilder text = new StringBuilder(); this.ioTools.copy(textReader, text, 10240); final Highlighter highlighter = new Highlighter(new QueryScorer(query)); final TokenStream tokenStream = this.analyzer.tokenStream("content", new StringReader(text.toString())); final String result = highlighter.getBestFragments(tokenStream, text.toString(), 3, "..."); return result; } catch (Throwable e) { this.logger.error(e.getMessage(), e); } finally { // closing reader if (textReader != null) { try { textReader.close(); } catch (Exception e) { this.logger.error(e.getMessage(), e); } } } return null; }
From source file:org.sakaiproject.search.component.service.impl.SearchResultImpl.java
License:Educational Community License
/**
 * Rebuilds the digested content for this search hit from the EntityContentProducer(s)
 * referenced by the document (the index no longer stores the digested contents itself),
 * then returns up to 5 highlighted fragments of it, joined with " ... ". On failure a
 * localized error message is returned instead of throwing.
 */
public String getSearchResult() {
    try {
        Scorer scorer = new QueryScorer(query);
        Highlighter hightlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), scorer);
        StringBuilder sb = new StringBuilder();

        // contents no longer contains the digested contents, so we need to
        // fetch it from the EntityContentProducer
        byte[][] references = doc.getBinaryValues(SearchService.FIELD_REFERENCE);
        DigestStorageUtil digestStorageUtil = new DigestStorageUtil(searchService);
        if (references != null && references.length > 0) {
            for (int i = 0; i < references.length; i++) {
                // references are stored compressed; decompress to get the entity reference
                EntityContentProducer sep = searchIndexBuilder
                        .newEntityContentProducer(CompressionTools.decompressString(references[i]));
                if (sep != null) {
                    // does this ecp store its digest on the filesystem?
                    if (sep instanceof StoredDigestContentProducer) {
                        String digestCount = doc.get(SearchService.FIELD_DIGEST_COUNT);
                        if (digestCount == null) {
                            // no stored count: assume the first digest index
                            digestCount = "1";
                        }
                        log.debug("This file possibly has FS digests with index of " + digestCount);
                        // try the filesystem digest store first
                        StringBuilder sb1 = digestStorageUtil.getFileContents(
                                CompressionTools.decompressString(
                                        doc.getBinaryValue(SearchService.FIELD_REFERENCE)),
                                digestCount);
                        if (sb1.length() > 0) {
                            sb.append(sb1);
                        } else {
                            // digest missing from the store: regenerate from the producer ...
                            String digest = sep.getContent(CompressionTools.decompressString(references[i]));
                            sb.append(digest);
                            // ... and save it back so the next lookup hits the store.
                            // NOTE(review): this saves sb.toString() (everything
                            // accumulated so far), not just 'digest' — confirm that is
                            // intended when multiple references are present.
                            digestStorageUtil.saveContentToStore(
                                    CompressionTools.decompressString(
                                            doc.getBinaryValue(SearchService.FIELD_REFERENCE)),
                                    sb.toString(), 1);
                        }
                    } else {
                        // producer keeps content inline: the reference itself decompresses
                        // to the digested content
                        sb.append(CompressionTools.decompressString(references[i]));
                    }
                }
            }
        }

        // highlight the reconstructed content and return the best fragments
        String text = sb.toString();
        TokenStream tokenStream = analyzer.tokenStream(SearchService.FIELD_CONTENTS, new StringReader(text));
        return hightlighter.getBestFragments(tokenStream, text, 5, " ... "); //$NON-NLS-1$
    } catch (IOException e) {
        return Messages.getString("SearchResultImpl.2") + e.getMessage(); //$NON-NLS-1$
    } catch (InvalidTokenOffsetsException e) {
        return Messages.getString("SearchResultResponseImpl.11") + e.getMessage();
    } catch (DataFormatException e) {
        e.printStackTrace();
        return Messages.getString("SearchResultResponseImpl.11") + e.getMessage();
    }
}