Example usage for org.apache.lucene.search.highlight Highlighter getBestFragments

public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments,
        String separator) throws IOException, InvalidTokenOffsetsException 

Source Link


Highlights terms in the text, extracting the most relevant sections and concatenating the chosen fragments with a separator (typically "...").


From source file:org.jboss.seam.wiki.core.search.metamodel.SearchSupport.java


 * Returns the hits of the given query as fragments, highlighted, concatenated, and separated.
 * <p>
 * Pass in a <tt>NullFragmenter</tt> if you don't want any fragmentation by terms but
 * simply the hits highlighted. Otherwise, you will most likely use <tt>SimpleFragmenter</tt>.
 * The text you supply must be the same that was indexed, it will go through the same
 * analysis procedure to find the hits. Do not pass a different String than the one indexed
 * by Hibernate Search! If you use transparent string bridge with Hibernate Search, run the
 * bridge before passing the string into this method.
 * <p>
 * This method escapes any dangerous HTML characters in the indexed text and fragments by
 * replacing it with HTML entities. You can use the returned string directly to build a
 * <tt>SearchHit</tt>.
 * @param query the query that produced hits
 * @param fragmenter a fragmenter that can split the indexed text
 * @param indexedText the original text that was analyzed and indexed by Hibernate Search (after any bridges!)
 * @param numOfFragments the number of fragments to include in the returned result
 * @param alternativeLength if there are no hits to highlight, how many characters of the original text to return
 * @return the fragmented, highlighted, and then concatenated substring of the indexed text
protected String escapeBestFragments(Query query, Fragmenter fragmenter, String indexedText, int numOfFragments,
        int alternativeLength) {

    // The HTML escaping forces us to first fragment with internal placeholders...
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(INTERNAL_BEGIN_HIT, INTERNAL_END_HIT),
            new QueryScorer(query));
    try {
        // Use the same analyzer as the indexer!
        TokenStream tokenStream = new StandardAnalyzer().tokenStream(null, new StringReader(indexedText));

        String unescapedFragements = highlighter.getBestFragments(tokenStream, indexedText, numOfFragments,

        String escapedFragments = WikiUtil.escapeHtml(WikiUtil.removeMacros(unescapedFragements), false, false);

        // .. and then replace the internal placeholders with real tags after HTML has been escaped
        escapedFragments = escapedFragments.replaceAll(INTERNAL_BEGIN_HIT, getBeginHitTag());
        escapedFragments = escapedFragments.replaceAll(INTERNAL_END_HIT, getEndHitTag());

        // Strip out macros

        // If no fragments were produced (no hits), return the original text as an alternative
        if (escapedFragments.length() == 0 && alternativeLength != 0) {
            return WikiUtil.escapeHtml(WikiUtil.removeMacros(indexedText.substring(0,
                    indexedText.length() > alternativeLength ? alternativeLength : indexedText.length())),
                    false, false);
        } else if (escapedFragments.length() == 0 && alternativeLength == 0) {
            return WikiUtil.escapeHtml(WikiUtil.removeMacros(indexedText), false, false);

        return escapedFragments;

    } catch (Exception ex) {
        throw new RuntimeException(ex);

From source file:org.mskcc.pathdb.lucene.LuceneResults.java

License:Open Source License

 * Grabs fragment of lucene field that matches query term & highlights term.
 * Method traverses the lucene fields indexed for match.  If match is not found
 * null is returned.
 * @param doc Lucene Document
 * @param highLighter QueryHighlightExtractor
 * @return String
 * @throws IOException
private String getFragment(Document doc, Highlighter highLighter, String term) throws IOException {
    // NOTE(review): the 'term' parameter is not referenced anywhere in the lines shown here.

    // Fields are tried in this order; the first one that yields a non-empty fragment wins.
    String[] fields = { LuceneConfig.FIELD_ALL, LuceneConfig.FIELD_SYNONYMS, LuceneConfig.FIELD_EXTERNAL_REFS };

    for (String fieldName : fields) {
        //  Get the Field of Interest
        // NOTE(review): the result is dereferenced below without a null check -- confirm that
        // every document carries all three fields.
        Field field = doc.getField(fieldName);

        //  Create the Token Stream
        // NOTE(review): the analyzer is always given FIELD_ALL as the field name, even when
        // fieldName is a different field -- confirm this is intended.
        TokenStream tokenStream = new StandardAnalyzer().tokenStream(LuceneConfig.FIELD_ALL,
                new StringReader(field.stringValue()));

        //  Get the Best Fragment (at most 5 fragments, joined with "...")
        String formattedText = highLighter.getBestFragments(tokenStream, field.stringValue(), 5, "...");
        if (formattedText != null && formattedText.length() > 0) {
            return formattedText;

    // No field produced a non-empty fragment (original author's note: "assume descendent ?").
    return null;

From source file:org.mskcc.pathdb.tool.QueryFullText.java

License:Open Source License

 * Executes Full Text Query.
 * @param term Search Term
 * @throws QueryException Lucene Query Error
 * @throws IOException    I/O Error
 * @throws ParseException Lucene Parsing Error
public static void queryFullText(String term) throws QueryException, IOException, ParseException {
    System.out.println("Using search term:  " + term);
    LuceneReader luceneReader = new LuceneReader();
    Hits hits = luceneReader.executeQuery(term);
    // Show at most the first 10 hits.
    int num = Math.min(10, hits.length());
    System.out.println("Total Number of Hits:  " + hits.length());
    if (hits.length() > 0) {

        //  Standard Analyzer to extract words using a list of English stop words.
        StandardAnalyzer analyzer = new StandardAnalyzer();

        //  Standard Query Parser
        QueryParser queryParser = new QueryParser(LuceneConfig.FIELD_ALL, analyzer);

        // for the usage of highlighting with wildcards
        // Necessary to expand search terms
        // NOTE(review): no close() of this reader is visible in the lines shown here -- confirm
        // it is released further down.
        IndexReader reader = IndexReader.open(new File(LuceneConfig.getLuceneDirectory()));
        Query luceneQuery = queryParser.parse(term);
        luceneQuery = luceneQuery.rewrite(reader);

        //  Scorer implementation which scores text fragments by the number of
        //  unique query terms found.
        QueryScorer queryScorer = new QueryScorer(luceneQuery);

        //  HTML Formatted surrounds matching text with <B></B> tags.
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();

        //  Highlighter Class
        Highlighter highLighter = new Highlighter(htmlFormatter, queryScorer);

        //  Fragmenter constructed with a size argument of 100
        // NOTE(review): 'fragmenter' is not handed to highLighter in the lines shown here --
        // confirm it is attached (e.g. via setTextFragmenter) elsewhere.
        Fragmenter fragmenter = new SimpleFragmenter(100);

        System.out.println("Showing hits:  0-" + (num - 1));
        for (int i = 0; i < num; i++) {
            System.out.print("Hit " + i + ":  ");

            //  Get the Matching Hit
            Document doc = hits.doc(i);

            //  Get the Field of Interest
            Field field = doc.getField(LuceneConfig.FIELD_ALL);

            //  Create the Token Stream
            //  (a fresh StandardAnalyzer is created per hit; the 'analyzer' local above is not reused here)
            TokenStream tokenStream = new StandardAnalyzer().tokenStream(LuceneConfig.FIELD_ALL,
                    new StringReader(field.stringValue()));

            //  Get the Best Fragment (at most 5 fragments, joined with "...")
            String formattedText = highLighter.getBestFragments(tokenStream, field.stringValue(), 5, "...");

From source file:org.olat.search.service.searcher.SearchResultsImpl.java

License:Apache License

 * Highlight (bold,color) query words in result-document. Set HighlightResult for content or description.
 * 
 *
 * @param query
 * @param analyzer
 * @param doc
 * @param resultDocument
 * @throws IOException
private void doHighlight(final Query query, final Analyzer analyzer, final Document doc,
        final ResultDocument resultDocument) throws IOException {
    // Matches are wrapped in HIGHLIGHT_PRE_TAG / HIGHLIGHT_POST_TAG.
    final Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG), new QueryScorer(query));
    // Get the 3 best fragments of the content and separate them with HIGHLIGHT_SEPARATOR
    try {
        // highlight content
        // NOTE(review): the doc.get(...) results below are passed to StringReader without a
        // null check -- confirm the content, description and title fields are always stored.
        final String content = doc.get(AbstractOlatDocument.CONTENT_FIELD_NAME);
        TokenStream tokenStream = analyzer.tokenStream(AbstractOlatDocument.CONTENT_FIELD_NAME,
                new StringReader(content));
        String highlightResult = highlighter.getBestFragments(tokenStream, content, 3, HIGHLIGHT_SEPARATOR);

        // if no highlightResult is in content => look in description
        if (highlightResult.length() == 0) {
            final String description = doc.get(AbstractOlatDocument.DESCRIPTION_FIELD_NAME);
            tokenStream = analyzer.tokenStream(AbstractOlatDocument.DESCRIPTION_FIELD_NAME,
                    new StringReader(description));
            highlightResult = highlighter.getBestFragments(tokenStream, description, 3, HIGHLIGHT_SEPARATOR);

        // highlight title (up to 3 fragments, joined with a single space)
        final String title = doc.get(AbstractOlatDocument.TITLE_FIELD_NAME);
        tokenStream = analyzer.tokenStream(AbstractOlatDocument.TITLE_FIELD_NAME, new StringReader(title));
        final String highlightTitle = highlighter.getBestFragments(tokenStream, title, 3, " ");
    } catch (final InvalidTokenOffsetsException e) {
        // Highlighting failure is logged (with an empty message) and not rethrown.
        log.warn("", e);

From source file:org.opencms.search.documents.CmsTermHighlighterHtml.java

License:Open Source License

 * @see org.opencms.search.documents.I_CmsTermHighlighter#getExcerpt(org.apache.lucene.document.Document, org.opencms.search.CmsSearchIndex, org.opencms.search.CmsSearchParameters, org.apache.lucene.search.Query, org.apache.lucene.analysis.Analyzer)
 */
public String getExcerpt(Document doc, CmsSearchIndex index, CmsSearchParameters params, Query query,
        Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {

    if ((doc == null) || (index == null) || (params == null) || (analyzer == null) || (query == null)) {
        return null;
    Highlighter highlighter = null;
    Iterator<String> excerptFieldNames = index.getFieldConfiguration().getExcerptFieldNames().iterator();
    StringBuffer excerptBuffer = new StringBuffer();
    while (excerptFieldNames.hasNext()) {
        String fieldName = excerptFieldNames.next();
        boolean createExcerpt = !params.isExcerptOnlySearchedFields() || params.getFields().contains(fieldName);
        if (createExcerpt && (doc.getFieldable(fieldName) != null)) {
            // only generate field excerpt if the field is available in the document
            String text = doc.getFieldable(fieldName).stringValue();
            // make sure all XML in the text is escaped, otherwise excerpt HTML output may be garbled
            text = CmsEncoder.escapeXml(text);

            TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));

            if (params.isExcerptOnlySearchedFields()) {
                // highlight the search query only in the matching fields 
                highlighter = new Highlighter(new QueryScorer(query, fieldName));
            } else {
                // highlight search query in all fields
                if (highlighter == null) {
                    highlighter = new Highlighter(new QueryScorer(query));
            String fragment = highlighter.getBestFragments(stream, text, EXCERPT_REQUIRED_FRAGMENTS,

            // kill all unwanted chars in the excerpt
            fragment = fragment.replace('\t', ' ');
            fragment = fragment.replace('\n', ' ');
            fragment = fragment.replace('\r', ' ');
            fragment = fragment.replace('\f', ' ');

            if (excerptBuffer.length() > 0) {
                // this is not the first fragment

    String result = null;
    if (excerptBuffer.length() > 0) {
        result = excerptBuffer.toString();

    int maxLength = OpenCms.getSearchManager().getMaxExcerptLength();
    if ((result != null) && (result.length() > maxLength)) {
        result = result.substring(0, maxLength);

    return result;

From source file:org.openrdf.sail.lucene.LuceneQueryIterator.java

License:BSD License

 * Tries to find the next Bindings Set (results iterator) if there is none currently.
 * It prepares the next permutation of hits, binds the respective variables and
 * evaluates the query tree against the underlying sail. The results iterator is then
 * stored as this.nextBindingSets. If this method fails to provide a next bindings set,
 * it returns false.
 * @return true if it succeeded, false otherwise
private boolean findNextBindingSets() {
    // if there is still a next bindings set, we can safely return
    if (this.nextBindingSets != null)
        return true;

    // check if more permutations are available
    if (this.permutations.isInvalid())
        return false;

    // get the current permutation and the queries
    Vector<Integer> permutation = this.permutations.val();
    Iterator<QuerySpec> queries = this.queries.iterator();

    // this takes the new bindings
    derivedBindings = new QueryBindingSet();

    // for each digit ...
    for (Integer id : permutation) {
        // get the respective query (the query this digit stands for)
        if (!queries.hasNext()) {
            log.warn("There are more permutation digits then there are query specs!");
            return false; // TODO: do we want to return true or false here?
        QuerySpec query = queries.next();

        // if no hits are available, this binding set failed
        if (id <= 0)
            return false;

        // get the hit indicated by the digit value
        Document doc = getDoc(query, id - 1);
        if (doc == null)
            return false; // TODO: do we want to return true or false here?

        // get the score of the hit
        float score = getScore(query, id - 1);

        // bind the respective variables
        String matchVar = query.getMatchesVariableName();
        if (matchVar != null) {
            Resource resource = this.index.getResource(doc);
            Value existing = derivedBindings.getValue(matchVar);
            // if the existing binding contradicts the current binding, then we can safely skip this permutation
            if ((existing != null) && (!existing.stringValue().equals(resource.stringValue()))) {
                // invalidate the binding
                derivedBindings = null;

                // and exit the loop
            derivedBindings.addBinding(matchVar, resource);

        if ((query.getScoreVariableName() != null) && (score > 0.0f))
            derivedBindings.addBinding(query.getScoreVariableName(), scoreToLiteral(score));

        if (query.getSnippetVariableName() != null) {
            // get the highlighter of this query
            Highlighter highlighter = this.highlighters.get(query);
            if (highlighter != null) {
                // extract snippets from
                // Lucene's query results
                StringBuffer result = new StringBuffer();

                // limit to the queried field, if there was one
                String fieldname = LuceneIndex.TEXT_FIELD_NAME;
                if (query.getPropertyURI() != null)
                    fieldname = query.getPropertyURI().toString();
                Field[] fields = doc.getFields(fieldname);
                int lastLen = 0;
                for (Field field : fields) {
                    String text = field.stringValue();
                    TokenStream tokenStream = this.index.getAnalyzer().tokenStream(LuceneIndex.TEXT_FIELD_NAME,
                            new StringReader(text));
                    String next = "";
                    try {
                        next = highlighter.getBestFragments(tokenStream, text, 2, "...");
                    } catch (IOException e) {
                        log.error("IOException while getting snippet for filed " + field.name() + " for query\n"
                                + query, e);
                    } catch (InvalidTokenOffsetsException e) {
                        log.error("InvalidTokenOffsetsException while getting snippet for filed " + field.name()
                                + " for query\n" + query, e);

                    if (next.length() > 0) {
                        if (lastLen > 0) {
                        lastLen = next.length();
                derivedBindings.addBinding(query.getSnippetVariableName(), new LiteralImpl(result.toString()));
            } else {
                        "Lucene Query requests snippet, but no highlighter was generated for it, no snippets will be generated!\n{}",

    // the derived bindings are used to extend the results of the following evaluation (the results do not contain the given bindings)
    // the bindings given to the LuceneSail shall not be included in its results, so we add them here, but won't include them in the results
    QueryBindingSet evaluateBindings = new QueryBindingSet(this.bindings);

    // finally, evaluate the bindings against the underlying store
    try {
        if (derivedBindings != null) {
            this.nextBindingSets = this.sailConn.evaluate(query, derivedBindings, includeInferred);
    } catch (Exception e) {
        log.error("Provided sail connection could not evaluate tuple expression!", e);
        return false; // TODO: do we want to return true or false here?

    // go to the next permutation, if this was the last one,
    // invalidate the permutation instance, which will be checked
    // at the beginning of the next call of the findNextBindingSets method
    if (this.permutations.next()) {

    // we succeeded
    return true;

From source file:org.openrdf.sail.lucene.LuceneSailConnection.java

License:BSD License

 * This method generates bindings from the given result of a Lucene query.
 * 
 *
 * @param query
 *        the Lucene query
 * @param hits
 *        the query result
 * @param highlighter
 *        a Highlighter for the query
 * @return a LinkedHashSet containing generated bindings
 * @throws SailException
private LinkedHashSet<BindingSet> generateBindingSets(QuerySpec query, TopDocs hits, Highlighter highlighter)
        throws SailException {
    // Since one resource can be returned many times, it can lead now to
    // multiple occurrences
    // of the same binding tuple in the BINDINGS clause. This in turn leads to
    // duplicate answers in the original SPARQL query.
    // We want to avoid this, so BindingSets added to the result must be
    // unique.
    LinkedHashSet<BindingSet> bindingSets = new LinkedHashSet<BindingSet>();

    // for each hit ...
    ScoreDoc[] docs = hits.scoreDocs;
    for (int i = 0; i < docs.length; i++) {
        // this takes the new bindings
        QueryBindingSet derivedBindings = new QueryBindingSet();

        // get the current hit
        int docId = docs[i].doc;
        Document doc = getDoc(docId);
        if (doc == null)

        // get the score of the hit
        float score = docs[i].score;

        // bind the respective variables
        String matchVar = query.getMatchesVariableName();
        if (matchVar != null) {
            try {
                Resource resource = this.luceneIndex.getResource(doc);
                Value existing = derivedBindings.getValue(matchVar);
                // if the existing binding contradicts the current binding, then
                // we can safely skip this permutation
                if ((existing != null) && (!existing.stringValue().equals(resource.stringValue()))) {
                    // invalidate the binding
                    derivedBindings = null;

                    // and exit the loop
                derivedBindings.addBinding(matchVar, resource);
            } catch (NullPointerException e) {
                SailException e1 = new SailException(
                        "NullPointerException when retrieving a resource from LuceneSail. Possible cause is the obsolete index structure. Re-creating the index can help",
                logger.debug("Details: ", e);
                throw e1;

        if ((query.getScoreVariableName() != null) && (score > 0.0f))
            derivedBindings.addBinding(query.getScoreVariableName(), scoreToLiteral(score));

        if (query.getSnippetVariableName() != null) {
            if (highlighter != null) {
                // limit to the queried field, if there was one
                Fieldable[] fields;
                if (query.getPropertyURI() != null) {
                    String fieldname = query.getPropertyURI().toString();
                    fields = doc.getFieldables(fieldname);
                } else {
                    fields = this.luceneIndex.getPropertyFields(doc.getFields());

                // extract snippets from Lucene's query results
                for (Fieldable field : fields) {
                    // create an individual binding set for each snippet
                    QueryBindingSet snippetBindings = new QueryBindingSet(derivedBindings);

                    String text = field.stringValue();
                    TokenStream tokenStream = this.luceneIndex.getAnalyzer().tokenStream(field.name(),
                            new StringReader(text));

                    String fragments = null;
                    try {
                        fragments = highlighter.getBestFragments(tokenStream, text, 2, "...");
                    } catch (Exception e) {
                        logger.error("Exception while getting snippet for filed " + field.name()
                                + " for query\n" + query, e);

                    if (fragments != null && !fragments.isEmpty()) {
                                new SimpleLiteral(fragments));

                        if (query.getPropertyVariableName() != null && query.getPropertyURI() == null) {
                                    new SimpleIRI(field.name()));

            } else {
                        "Lucene Query requests snippet, but no highlighter was generated for it, no snippets will be generated!\n{}",
        } else {

    // we succeeded
    return bindingSets;

From source file:org.openrdf.sail.lucene3.LuceneIndex.java

License:BSD License

/**
 * Builds a highlighted snippet for the given text.
 * <p>
 * The text is tokenized with the analyzer returned by {@code getAnalyzer()} under the given
 * field name, and the highlighter is asked for at most 2 best fragments joined with "...".
 *
 * @param fieldName   field name handed to the analyzer; also used in the error log message
 * @param text        the text to tokenize and highlight
 * @param highlighter the highlighter used to extract the fragments
 * @return the string returned by the highlighter, or {@code null} if any exception was thrown
 */
public String getSnippet(String fieldName, String text, Highlighter highlighter) {
    String snippet;
    try {
        TokenStream tokenStream = getAnalyzer().tokenStream(fieldName, new StringReader(text));
        snippet = highlighter.getBestFragments(tokenStream, text, 2, "...");
    } catch (Exception e) {
        // Best effort: any failure is logged and reported to the caller as a null snippet.
        logger.error("Exception while getting snippet for field " + fieldName, e);
        snippet = null;
    return snippet;

From source file:org.paxle.se.index.lucene.impl.SnippetFetcher.java

License:Open Source License

public String getSnippet(Query query, String locationStr) {
    Reader textReader = null;
    try {
        // creating a dummy command
        URI locationURI = URI.create(locationStr);
        ICommand cmd = this.docFactory.createDocument(ICommand.class);

        // crawling the resource
        if (cmd.getResult() != Result.Passed)
            return null;

        // parsing the resource
        if (cmd.getResult() != Result.Passed)
            return null;

        // trying to get the parsed content
        IParserDocument pdoc = cmd.getParserDocument();
        if (pdoc == null)
            return null;
        else if (pdoc.getStatus() != Status.OK)
            return null;

        // getting the document content
        textReader = pdoc.getTextAsReader();
        if (textReader == null)
            return null;

        // reading some text
        StringBuilder text = new StringBuilder();
        this.ioTools.copy(textReader, text, 10240);

        final Highlighter highlighter = new Highlighter(new QueryScorer(query));
        final TokenStream tokenStream = this.analyzer.tokenStream("content", new StringReader(text.toString()));
        final String result = highlighter.getBestFragments(tokenStream, text.toString(), 3, "...");

        return result;
    } catch (Throwable e) {
        this.logger.error(e.getMessage(), e);
    } finally {
        // closing reader
        if (textReader != null) {
            try {
            } catch (Exception e) {
                this.logger.error(e.getMessage(), e);

    return null;

From source file:org.sakaiproject.search.component.service.impl.SearchResultImpl.java

License:Educational Community License

public String getSearchResult() {
    try {
        Scorer scorer = new QueryScorer(query);
        Highlighter hightlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), scorer);
        StringBuilder sb = new StringBuilder();
        // contents no longer contains the digested contents, so we need to
        // fetch it from the EntityContentProducer

        byte[][] references = doc.getBinaryValues(SearchService.FIELD_REFERENCE);
        DigestStorageUtil digestStorageUtil = new DigestStorageUtil(searchService);
        if (references != null && references.length > 0) {

            for (int i = 0; i < references.length; i++) {
                EntityContentProducer sep = searchIndexBuilder
                if (sep != null) {
                    //does this ecp store on the FS?
                    if (sep instanceof StoredDigestContentProducer) {
                        String digestCount = doc.get(SearchService.FIELD_DIGEST_COUNT);
                        if (digestCount == null) {
                            digestCount = "1";
                        log.debug("This file possibly has FS digests with index of " + digestCount);
                        StringBuilder sb1 = digestStorageUtil.getFileContents(CompressionTools.decompressString(
                                doc.getBinaryValue(SearchService.FIELD_REFERENCE)), digestCount);
                        if (sb1.length() > 0) {

                        } else {
                            String digest = sep.getContent(CompressionTools.decompressString(references[i]));
                            //we need to save this
                                    sb.toString(), 1);


                    } else {

        String text = sb.toString();
        TokenStream tokenStream = analyzer.tokenStream(SearchService.FIELD_CONTENTS, new StringReader(text));
        return hightlighter.getBestFragments(tokenStream, text, 5, " ... "); //$NON-NLS-1$
    } catch (IOException e) {
        return Messages.getString("SearchResultImpl.2") + e.getMessage(); //$NON-NLS-1$
    } catch (InvalidTokenOffsetsException e) {
        return Messages.getString("SearchResultResponseImpl.11") + e.getMessage();
    } catch (DataFormatException e) {
        return Messages.getString("SearchResultResponseImpl.11") + e.getMessage();