org.apache.lucene.search.highlight.Highlighter.java Source code

Introduction

Here is the source code for org.apache.lucene.search.highlight.Highlighter.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.highlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Objects;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.PriorityQueue;

/**
 * Marks up highlighted terms found in the best sections of
 * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
 * {@link Encoder} and tokenizers.
 *
 * This is Lucene's original Highlighter; there are others.
 */
public class Highlighter {
    /** Default value of the analysis limit (in characters) applied to token start offsets: 50 * 1024. */
    public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50 * 1024;

    // Wraps each group of overlapping tokens in its highlighted form.
    private Formatter formatter;
    // Encodes every piece of original text before it is appended to the output.
    private Encoder encoder;
    // Supplies per-token and per-fragment scores while the text is analysed.
    private Scorer fragmentScorer;
    // Analysis stops at the first token whose start offset reaches this value.
    private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
    // Decides where one fragment ends and the next begins.
    private Fragmenter textFragmenter = new SimpleFragmenter();

    /**
     * Creates a highlighter that formats matches with a new {@link SimpleHTMLFormatter}.
     *
     * @param fragmentScorer the scorer used to rate tokens and fragments; must not be null
     * @throws IllegalArgumentException if <code>fragmentScorer</code> is null
     */
    public Highlighter(Scorer fragmentScorer) {
        this(new SimpleHTMLFormatter(), fragmentScorer);
    }

    /**
     * Creates a highlighter that encodes text with a new {@link DefaultEncoder}.
     *
     * @param formatter      the formatter used to mark up highlighted terms; must not be null
     * @param fragmentScorer the scorer used to rate tokens and fragments; must not be null
     * @throws IllegalArgumentException if any argument is null
     */
    public Highlighter(Formatter formatter, Scorer fragmentScorer) {
        this(formatter, new DefaultEncoder(), fragmentScorer);
    }

    /**
     * Creates a highlighter from explicitly supplied collaborators.
     *
     * @param formatter      the formatter used to mark up highlighted terms; must not be null
     * @param encoder        the encoder applied to text before it is emitted; must not be null
     * @param fragmentScorer the scorer used to rate tokens and fragments; must not be null
     * @throws IllegalArgumentException if any argument is null
     */
    public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) {
        // Each argument is validated right before it is stored; the checks still run
        // in declaration order, so the first null argument determines the message.
        ensureArgumentNotNull(formatter, "'formatter' must not be null");
        this.formatter = formatter;

        ensureArgumentNotNull(encoder, "'encoder' must not be null");
        this.encoder = encoder;

        ensureArgumentNotNull(fragmentScorer, "'fragmentScorer' must not be null");
        this.fragmentScorer = fragmentScorer;
    }

    /**
     * Highlights chosen terms in a text and returns the single most relevant section.
     * Convenience overload that builds a token stream from <code>analyzer</code> and then calls
     * {@link #getBestFragment(TokenStream, String)}.
     *
     * @param analyzer  the analyzer that will be used to split <code>text</code> into chunks
     * @param fieldName name of field used to influence analyzer's tokenization policy
     * @param text      text to highlight terms in
     *
     * @return highlighted text fragment or null if no terms found
     * @throws IOException If there is a low-level I/O error
     * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
     */
    public final String getBestFragment(Analyzer analyzer, String fieldName, String text)
            throws IOException, InvalidTokenOffsetsException {
        return getBestFragment(analyzer.tokenStream(fieldName, text), text);
    }

    /**
     * Highlights chosen terms in a text and returns the single most relevant section.
     * Delegates to {@link #getBestFragments(TokenStream, String, int)} asking for one
     * fragment and returns it, if any was produced.
     *
     * @param tokenStream a stream of tokens identified in the text parameter, including offset
     * information. This is typically produced by an analyzer re-parsing a document's text.
     * @param text text to highlight terms in
     *
     * @return highlighted text fragment or null if no terms found
     * @throws IOException If there is a low-level I/O error
     * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
     */
    public final String getBestFragment(TokenStream tokenStream, String text)
            throws IOException, InvalidTokenOffsetsException {
        final String[] best = getBestFragments(tokenStream, text, 1);
        return (best.length == 0) ? null : best[0];
    }

    /**
     * Highlights chosen terms in a text and returns the most relevant sections.
     * Convenience overload that builds a token stream from <code>analyzer</code> and then calls
     * {@link #getBestFragments(TokenStream, String, int)}.
     *
     * @param analyzer         the analyzer that will be used to split <code>text</code> into chunks
     * @param fieldName        the name of the field being highlighted (used by analyzer)
     * @param text             text to highlight terms in
     * @param maxNumFragments  the maximum number of fragments.
     *
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     * @throws IOException If there is a low-level I/O error
     * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
     */
    public final String[] getBestFragments(Analyzer analyzer, String fieldName, String text, int maxNumFragments)
            throws IOException, InvalidTokenOffsetsException {
        return getBestFragments(analyzer.tokenStream(fieldName, text), text, maxNumFragments);
    }

    /**
     * Highlights chosen terms in a text and returns the most relevant sections as strings.
     * The fragments are obtained from
     * {@link #getBestTextFragments(TokenStream, String, boolean, int)} with merging of
     * contiguous fragments enabled; only fragments with a score greater than zero are returned.
     *
     * @param tokenStream      a stream of tokens identified in the text parameter, including offset information
     * @param text             text to highlight terms in
     * @param maxNumFragments  the maximum number of fragments; values below 1 are treated as 1
     *
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     * @throws IOException If there is a low-level I/O error
     * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
     */
    public final String[] getBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
            throws IOException, InvalidTokenOffsetsException {
        // Sanity check: always request at least one fragment.
        final int fragmentLimit = Math.max(1, maxNumFragments);
        final TextFragment[] candidates = getBestTextFragments(tokenStream, text, true, fragmentLimit);

        final ArrayList<String> rendered = new ArrayList<>();
        for (TextFragment candidate : candidates) {
            // Skip empty slots and anything that did not score above zero.
            if ((candidate == null) || !(candidate.getScore() > 0)) {
                continue;
            }
            rendered.add(candidate.toString());
        }
        return rendered.toArray(new String[0]);
    }

    /**
     * Low level api to get the most relevant (formatted) sections of the document.
     * This method has been made public to allow visibility of score information held in TextFragment objects.
     * Thanks to Jason Calabrese for help in redefining the interface.
     *
     * <p>Tokens are consumed until the stream is exhausted or a token starts at or beyond
     * {@link #getMaxDocCharsToAnalyze()}. Text following the last analysed token is appended
     * to the final fragment only when the whole text is no longer than that limit.
     *
     * @param tokenStream a stream of tokens identified in the text parameter, including offset
     *        information. The stream that is consumed (this one, or the replacement returned by
     *        the scorer's <code>init</code>) is ended and closed before this method returns.
     * @param text text to highlight terms in
     * @param mergeContiguousFragments if true, fragments that were adjacent in the original text
     *        are merged and only fragments with a score greater than zero are returned
     * @param maxNumFragments the capacity of the queue used to rank fragments
     * @return the selected fragments, ordered as they come off the ranking queue (best first)
     * @throws IOException If there is a low-level I/O error
     * @throws InvalidTokenOffsetsException thrown if any token's start or end offset exceeds the provided text's length
     */
    public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
            boolean mergeContiguousFragments, int maxNumFragments)
            throws IOException, InvalidTokenOffsetsException {
        ArrayList<TextFragment> docFrags = new ArrayList<>();
        StringBuilder newText = new StringBuilder();

        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
        TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

        if (fragmentScorer instanceof QueryScorer) {
            ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
        }

        // The scorer may hand back a replacement stream; if so, consume that one instead.
        TokenStream newStream = fragmentScorer.init(tokenStream);
        if (newStream != null) {
            tokenStream = newStream;
        }
        fragmentScorer.startFragment(currentFrag);
        docFrags.add(currentFrag);

        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

        try {
            int lastEndOffset = 0;
            textFragmenter.start(text, tokenStream);

            TokenGroup tokenGroup = new TokenGroup(tokenStream);

            tokenStream.reset();
            for (boolean next = tokenStream.incrementToken(); next
                    && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
                if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                            + " exceeds length of provided text sized " + text.length());
                }
                if ((tokenGroup.getNumTokens() > 0) && (tokenGroup.isDistinct())) {
                    // the current token is distinct from previous tokens -
                    // markup the cached token group info
                    lastEndOffset = appendTokenGroup(text, tokenGroup, newText, lastEndOffset);
                    tokenGroup.clear();

                    // check if current token marks the start of a new fragment
                    if (textFragmenter.isNewFragment()) {
                        currentFrag.setScore(fragmentScorer.getFragmentScore());
                        // record stats for a new fragment
                        currentFrag.textEndPos = newText.length();
                        currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                        fragmentScorer.startFragment(currentFrag);
                        docFrags.add(currentFrag);
                    }
                }

                tokenGroup.addToken(fragmentScorer.getTokenScore());
            }
            currentFrag.setScore(fragmentScorer.getFragmentScore());

            if (tokenGroup.getNumTokens() > 0) {
                // flush the token group that was still being accumulated when the loop ended
                lastEndOffset = appendTokenGroup(text, tokenGroup, newText, lastEndOffset);
            }

            // Test what remains of the original text beyond the point where we stopped analyzing:
            // if there is text beyond the last token considered and that text is not too large,
            // append it to the last fragment
            if ((lastEndOffset < text.length()) && (text.length() <= maxDocCharsToAnalyze)) {
                newText.append(encoder.encodeText(text.substring(lastEndOffset)));
            }

            currentFrag.textEndPos = newText.length();

            // rank the fragments; the queue keeps at most maxNumFragments of them
            for (TextFragment docFrag : docFrags) {
                fragQueue.insertWithOverflow(docFrag);
            }

            // return the most relevant fragments; the array is filled from the back
            // as the queue is popped
            TextFragment[] frag = new TextFragment[fragQueue.size()];
            for (int i = frag.length - 1; i >= 0; i--) {
                frag[i] = fragQueue.pop();
            }

            // merge any contiguous fragments to improve readability
            if (mergeContiguousFragments) {
                mergeContiguousFragments(frag);
                // merging leaves null slots behind; drop them along with non-scoring fragments
                ArrayList<TextFragment> fragTexts = new ArrayList<>();
                for (TextFragment candidate : frag) {
                    if ((candidate != null) && (candidate.getScore() > 0)) {
                        fragTexts.add(candidate);
                    }
                }
                frag = fragTexts.toArray(new TextFragment[0]);
            }

            return frag;

        } finally {
            if (tokenStream != null) {
                // Clean-up is best-effort: failures are deliberately ignored so they cannot
                // replace the result or the exception produced above. end() and close() are
                // attempted independently so that a failing end() does not skip close().
                try {
                    tokenStream.end();
                } catch (Exception ignored) {
                    // intentionally ignored, see above
                }
                try {
                    tokenStream.close();
                } catch (Exception ignored) {
                    // intentionally ignored, see above
                }
            }
        }
    }

    /**
     * Appends the marked-up form of the tokens cached in <code>tokenGroup</code> to
     * <code>newText</code>, preceded by the encoded original text (whitespace etc.) lying
     * between <code>lastEndOffset</code> and the start of the group.
     *
     * @param text          the original text the token offsets refer to
     * @param tokenGroup    the group of tokens to mark up
     * @param newText       the buffer receiving the encoded, marked-up output
     * @param lastEndOffset offset in <code>text</code> up to which output has already been written
     * @return the larger of <code>lastEndOffset</code> and the group's end offset
     */
    private int appendTokenGroup(String text, TokenGroup tokenGroup, StringBuilder newText, int lastEndOffset) {
        int startOffset = tokenGroup.getStartOffset();
        int endOffset = tokenGroup.getEndOffset();
        String tokenText = text.substring(startOffset, endOffset);
        String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
        // store any whitespace etc from between this and last group
        if (startOffset > lastEndOffset) {
            newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
        }
        newText.append(markedUpText);
        return Math.max(endOffset, lastEndOffset);
    }

    /**
     * Improves readability of a score-sorted list of TextFragments by merging any fragments
     * that were contiguous in the original text into one larger fragment with the correct order.
     * This will leave a "null" in the array entry for the lesser scored fragment.
     *
     * <p>The array is rescanned until a complete pass performs no merge. A fragment is never
     * compared with itself: such a comparison cannot produce a real merge, and if a fragment
     * ever reported that it follows itself the scan would flag progress on every pass and
     * never terminate.
     *
     * @param frag An array of document fragments in descending score
     */
    private void mergeContiguousFragments(TextFragment[] frag) {
        if (frag.length <= 1) {
            // nothing to merge with
            return;
        }
        boolean mergingStillBeingDone;
        do {
            mergingStillBeingDone = false; //initialise loop control flag
            //for each fragment, scan other frags looking for contiguous blocks
            for (int i = 0; i < frag.length; i++) {
                if (frag[i] == null) {
                    continue;
                }
                //merge any contiguous blocks
                for (int x = 0; x < frag.length; x++) {
                    if (frag[i] == null) {
                        // frag[i] was merged away by an earlier iteration of this inner loop
                        break;
                    }
                    if ((x == i) || (frag[x] == null)) {
                        continue;
                    }
                    // frag1 is the fragment that comes first in the text, frag2 the one after it
                    TextFragment frag1;
                    TextFragment frag2;
                    int frag1Num;
                    int frag2Num;
                    //if blocks are contiguous....
                    if (frag[i].follows(frag[x])) {
                        frag1 = frag[x];
                        frag1Num = x;
                        frag2 = frag[i];
                        frag2Num = i;
                    } else if (frag[x].follows(frag[i])) {
                        frag1 = frag[i];
                        frag1Num = i;
                        frag2 = frag[x];
                        frag2Num = x;
                    } else {
                        continue;
                    }
                    //merging required..
                    // decide which slot survives before merge() touches either fragment
                    int bestScoringFragNum;
                    int worstScoringFragNum;
                    if (frag1.getScore() > frag2.getScore()) {
                        bestScoringFragNum = frag1Num;
                        worstScoringFragNum = frag2Num;
                    } else {
                        bestScoringFragNum = frag2Num;
                        worstScoringFragNum = frag1Num;
                    }
                    frag1.merge(frag2);
                    frag[worstScoringFragNum] = null;
                    frag[bestScoringFragNum] = frag1;
                    mergingStillBeingDone = true;
                }
            }
        } while (mergingStillBeingDone);
    }

    /**
     * Highlights terms in the text, extracting the most relevant sections
     * and concatenating the chosen fragments with a separator (typically "...").
     * The fragments come from {@link #getBestFragments(TokenStream, String, int)} and are
     * joined in the order in which that method returns them.
     *
     * @param tokenStream      a stream of tokens identified in the text parameter, including offset information
     * @param text             text to highlight terms in
     * @param maxNumFragments  the maximum number of fragments.
     * @param separator        the separator used to intersperse the document fragments (typically "...")
     *
     * @return highlighted text
     * @throws IOException If there is a low-level I/O error
     * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
     */
    public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments,
            String separator) throws IOException, InvalidTokenOffsetsException {
        final String[] sections = getBestFragments(tokenStream, text, maxNumFragments);
        final StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (String section : sections) {
            // the separator goes between sections, never before the first one
            if (!first) {
                joined.append(separator);
            }
            joined.append(section);
            first = false;
        }
        return joined.toString();
    }

    /**
     * @return the limit on token start offsets beyond which the text is no longer analysed
     */
    public int getMaxDocCharsToAnalyze() {
        return this.maxDocCharsToAnalyze;
    }

    /**
     * Sets the limit on token start offsets beyond which the text is no longer analysed.
     * The value is stored as given; no range check is performed here.
     *
     * @param maxDocCharsToAnalyze the new limit, in characters
     */
    public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
        this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
    }

    /**
     * @return the fragmenter used to decide where fragments start
     */
    public Fragmenter getTextFragmenter() {
        return this.textFragmenter;
    }

    /**
     * Replaces the fragmenter used to decide where fragments start.
     *
     * @param fragmenter the new fragmenter; must not be null
     * @throws NullPointerException if <code>fragmenter</code> is null
     */
    public void setTextFragmenter(Fragmenter fragmenter) {
        Objects.requireNonNull(fragmenter);
        this.textFragmenter = fragmenter;
    }

    /**
     * @return Object used to score each text fragment
     */
    public Scorer getFragmentScorer() {
        return this.fragmentScorer;
    }

    /**
     * Replaces the object used to score each text fragment.
     *
     * @param scorer the new scorer; must not be null
     * @throws NullPointerException if <code>scorer</code> is null
     */
    public void setFragmentScorer(Scorer scorer) {
        Objects.requireNonNull(scorer);
        this.fragmentScorer = scorer;
    }

    /**
     * @return the encoder applied to text before it is added to the output
     */
    public Encoder getEncoder() {
        return this.encoder;
    }

    /**
     * Replaces the encoder applied to text before it is added to the output.
     *
     * @param encoder the new encoder; must not be null
     * @throws NullPointerException if <code>encoder</code> is null
     */
    public void setEncoder(Encoder encoder) {
        Objects.requireNonNull(encoder);
        this.encoder = encoder;
    }

    /**
     * Rejects a null argument by throwing an IllegalArgumentException carrying the given message;
     * returns normally for any non-null argument.
     *
     * @param argument the argument to be null-checked
     * @param message  the message of the exception thrown if argument == null
     */
    private static void ensureArgumentNotNull(Object argument, String message) {
        if (argument != null) {
            return;
        }
        throw new IllegalArgumentException(message);
    }

    /**
     * Priority queue of fragments ordered by score; between fragments with the same score,
     * the one with the higher fragment number is considered the lesser.
     */
    static class FragmentQueue extends PriorityQueue<TextFragment> {
        FragmentQueue(int size) {
            super(size);
        }

        /**
         * @return true if <code>fragA</code> ranks below <code>fragB</code>
         */
        @Override
        public final boolean lessThan(TextFragment fragA, TextFragment fragB) {
            if (fragA.getScore() != fragB.getScore()) {
                return fragA.getScore() < fragB.getScore();
            }
            // same score: the later fragment ranks lower
            return fragA.fragNum > fragB.fragNum;
        }
    }
}