public final String getBestFragment(TokenStream tokenStream, String text)
        throws IOException, InvalidTokenOffsetsException 

Source Link


Highlights chosen terms in a text, extracting the most relevant section.


From source file:com.paladin.common.Tools.java

License:Apache License

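/**
 * @param _query   query whose terms are scored for highlighting
 * @param _field   field name handed to the analyzer when tokenizing the content
 * @param _content text to tokenize and highlight
 * @return the result of {@code Highlighter.getBestFragment} for the content
 */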
public static String highlight(final Query _query, final String _field, final String _content) {
    // NOTE(review): this excerpt is incomplete - the closing braces of the catch clauses
    // and of the method are missing, so the exact scope of "return null" cannot be confirmed.

    // Fragments are scored against the terms of the supplied query.
    Scorer scorer = new QueryScorer(_query);
    // Matches are wrapped between Constants.HIGHLIGHT_STYLE and "</span>".
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(Constants.HIGHLIGHT_STYLE, "</span>");
    Highlighter hl = new Highlighter(formatter, scorer);
    // Tokenize the content with a fresh IKAnalyzer for the given field.
    TokenStream tokens = new IKAnalyzer().tokenStream(_field, new StringReader(_content));
    try {
        return hl.getBestFragment(tokens, _content);
    } catch (IOException e) {
        // NOTE(review): no handling is visible here - the exception appears to be swallowed.
    } catch (InvalidTokenOffsetsException e) {
        // NOTE(review): no handling is visible here - the exception appears to be swallowed.
    // Presumably the fallback result after a caught exception - confirm against the full source.
    return null;

From source file:com.searchlocal.lucene.ContentSearcher.java

License:Open Source License

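/**
 * @param param search parameters; the index path, keyword, start row and end row are read from it
 * @param indexlocal (original description lost to an encoding error; no parameter of this name appears in the signature)
 * @return list of result beans
 */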
public static List<ResultBean> query(SearchParam param) throws IOException, LogicException {
    // NOTE(review): this excerpt is incomplete - several closing braces and possibly other
    // lines are missing. `fsd`, `is` and CONTENTS_SHOW_LENGTH are declared outside this method.

    // Index location comes from the search parameters.
    String indexPath = param.getIndexPath();
    // Open the directory only when `fsd` has not been set yet.
    if (null == fsd) {
        fsd = SimpleFSDirectory.open(new File(indexPath));
    List<ResultBean> beanList = new ArrayList<ResultBean>();
    try {
        // Parse the keyword from the search parameters into a query on the "content" field.
        Analyzer analyzer = new PaodingAnalyzer();
        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content", analyzer);
        Query query;
        query = parser.parse(param.getKeyWord());

        // Presumably caps collection at the 100 top-scoring hits - confirm against the Lucene version in use.
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        // Create the searcher only when `is` has not been set yet, then run the search.
        if (null == is) {
            is = new IndexSearcher(fsd, true);
        is.search(query, collector);

        ScoreDoc[] scoreDoc = collector.topDocs().scoreDocs;

        // Matches are wrapped in a red <font> element.
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");

        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(CONTENTS_SHOW_LENGTH));

        // Nothing matched: return the (empty) list.
        if (scoreDoc.length == 0) {
            return beanList;
        // Page through the hits: [startRow, endRow), with endRow clamped to the number of hits.
        int startRow = param.getStartRow();
        int endRow = param.getEndRow();
        endRow = scoreDoc.length > endRow ? endRow : scoreDoc.length;
        for (int i = startRow; i < endRow; i++) {
            Document doc = is.doc(scoreDoc[i].doc);
            String content = doc.get("content");
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
            // Replace the stored content with its best highlighted fragment.
            content = highlighter.getBestFragment(tokenStream, content);
            // NOTE(review): the bean is never added to beanList in the visible lines - a line looks missing.
            ResultBean bean = BeanUtil.getBean(doc, content);
    } catch (IOException e) {
        // TODO Auto-generated catch block
    } catch (InvalidTokenOffsetsException e) {
        // TODO Auto-generated catch block
    } catch (ParseException e) {
        // NOTE(review): no handling is visible in any of these catch clauses.
    return beanList;

From source file:de.innovationgate.wga.server.api.Lucene.java

License:Open Source License

/**
 * Highlights {@code originalText} for a Lucene field and returns the result as a singleton list.
 * <p>
 * In the visible code the unmodified {@code originalText} is returned (as a singleton list) when
 * Lucene is not enabled, when no query is available, when {@code getBestFragment} yields
 * {@code null}, or when highlighting throws an {@code IOException} or
 * {@code InvalidTokenOffsetsException}.
 * <p>
 * NOTE(review): this excerpt is incomplete - closing braces are missing and the statement that
 * reads the query from the session is cut off.
 *
 * @param field        field name passed to {@code createHighlighter}
 * @param originalText text to highlight
 * @param prefix       text substituted for the prefix placeholder around each match
 * @param suffix       text substituted for the suffix placeholder around each match
 * @return singleton list holding either the highlighted text or {@code originalText}
 * @throws WGException declared by the signature; no visible line throws it directly
 */
public List<String> highlightLuceneField(String field, String originalText, String prefix, String suffix)
        throws WGException {
    WGACore core = _wga.getCore();
    if (!core.isLuceneEnabled()) {
        _wga.getLog().warn("Unable to highlight text bc. lucene is not enabled.");
        return Collections.singletonList(originalText);
    // try to retrieve last lucene query for highlighting
    // NOTE(review): the statement below is truncated - the call made on the session is missing.
    org.apache.lucene.search.Query query = (org.apache.lucene.search.Query) _wga.getRequest().getSession()
    if (query == null) {
        // no query in session - highlighting not possible
        return Collections.singletonList(originalText);

    // create htmlformatter to highlight fragments with "$HIGHLIGHT_PREFIX$", "$HIGHLIGHT_SUFFIX$"
    // these placeholders are later on replaced by the given prefix and suffix
    // this additional step is necessary to encode the fragment text properly
    String prefixPlaceholder = "$HIGHLIGHT_PREFIX$";
    String suffixPlaceholder = "$HIGHLIGHT_SUFFIX$";
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(prefixPlaceholder, suffixPlaceholder);

    // create highlighter
    Highlighter highlighter = core.getLuceneManager().createHighlighter(field, query, formatter);

    // create tokenstream
    TokenStream tokenStream = core.getLuceneManager().createTokenStream(originalText, _cx.content());

    // create fragmenter and set fragmentsize to metaText.length to ensure only one fragments with the whole metaText is returned        
    // NOTE(review): `fragmenter` is not handed to the highlighter in the visible lines - a setTextFragmenter call may be missing.
    Fragmenter fragmenter = new SimpleFragmenter(originalText.length() + 1); // +1 is necessary here 

    try {
        String highlighted = highlighter.getBestFragment(tokenStream, originalText);
        if (highlighted != null) {

            // replace highlight placeholders with correct prefix and suffix
            highlighted = WGUtils.strReplace(highlighted, prefixPlaceholder, prefix, true);
            highlighted = WGUtils.strReplace(highlighted, suffixPlaceholder, suffix, true);

            return Collections.singletonList(highlighted);
        } else {
            // no fragment produced - fall back to the original text
            return Collections.singletonList(originalText);
    } catch (IOException e) {
        _wga.getLog().warn("Unable to highlight text bc. of exception '" + e.getMessage() + "'.");
        return Collections.singletonList(originalText);
    } catch (InvalidTokenOffsetsException e) {
        _wga.getLog().warn("Unable to highlight meta text bc. of exception '" + e.getMessage() + "'.");
        return Collections.singletonList(originalText);


From source file:de.innovationgate.wgpublisher.webtml.utils.TMLContext.java

License:Open Source License

/**
 * Returns the text value of an item with query matches wrapped in {@code prefix} / {@code suffix}.
 * <p>
 * In the visible code {@code null} is returned when {@code name} or the item text is
 * {@code null}; the unhighlighted item text is returned when Lucene is not enabled, when no
 * query is available, or (via the final return) when highlighting yields nothing or throws.
 * <p>
 * NOTE(review): this excerpt is incomplete - closing braces are missing and three statements
 * (session lookup, a warning call, configuration lookup) are cut off.
 *
 * @param name   item name; lower-cased before use
 * @param prefix text substituted for the prefix placeholder around each match
 * @param suffix text substituted for the suffix placeholder around each match
 * @param encode passed through to {@code itemTextValue}
 * @return highlighted item text, the original item text, or {@code null}
 * @throws WGAPIException declared by the signature; no visible line throws it directly
 */
public String highlightitem(String name, String prefix, String suffix, String encode) throws WGAPIException {
    if (name == null) {
        return null;
    }

    // lowercase name
    name = name.toLowerCase();

    // retrieve itemtext
    String originalText = itemTextValue(name, encode);
    if (originalText == null) {
        return null;

    if (!getwgacore().isLuceneEnabled()) {
        addwarning("Unable to highlight item '" + name + "' bc. lucene is not enabled.");
        return originalText;
    // try to retrieve last lucene query for highlighting
    // NOTE(review): the statement below is truncated - the call made on the session is missing.
    org.apache.lucene.search.Query query = (org.apache.lucene.search.Query) getrequest().getSession()
    if (query == null) {
        // no query in session - highlighting not possible
        // NOTE(review): the line that opens the call taking this message is missing from the excerpt.
                "Lucene highlighting not possible because there is no query with enabled highlighting support");
        return originalText;

    // create htmlformatter to highlight fragments with "$HIGHLIGHT_PREFIX$", "$HIGHLIGHT_SUFFIX$"
    // these placeholders are later on replaced by the given prefix and suffix
    // this additional step is necessary to encode the fragment text properly
    // see F00004C66
    String prefixPlaceholder = "$HIGHLIGHT_PREFIX$";
    String suffixPlaceholder = "$HIGHLIGHT_SUFFIX$";
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(prefixPlaceholder, suffixPlaceholder);

    // create highlighter
    Highlighter highlighter = getwgacore().getLuceneManager().createHighlighter(name, query, formatter);

    // create text to analyze
    // NOTE(review): the statement below is truncated - the call that yields the configuration is missing.
    LuceneConfiguration config = getwgacore().getLuceneManager()
    LuceneIndexItemRule rule = config.getMatchingItemRule(name);
    String analyzeText = rule.parseItemValue(originalText);

    // create tokenstream
    // the token stream is built from analyzeText, while getBestFragment below receives originalText
    TokenStream tokenStream = getwgacore().getLuceneManager().createTokenStream(analyzeText, content());

    // create fragmenter and set fragmentsize to itemText.length to ensure only one fragments with the whole itemText is returned        
    // NOTE(review): `fragmenter` is not handed to the highlighter in the visible lines - a setTextFragmenter call may be missing.
    Fragmenter fragmenter = new SimpleFragmenter(originalText.length() + 1); // if analyzeText.length == originalText.length we might get two fragments from lucene without the +1 (possible lucene bug)

    try {
        String highlighted = highlighter.getBestFragment(tokenStream, originalText);
        if (highlighted != null) {
            // replace highlight placeholders with correct prefix and suffix
            highlighted = WGUtils.strReplace(highlighted, prefixPlaceholder, prefix, true);
            highlighted = WGUtils.strReplace(highlighted, suffixPlaceholder, suffix, true);

            return highlighted;
    } catch (IOException e) {
        addwarning("Unable to highlight item '" + name + "' bc. of exception '" + e.getMessage() + "'.");
    } catch (InvalidTokenOffsetsException e) {
        addwarning("Unable to highlight item '" + name + "' bc. of exception '" + e.getMessage() + "'.");

    // fallback: no fragment was produced or highlighting failed
    return originalText;


From source file:de.innovationgate.wgpublisher.webtml.utils.TMLContext.java

License:Open Source License

/**
 * Returns a singleton list with the meta values highlighted (surrounded with the given <prefix> and <suffix>), based upon the last Lucene query with the highlight attribute set to true.
 * If highlighting is not possible, this method returns metalist(<name>).
 * @param name
 * @param prefix
 * @param suffix
 * @param encode
 * @return list 
 * @throws WGAPIException
 */
public List highlightMeta(String name, String prefix, String suffix, String encode) throws WGAPIException {
    // NOTE(review): this excerpt is incomplete - closing braces are missing and two statements
    // (session lookup, createHighlighter call) are cut off.
    if (name == null) {
        return null;

    // The unhighlighted text, wrapped once as the fallback result for every failure path below.
    String originalText = metaTextValue(name, encode);
    List<String> originalTextAsList = Collections.singletonList(originalText);

    if (!getwgacore().isLuceneEnabled()) {
        addwarning("Unable to highlight meta '" + name + "' bc. lucene is not enabled.");
        return originalTextAsList;
    // try to retrieve last lucene query for highlighting
    // NOTE(review): the statement below is truncated - the call made on the session is missing.
    org.apache.lucene.search.Query query = (org.apache.lucene.search.Query) getrequest().getSession()
    if (query == null) {
        // no query in session - highlighting not possible
        return originalTextAsList;

    // create htmlformatter to highlight fragments with "$HIGHLIGHT_PREFIX$", "$HIGHLIGHT_SUFFIX$"
    // these placeholders are later on replaced by the given prefix and suffix
    // this additional step is necessary to encode the fragment text properly
    String prefixPlaceholder = "$HIGHLIGHT_PREFIX$";
    String suffixPlaceholder = "$HIGHLIGHT_SUFFIX$";
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(prefixPlaceholder, suffixPlaceholder);

    // create highlighter
    // the meta name is upper-cased for the highlighter
    // NOTE(review): the statement below is truncated - the remaining argument(s) are missing.
    Highlighter highlighter = getwgacore().getLuceneManager().createHighlighter(name.toUpperCase(), query,

    // create tokenstream
    TokenStream tokenStream = getwgacore().getLuceneManager().createTokenStream(originalText, content());

    // create fragmenter and set fragmentsize to metaText.length to ensure only one fragments with the whole metaText is returned        
    // NOTE(review): `fragmenter` is not handed to the highlighter in the visible lines - a setTextFragmenter call may be missing.
    Fragmenter fragmenter = new SimpleFragmenter(originalText.length() + 1); // +1 is necessary here 

    try {
        String highlighted = highlighter.getBestFragment(tokenStream, originalText);
        if (highlighted != null) {

            // replace highlight placeholders with correct prefix and suffix
            highlighted = WGUtils.strReplace(highlighted, prefixPlaceholder, prefix, true);
            highlighted = WGUtils.strReplace(highlighted, suffixPlaceholder, suffix, true);

            return Collections.singletonList(highlighted);
        } else {
            // no fragment produced - fall back to the original text
            return originalTextAsList;
    } catch (IOException e) {
        addwarning("Unable to highlight meta '" + name + "' bc. of exception '" + e.getMessage() + "'.");
        return originalTextAsList;
    } catch (InvalidTokenOffsetsException e) {
        addwarning("Unable to highlight meta '" + name + "' bc. of exception '" + e.getMessage() + "'.");
        return originalTextAsList;


From source file:de.spartusch.nasfvi.server.NSearcher.java

License:Apache License

/**
 * Extracts a field's values from a document. This method is aware of
 * <i>collapsed</i> or <i>merged</i> fields and handles them properly.
 * @param nquery NQuery used for searching
 * @param doc Document to extract the field's values from
 * @param field Name of the field to extract values for
 * @return Set of extracted values
 */
private Set<String> extractValues(final NQuery nquery, final Document doc, final String field) {
    // NOTE(review): this excerpt is incomplete - closing braces and the bodies of several
    // branches (where `values` and `buffer` are presumably filled) are missing.
    Set<String> values = new HashSet<String>();

    if (NQuery.isFieldToCollapse(field)) {
        // process merged field
        String mfield = NQuery.getMergedField();
        QueryScorer scorer = new QueryScorer(nquery.getQuery(), mfield);
        // The highlighter is used only as a match test: a non-null fragment means the query hit this value.
        Highlighter highlighter = new Highlighter(scorer);
        highlighter.setTextFragmenter(new NullFragmenter());

        try {
            Set<String> buffer = new HashSet<String>();

            for (Fieldable f : doc.getFieldables(mfield)) {
                String content = f.stringValue();
                String value = normalizeValue(NQuery.extractValue(field, content));

                // Test if the field was matched by the query
                TokenStream ts = TokenSources.getTokenStream(mfield, content, nquery.getAnalyzer());
                if (highlighter.getBestFragment(ts, content) != null) {
                    // NOTE(review): branch body missing - presumably records `value` in `values`; confirm.
                } else {
                    // Buffer the value - in case no field matches
                    // NOTE(review): branch body missing - presumably records `value` in `buffer`; confirm.

            if (values.isEmpty()) {
                // No field was matched by the query
                // NOTE(review): branch body missing - presumably falls back to the buffered values; confirm.
        } catch (IOException e) {
            // Checked highlighting failures are rethrown unchecked with the cause preserved.
            throw new RuntimeException(e);
        } catch (InvalidTokenOffsetsException e) {
            throw new RuntimeException(e);
    } else {
        // Field is not collapsed: iterate its stored values directly.
        for (String v : doc.getValues(field)) {
            // NOTE(review): loop body missing from the excerpt.

    return values;

From source file:io.jpress.module.article.searcher.LuceneSearcher.java


/**
 * Walks the hits in {@code topDocs}, loads each document from {@code searcher} and computes
 * highlighted fragments for its title and text.
 * <p>
 * NOTE(review): this excerpt is incomplete - closing braces are missing, and in the visible
 * lines {@code content}, {@code highlightTitle} and {@code highlightContent} are never used
 * and no article is added to the returned list.
 *
 * @param searcher    searcher used to load each hit's document
 * @param topDocs     hits to convert
 * @param highlighter highlighter applied to title and text
 * @param keyword     passed as the first argument of {@code analyzer.tokenStream}
 * @return the {@code articles} list built by this method
 * @throws IOException declared by the signature
 */
private List<Article> toArticleList(IndexSearcher searcher, TopDocs topDocs, Highlighter highlighter,
        String keyword) throws IOException {
    List<Article> articles = new ArrayList<>();
    Analyzer analyzer = new JcsegAnalyzer(JcsegTaskConfig.COMPLEX_MODE);
    for (ScoreDoc item : topDocs.scoreDocs) {
        Document doc = searcher.doc(item.doc);
        Article article = new Article();
        String title = doc.get("title");
        String content = doc.get("content");
        article.setTitle(title);
        try {
            // NOTE(review): other snippets on this page pass a field name as the first tokenStream
            // argument; here the search keyword is passed - confirm this is intended.
            String highlightTitle = highlighter
                    .getBestFragment(analyzer.tokenStream(keyword, new StringReader(title)), title);
            // NOTE(review): the text is read from the freshly created article, on which only the
            // title has been set in the visible lines - a setter call may be missing above.
            String text = article.getText();
            String highlightContent = highlighter
                    .getBestFragment(analyzer.tokenStream(keyword, new StringReader(text)), text);
        } catch (InvalidTokenOffsetsException e) {
            logger.error(e.getMessage(), e);
    return articles;

From source file:lius.search.LiusHitList.java

License:Apache License

/**
 * Builds a {@code LiusHit} for the hit at {@code index}, optionally highlighting the values of
 * the configured display fields.
 * <p>
 * NOTE(review): this excerpt is incomplete - closing braces are missing, one
 * {@code getBestFragment} call is cut off, and in the visible lines {@code liusFieldsList},
 * {@code fieldName} and {@code luceneFragmenter} are never used and nothing is set on the
 * returned {@code liusHit}.
 *
 * @param index position of the hit passed to {@code luceneHits.doc}
 * @return the {@code LiusHit} created at the top of the method
 * @throws IOException declared by the signature
 */
private LiusHit buildLiusHit(int index) throws IOException {

    LiusHit liusHit = new LiusHit();

    Document luceneDocument = luceneHits.doc(index);

    Map liusHitFieldsMap = new HashMap();
    List liusFieldsList = new ArrayList();
    // Stays null when highlighting is disabled in the configuration; checked below before use.
    Highlighter luceneHighlighter = null;

    if (liusConfig.getHighlighter() == true) {
        // NOTE(review): the reader opened here is not closed in the visible lines.
        IndexReader luceneIndexReader = IndexReader.open(indexDirectory);

        // The query is rewritten against the index before being used for scoring.
        Query rewrittenLuceneQuery = luceneQuery.rewrite(luceneIndexReader);
        QueryScorer luceneScorer = new QueryScorer(rewrittenLuceneQuery);

        SimpleHTMLFormatter luceneFormatter = new SimpleHTMLFormatter("<span class=\"liusHit\">", "</span>");
        luceneHighlighter = new Highlighter(luceneFormatter, luceneScorer);
    }

    for (int j = 0; j < liusConfig.getDisplayFields().size(); j++) {
        LiusField configLiusField = (LiusField) liusConfig.getDisplayFields().get(j);
        LiusField hitLiusField = new LiusField();
        String fieldName = configLiusField.getName();


        if (luceneHighlighter != null) {
            // Fragment size comes from the field configuration when present, otherwise Integer.MAX_VALUE.
            Fragmenter luceneFragmenter;
            if (configLiusField.getFragmenter() != null) {
                luceneFragmenter = new SimpleFragmenter(Integer.parseInt(configLiusField.getFragmenter()));
            } else {
                luceneFragmenter = new SimpleFragmenter(Integer.MAX_VALUE);
        String[] luceneDocumentValues = luceneDocument.getValues(configLiusField.getName());
        if (luceneDocumentValues != null) {
            if (luceneHighlighter != null) {
                for (int k = 0; k < luceneDocumentValues.length; k++) {
                    Analyzer luceneAnalyzer = AnalyzerFactory.getAnalyzer(liusConfig);
                    TokenStream luceneTokenStream = luceneAnalyzer.tokenStream(configLiusField.getName(),
                            new StringReader(luceneDocumentValues[k]));
                    String fragment = null;
                    // With a configured fragmenter: up to 5 fragments joined by "..."; otherwise a single best fragment.
                    if (configLiusField.getFragmenter() != null)
                        fragment = luceneHighlighter.getBestFragments(luceneTokenStream,
                                luceneDocumentValues[k], 5, "...");
                    else {
                        // NOTE(review): the call below is truncated - its remaining arguments are missing.
                        fragment = luceneHighlighter.getBestFragment(luceneTokenStream,

                    if (fragment == null) {
                        // no fragment: the stored value is left as it is
                    } else {
                        // overwrite the stored value with its highlighted form
                        luceneDocumentValues[k] = fragment;


            liusHitFieldsMap.put(configLiusField.getName(), hitLiusField);

    return liusHit;

From source file:lucandra.LucandraTests.java

License:Apache License

/**
 * Searches for the exact phrase "foobar foobar" in the "key" field, expects exactly one hit,
 * and checks that highlighting that hit's token stream yields {@code highlightedText}.
 * <p>
 * {@code indexName}, {@code client}, {@code analyzer}, {@code text} and
 * {@code highlightedText} are declared outside this method.
 * NOTE(review): the method's closing brace is missing from this excerpt.
 *
 * @throws Exception declared by the signature
 */
public void testHighlight() throws Exception {

    // This tests the TermPositionVector classes

    IndexReader indexReader = new IndexReader(indexName, client);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "key", analyzer);

    // check exact
    Query q = qp.parse("+key:\"foobar foobar\"");
    TopDocs docs = searcher.search(q, 10);
    assertEquals(1, docs.totalHits);

    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
    QueryScorer scorer = new QueryScorer(q, "key", text);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    // A single fragment of size Integer.MAX_VALUE is requested - presumably so the whole text comes back as one fragment.
    highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));

    // The token stream is obtained from the index reader for the single hit's "key" field.
    TokenStream tvStream = TokenSources.getTokenStream(indexReader, docs.scoreDocs[0].doc, "key");

    String rv = highlighter.getBestFragment(tvStream, text);

    // NOTE(review): if this is JUnit's assertEquals, the convention is (expected, actual);
    // the arguments look swapped here, which would only affect the failure message.
    assertEquals(rv, highlightedText);

From source file:net.paoding.analysis.TestPaodingAnalyzer.java

License:Apache License

/**
 * Parses the query "domnick" with a {@code PaodingAnalyzer}, highlights a sample text with it,
 * and asserts that the result contains {@code <B>Domnick</B>}.
 * <p>
 * NOTE(review): the method's closing brace is missing from this excerpt.
 *
 * @throws Exception declared by the signature
 */
public void testHighlighting() throws Exception {

    Analyzer a = new PaodingAnalyzer();
    QueryParser parser = new QueryParser(Version.LUCENE_46, "f", a);

    // Lower-case query against text that contains the capitalised form "Domnick".
    Query q = parser.parse("domnick");
    String txt = "Domnick Hunter 0.01m , ?OIL-X Plus";

    // No formatter is supplied; the assertion below expects matches wrapped in <B>...</B>,
    // presumably the Highlighter's default markup - confirm.
    Highlighter highlighter = new Highlighter(new QueryScorer(q));
    String resp = highlighter.getBestFragment(a.tokenStream("f", txt), txt);

    // NOTE(review): other snippets on this page null-check the getBestFragment result;
    // here a null resp would make resp.contains throw instead of failing the assertion.
    assertTrue(resp + " is not correctly highlighted", resp.contains("<B>Domnick</B>"));
