Example usage for org.apache.lucene.search.highlight Highlighter getBestFragments

List of usage examples for org.apache.lucene.search.highlight Highlighter getBestFragments

Introduction

In this page you can find the example usage for org.apache.lucene.search.highlight Highlighter getBestFragments.

Prototype

public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments,
        String separator) throws IOException, InvalidTokenOffsetsException 

Source Link

Document

Highlights terms in the text, extracting the most relevant sections and concatenating the chosen fragments with a separator (typically "...").

Usage

From source file:com.gitblit.LuceneExecutor.java

License:Apache License

/**
 * /*www .  j  av  a 2 s.  co  m*/
 * @param analyzer
 * @param query
 * @param content
 * @param result
 * @return
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
private String getHighlightedFragment(Analyzer analyzer, Query query, String content, SearchResult result)
        throws IOException, InvalidTokenOffsetsException {
    if (content == null) {
        content = "";
    }

    int fragmentLength = SearchObjectType.commit == result.type ? 512 : 150;

    QueryScorer scorer = new QueryScorer(query, "content");
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, fragmentLength);

    // use an artificial delimiter for the token
    String termTag = "!!--[";
    String termTagEnd = "]--!!";
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(termTag, termTagEnd);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(fragmenter);

    String[] fragments = highlighter.getBestFragments(analyzer, "content", content, 3);
    if (ArrayUtils.isEmpty(fragments)) {
        if (SearchObjectType.blob == result.type) {
            return "";
        }
        // clip commit message
        String fragment = content;
        if (fragment.length() > fragmentLength) {
            fragment = fragment.substring(0, fragmentLength) + "...";
        }
        return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true) + "</pre>";
    }

    // make sure we have unique fragments
    Set<String> uniqueFragments = new LinkedHashSet<String>();
    for (String fragment : fragments) {
        uniqueFragments.add(fragment);
    }
    fragments = uniqueFragments.toArray(new String[uniqueFragments.size()]);

    StringBuilder sb = new StringBuilder();
    for (int i = 0, len = fragments.length; i < len; i++) {
        String fragment = fragments[i];
        String tag = "<pre class=\"text\">";

        // resurrect the raw fragment from removing the artificial delimiters
        String raw = fragment.replace(termTag, "").replace(termTagEnd, "");

        // determine position of the raw fragment in the content
        int pos = content.indexOf(raw);

        // restore complete first line of fragment
        int c = pos;
        while (c > 0) {
            c--;
            if (content.charAt(c) == '\n') {
                break;
            }
        }
        if (c > 0) {
            // inject leading chunk of first fragment line
            fragment = content.substring(c + 1, pos) + fragment;
        }

        if (SearchObjectType.blob == result.type) {
            // count lines as offset into the content for this fragment
            int line = Math.max(1, StringUtils.countLines(content.substring(0, pos)));

            // create fragment tag with line number and language
            String lang = "";
            String ext = StringUtils.getFileExtension(result.path).toLowerCase();
            if (!StringUtils.isEmpty(ext)) {
                // maintain leading space!
                lang = " lang-" + ext;
            }
            tag = MessageFormat.format("<pre class=\"prettyprint linenums:{0,number,0}{1}\">", line, lang);

        }

        sb.append(tag);

        // replace the artificial delimiter with html tags
        String html = StringUtils.escapeForHtml(fragment, false);
        html = html.replace(termTag, "<span class=\"highlight\">").replace(termTagEnd, "</span>");
        sb.append(html);
        sb.append("</pre>");
        if (i < len - 1) {
            sb.append("<span class=\"ellipses\">...</span><br/>");
        }
    }
    return sb.toString();
}

From source file:com.gitblit.service.LuceneService.java

License:Apache License

/**
 * Builds a highlighted HTML fragment for a search hit by re-analyzing the
 * content and wrapping query matches in highlight spans.
 *
 * @param analyzer analyzer used to re-tokenize the content for highlighting
 * @param query the query whose terms are highlighted
 * @param content the raw document content; null is treated as empty
 * @param result the search result whose type and path drive the formatting
 * @return an HTML snippet, or "" for a blob with no highlightable fragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
private String getHighlightedFragment(Analyzer analyzer, Query query, String content, SearchResult result)
        throws IOException, InvalidTokenOffsetsException {
    if (content == null) {
        content = "";
    }

    int tabLength = storedSettings.getInteger(Keys.web.tabLength, 4);
    // commit messages get a longer excerpt than blob fragments
    int fragmentLength = SearchObjectType.commit == result.type ? 512 : 150;

    QueryScorer scorer = new QueryScorer(query, "content");
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, fragmentLength);

    // use an artificial delimiter for the token
    String termTag = "!!--[";
    String termTagEnd = "]--!!";
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(termTag, termTagEnd);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(fragmenter);

    String[] fragments = highlighter.getBestFragments(analyzer, "content", content, 3);
    if (ArrayUtils.isEmpty(fragments)) {
        if (SearchObjectType.blob == result.type) {
            return "";
        }
        // no highlightable fragments: fall back to a clipped commit message
        String fragment = content;
        if (fragment.length() > fragmentLength) {
            fragment = fragment.substring(0, fragmentLength) + "...";
        }
        return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true, tabLength) + "</pre>";
    }

    // make sure we have unique fragments
    Set<String> uniqueFragments = new LinkedHashSet<String>();
    for (String fragment : fragments) {
        uniqueFragments.add(fragment);
    }
    fragments = uniqueFragments.toArray(new String[uniqueFragments.size()]);

    StringBuilder sb = new StringBuilder();
    for (int i = 0, len = fragments.length; i < len; i++) {
        String fragment = fragments[i];
        String tag = "<pre class=\"text\">";

        // resurrect the raw fragment by removing the artificial delimiters
        String raw = fragment.replace(termTag, "").replace(termTagEnd, "");

        // determine position of the raw fragment in the content;
        // this can be -1 when the analyzer altered the token text,
        // so all uses of pos below must tolerate a missing match
        int pos = content.indexOf(raw);

        if (pos >= 0) {
            // restore complete first line of fragment
            int c = pos;
            while (c > 0) {
                c--;
                if (content.charAt(c) == '\n') {
                    break;
                }
            }
            if (c > 0) {
                // inject leading chunk of first fragment line
                fragment = content.substring(c + 1, pos) + fragment;
            }
        }

        if (SearchObjectType.blob == result.type) {
            // count lines as offset into the content for this fragment;
            // default to line 1 when the fragment could not be located
            int line = 1;
            if (pos >= 0) {
                line = Math.max(1, StringUtils.countLines(content.substring(0, pos)));
            }

            // create fragment tag with line number and language
            String lang = "";
            String ext = StringUtils.getFileExtension(result.path).toLowerCase();
            if (!StringUtils.isEmpty(ext)) {
                // maintain leading space!
                lang = " lang-" + ext;
            }
            tag = MessageFormat.format("<pre class=\"prettyprint linenums:{0,number,0}{1}\">", line, lang);

        }

        sb.append(tag);

        // replace the artificial delimiter with html tags
        // NOTE(review): tabLength is applied only in the clipped-commit branch
        // above, not here — verify against the escapeForHtml overloads whether
        // the fragment path should also expand tabs
        String html = StringUtils.escapeForHtml(fragment, false);
        html = html.replace(termTag, "<span class=\"highlight\">").replace(termTagEnd, "</span>");
        sb.append(html);
        sb.append("</pre>");
        if (i < len - 1) {
            sb.append("<span class=\"ellipses\">...</span><br/>");
        }
    }
    return sb.toString();
}

From source file:com.knowledgetree.indexer.IndexerManager.java

/**
 * Returns a set of hits from lucene./*from   ww w.j  ava2 s  .c o  m*/
 * @param queryString
 * @param maxHits
 * @return
 * @throws Exception
 */
public QueryHit[] query(String queryString, int maxHits, boolean getText) throws Exception {
    synchronized (this) {
        this.queryCount++;
    }

    String tmp = queryString.toLowerCase();
    boolean queryContent = tmp.indexOf("content") != -1;
    boolean queryDiscussion = tmp.indexOf("discussion") != -1;

    QueryParser parser = new QueryParser("Content", this.analyzer);
    Query query = parser.parse(queryString);

    // rewriting is important for complex queries. this is a must-do according to sources!
    query = query.rewrite(this.queryReader);

    // run the search!
    Hits hits = this.querySearcher.search(query);

    // now we can apply the maximum hits to the results we return!
    int max = (maxHits == -1) ? hits.length() : maxHits;

    if (hits.length() < max) {
        max = hits.length();
    }

    QueryHit[] results = new QueryHit[max];

    Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(this.resultFragmentSize));
    for (int i = 0; i < max; i++) {
        Document doc = hits.doc(i);

        QueryHit hit = new QueryHit();
        hit.DocumentID = IndexerManager.stringToLong(doc.get("DocumentID"));
        hit.Rank = hits.score(i);
        hit.Title = doc.get("Title");
        if (getText) {
            String text = "";
            if (queryContent) {
                text += doc.get("Content");
            }
            if (queryDiscussion) {
                text += doc.get("Discussion");
            }

            // TODO: we can create a field.getReader(). the fragmenting needs to
            // be updated to deal with the reader only. would prefer not having to
            // load the document into a string!
            TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(text));

            hit.Content = highlighter.getBestFragments(tokenStream, text, this.resultFragments,
                    this.resultSeperator);
        } else {
            hit.Content = "";
        }

        hit.Version = doc.get("Version");

        results[i] = hit;
    }

    return results;
}

From source file:com.leavesfly.lia.tool.HighlightIt.java

License:Apache License

/**
 * Demonstrates lucene highlighting: searches a fixed query against the
 * example text, wraps matches in highlight spans, and writes the result
 * as a small HTML page to the given output file.
 *
 * @param args one argument: the output filename
 * @throws Exception on parse or I/O failure
 */
public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        System.err.println("Usage: HighlightIt <filename-out>");
        System.exit(-1);
    }

    String filename = args[0];

    String searchText = "term"; // #1
    QueryParser parser = new QueryParser(Version.LUCENE_30, // #1
            "f", // #1
            new StandardAnalyzer(Version.LUCENE_30));// #1
    Query query = parser.parse(searchText); // #1

    SimpleHTMLFormatter formatter = // #2
            new SimpleHTMLFormatter("<span class=\"highlight\">", // #2
                    "</span>"); // #2

    TokenStream tokens = new StandardAnalyzer(Version.LUCENE_30) // #3
            .tokenStream("f", new StringReader(text)); // #3

    QueryScorer scorer = new QueryScorer(query, "f"); // #4

    Highlighter highlighter = new Highlighter(formatter, scorer); // #5
    highlighter.setTextFragmenter( // #6
            new SimpleSpanFragmenter(scorer)); // #6

    String result = // #7
            highlighter.getBestFragments(tokens, text, 3, "..."); // #7

    // try-with-resources ensures the writer is closed (and the file
    // flushed) even if one of the writes throws
    try (FileWriter writer = new FileWriter(filename)) { // #8
        writer.write("<html>"); // #8
        writer.write("<style>\n" + // #8
                ".highlight {\n" + // #8
                " background: yellow;\n" + // #8
                "}\n" + // #8
                "</style>"); // #8
        writer.write("<body>"); // #8
        writer.write(result); // #8
        writer.write("</body></html>"); // #8
    }
}

From source file:com.liferay.portal.search.lucene.LuceneHelperImpl.java

License:Open Source License

/**
 * Extracts a highlighted snippet of {@code s} for the given query, wrapping
 * each matched term in {@code preTag}/{@code postTag}.
 *
 * @param query the query whose terms are highlighted
 * @param field the indexed field the query targets
 * @param s the stored text to fragment and highlight
 * @param maxNumFragments maximum number of fragments to concatenate
 * @param fragmentLength target size of each fragment
 * @param fragmentSuffix separator/suffix appended between and after fragments
 * @param preTag opening highlight tag
 * @param postTag closing highlight tag
 * @return the highlighted snippet, suffixed with {@code fragmentSuffix}
 * @throws IOException on analysis failure or invalid token offsets
 */
public String getSnippet(Query query, String field, String s, int maxNumFragments, int fragmentLength,
        String fragmentSuffix, String preTag, String postTag) throws IOException {

    SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(preTag, postTag);

    QueryScorer queryScorer = new QueryScorer(query, field);

    Highlighter highlighter = new Highlighter(simpleHTMLFormatter, queryScorer);

    highlighter.setTextFragmenter(new SimpleFragmenter(fragmentLength));

    TokenStream tokenStream = getAnalyzer().tokenStream(field, new UnsyncStringReader(s));

    try {
        String snippet = highlighter.getBestFragments(tokenStream, s, maxNumFragments, fragmentSuffix);

        // ensure the snippet always ends with the suffix so callers can
        // render a consistent "..." style trailer
        if (Validator.isNotNull(snippet) && !StringUtil.endsWith(snippet, fragmentSuffix)) {

            snippet = snippet.concat(fragmentSuffix);
        }

        return snippet;
    } catch (InvalidTokenOffsetsException itoe) {
        // chain the cause instead of flattening it to a message so the
        // original stack trace is preserved for diagnosis
        throw new IOException(itoe);
    }
}

From source file:com.liferay.portal.search.lucene31.LuceneHelperImpl.java

License:Open Source License

/**
 * Extracts a highlighted snippet of {@code s} for the given query, wrapping
 * each matched term in {@code preTag}/{@code postTag}.
 *
 * @param query the query whose terms are highlighted
 * @param field the indexed field the query targets
 * @param s the stored text to fragment and highlight
 * @param maxNumFragments maximum number of fragments to concatenate
 * @param fragmentLength target size of each fragment
 * @param fragmentSuffix separator/suffix appended between and after fragments
 * @param preTag opening highlight tag
 * @param postTag closing highlight tag
 * @return the highlighted snippet, suffixed with {@code fragmentSuffix}
 * @throws IOException on analysis failure or invalid token offsets
 */
public String getSnippet(Query query, String field, String s, int maxNumFragments, int fragmentLength,
        String fragmentSuffix, String preTag, String postTag) throws IOException {

    SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(preTag, postTag);

    QueryScorer queryScorer = new QueryScorer(query, field);

    Highlighter highlighter = new Highlighter(simpleHTMLFormatter, queryScorer);

    highlighter.setTextFragmenter(new SimpleFragmenter(fragmentLength));

    TokenStream tokenStream = getAnalyzer().tokenStream(field, new UnsyncStringReader(s));

    try {
        String snippet = highlighter.getBestFragments(tokenStream, s, maxNumFragments, fragmentSuffix);

        // ensure the snippet always ends with the suffix so callers can
        // render a consistent "..." style trailer
        if (Validator.isNotNull(snippet) && !StringUtil.endsWith(snippet, fragmentSuffix)) {

            snippet = snippet + fragmentSuffix;
        }

        return snippet;
    } catch (InvalidTokenOffsetsException itoe) {
        // chain the cause instead of flattening it to a message so the
        // original stack trace is preserved for diagnosis
        throw new IOException(itoe);
    }
}

From source file:com.mathworks.xzheng.tools.HighlightIt.java

License:Apache License

/**
 * Demonstrates lucene highlighting: searches a fixed query against the
 * example text, wraps matches in highlight spans, and writes the result
 * as a small HTML page to the given output file.
 *
 * @param args one argument: the output filename
 * @throws Exception on parse or I/O failure
 */
public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        System.err.println("Usage: HighlightIt <filename-out>");
        System.exit(-1);
    }

    String filename = args[0];

    String searchText = "term"; // #1
    QueryParser parser = new QueryParser(Version.LUCENE_46, // #1
            "f", // #1
            new StandardAnalyzer(Version.LUCENE_46));// #1
    Query query = parser.parse(searchText); // #1

    SimpleHTMLFormatter formatter = // #2
            new SimpleHTMLFormatter("<span class=\"highlight\">", // #2
                    "</span>"); // #2

    TokenStream tokens = new StandardAnalyzer(Version.LUCENE_46) // #3
            .tokenStream("f", new StringReader(text)); // #3

    QueryScorer scorer = new QueryScorer(query, "f"); // #4

    Highlighter highlighter = new Highlighter(formatter, scorer); // #5
    highlighter.setTextFragmenter( // #6
            new SimpleSpanFragmenter(scorer)); // #6

    String result = // #7
            highlighter.getBestFragments(tokens, text, 3, "..."); // #7

    // try-with-resources ensures the writer is closed (and the file
    // flushed) even if one of the writes throws
    try (FileWriter writer = new FileWriter(filename)) { // #8
        writer.write("<html>"); // #8
        writer.write("<style>\n" + // #8
                ".highlight {\n" + // #8
                " background: yellow;\n" + // #8
                "}\n" + // #8
                "</style>"); // #8
        writer.write("<body>"); // #8
        writer.write(result); // #8
        writer.write("</body></html>"); // #8
    }
}

From source file:com.meltmedia.cadmium.search.SearchService.java

License:Apache License

/**
 * Runs a lucene search for {@code query}, optionally restricted to documents
 * whose path starts with {@code path}, and builds a result map containing the
 * hit count ("number-hits") and a list of per-document results ("results"),
 * each with excerpt, score, title, and path.
 *
 * @param query the user-supplied search text; special characters are escaped
 *              so the input is treated literally
 * @param path optional path prefix filter; ignored when blank
 * @return map with keys "number-hits" (int) and "results" (list of maps)
 * @throws Exception if the underlying search template fails
 */
private Map<String, Object> buildSearchResults(final String query, final String path) throws Exception {
    logger.info("Running search for [{}]", query);
    final Map<String, Object> resultMap = new LinkedHashMap<String, Object>();

    new SearchTemplate(provider) {
        public void doSearch(IndexSearcher index) throws IOException, ParseException {
            QueryParser parser = createParser(getAnalyzer());

            // defaults; overwritten below when the search actually runs
            resultMap.put("number-hits", 0);

            List<Map<String, Object>> resultList = new ArrayList<Map<String, Object>>();

            resultMap.put("results", resultList);

            if (index != null && parser != null) {
                // escape special query characters so user input is literal
                String literalQuery = query.replaceAll(ALLOWED_CHARS_PATTERN, "\\\\$1");
                Query query1 = parser.parse(literalQuery);
                if (StringUtils.isNotBlank(path)) {
                    // AND a path-prefix restriction onto the user query
                    Query pathPrefix = new PrefixQuery(new Term("path", path));
                    BooleanQuery boolQuery = new BooleanQuery();
                    boolQuery.add(pathPrefix, Occur.MUST);
                    boolQuery.add(query1, Occur.MUST);
                    query1 = boolQuery;
                }
                TopDocs results = index.search(query1, null, 100000);
                QueryScorer scorer = new QueryScorer(query1);
                Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
                        scorer);

                logger.info("Search returned {} hits.", results.totalHits);
                resultMap.put("number-hits", results.totalHits);

                for (ScoreDoc doc : results.scoreDocs) {
                    Document document = index.doc(doc.doc);
                    String content = document.get("content");
                    String title = document.get("title");

                    Map<String, Object> result = new LinkedHashMap<String, Object>();
                    String excerpt = "";

                    // best-effort excerpt: prefer content, fall back to the
                    // title, then to an empty excerpt — a highlighting failure
                    // must never fail the whole search
                    try {
                        excerpt = highlighter.getBestFragments(
                                parser.getAnalyzer().tokenStream(null, new StringReader(content)), content, 3,
                                "...");
                        excerpt = fixExcerpt(excerpt);

                        result.put("excerpt", excerpt);
                    } catch (Exception e) {
                        logger.debug("Failed to get search excerpt from content.", e);

                        try {
                            excerpt = highlighter.getBestFragments(
                                    parser.getAnalyzer().tokenStream(null, new StringReader(title)), title, 1,
                                    "...");
                            excerpt = fixExcerpt(excerpt);

                            result.put("excerpt", excerpt);
                        } catch (Exception e1) {
                            logger.debug("Failed to get search excerpt from title.", e1);

                            result.put("excerpt", "");
                        }
                    }

                    result.put("score", doc.score);
                    result.put("title", title);
                    result.put("path", document.get("path"));

                    resultList.add(result);
                }
            }

        }
    }.search();

    return resultMap;
}

From source file:com.recomdata.search.Finder.java

License:Open Source License

/**
 * Prints the stored fields of a search hit to stdout, including a
 * bold-highlighted excerpt of the document contents.
 *
 * @param doc the matched lucene document
 * @param id the document id (currently unused in the output)
 * @param score the hit score (currently unused in the output)
 * @param query the query used to score/highlight fragments
 * @param analyzer analyzer used to re-tokenize the stored contents
 */
private void display(Document doc, int id, float score, Query query, Analyzer analyzer) {

    System.out.println("repository = " + doc.get("repository"));
    System.out.println("path       = " + doc.get("path"));
    System.out.println("extension  = " + doc.get("extension"));
    System.out.println("title      = " + doc.get("title"));

    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"),
            new QueryScorer(query, "contents"));
    highlighter.setTextFragmenter(new SimpleFragmenter(50));
    String summary = doc.get("contents");
    TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(summary));
    try {
        System.out.println("contents   = " + highlighter.getBestFragments(tokenStream, summary, 5, "..."));
    } catch (IOException | InvalidTokenOffsetsException e) {
        // getBestFragments declares both checked exceptions; catch both so
        // neither escapes this non-throwing method
        System.out.println("exception: " + e.getMessage());
    }

    System.out.println();
}

From source file:com.taobao.common.tedis.support.lucene.analysis.xanalyzer.TestHighLight.java

License:Open Source License

/**
 * Demonstrates lucene highlighting end-to-end: indexes a single document
 * into an in-memory directory, runs a query against it, and prints
 * bold-formatted highlighted fragments for each hit.
 *
 * @param args unused
 */
public static void main(String[] args) {

    Directory ramDir = new RAMDirectory();
    try {
        // index one document with positions+offsets term vectors so the
        // highlighter can rebuild a token stream from the index
        IndexWriter writer = new IndexWriter(ramDir, /*
                                                      * new
                                                      * StandardAnalyzer()/
                                                      */XFactory.getWriterAnalyzer());
        Document doc = new Document();
        Field fd = new Field(FIELD_NAME, CONTENT, Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS);
        doc.add(fd);
        writer.addDocument(doc);
        writer.optimize();
        writer.close();

        IndexReader reader = IndexReader.open(ramDir);
        String queryString = QUERY;
        QueryParser parser = new QueryParser(FIELD_NAME, /*
                                                          * new
                                                          * StandardAnalyzer
                                                          * ()/
                                                          */XFactory.getWriterAnalyzer());
        Query query = parser.parse(queryString);
        System.out.println(query);
        Searcher searcher = new IndexSearcher(ramDir);
        // rewrite expands multi-term queries so the scorer sees concrete terms
        query = query.rewrite(reader);
        System.out.println(query);
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        Hits hits = searcher.search(query);

        BoldFormatter formatter = new BoldFormatter();
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(50));
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            int maxNumFragmentsRequired = 5;
            String fragmentSeparator = "...";
            // rebuild the token stream from the stored term vector instead
            // of re-analyzing the text
            TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(hits.id(i), FIELD_NAME);
            TokenStream tokenStream = TokenSources.getTokenStream(tpv);
            String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
                    fragmentSeparator);
            System.out.println("\n" + result);
        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}