Example usage for org.apache.lucene.search.highlight Highlighter setTextFragmenter


public void setTextFragmenter(Fragmenter fragmenter) 

From source file:lius.search.LiusHitList.java

License:Apache License

/**
 * Builds the {@code LiusHit} for the Lucene hit at position {@code index}.
 *
 * <p>NOTE(review): this listing looks truncated -- several closing braces and the
 * tail of at least one statement are missing, so it does not compile as shown.
 * The comments below describe only what the visible lines do.
 *
 * @param index position of the hit inside {@code luceneHits}
 * @return the {@code LiusHit} instance created at the top of the method
 * @throws IOException declared by the method; presumably raised by the Lucene index calls
 */
private LiusHit buildLiusHit(int index) throws IOException {

    LiusHit liusHit = new LiusHit();

    Document luceneDocument = luceneHits.doc(index);

    Map liusHitFieldsMap = new HashMap();
    List liusFieldsList = new ArrayList();
    // Stays null when highlighting is disabled; later code tests it for null.
    Highlighter luceneHighlighter = null;

    if (liusConfig.getHighlighter() == true) {
        // NOTE(review): no close() of this reader is visible in the listing -- confirm it is released.
        IndexReader luceneIndexReader = IndexReader.open(indexDirectory);

        // The scorer is built from the query rewritten against the opened reader.
        Query rewrittenLuceneQuery = luceneQuery.rewrite(luceneIndexReader);
        QueryScorer luceneScorer = new QueryScorer(rewrittenLuceneQuery);

        SimpleHTMLFormatter luceneFormatter = new SimpleHTMLFormatter("<span class=\"liusHit\">", "</span>");
        luceneHighlighter = new Highlighter(luceneFormatter, luceneScorer);
    }

    // One pass per configured display field.
    for (int j = 0; j < liusConfig.getDisplayFields().size(); j++) {
        LiusField configLiusField = (LiusField) liusConfig.getDisplayFields().get(j);
        LiusField hitLiusField = new LiusField();
        String fieldName = configLiusField.getName();


        if (luceneHighlighter != null) {
            // Fragment size comes from the field configuration when present,
            // otherwise Integer.MAX_VALUE is used as the size.
            // NOTE(review): the visible lines never pass luceneFragmenter to
            // luceneHighlighter.setTextFragmenter(..) -- that call may have been lost.
            Fragmenter luceneFragmenter;
            if (configLiusField.getFragmenter() != null) {
                luceneFragmenter = new SimpleFragmenter(Integer.parseInt(configLiusField.getFragmenter()));
            } else {
                luceneFragmenter = new SimpleFragmenter(Integer.MAX_VALUE);
        String[] luceneDocumentValues = luceneDocument.getValues(configLiusField.getName());
        if (luceneDocumentValues != null) {
            if (luceneHighlighter != null) {
                // Each stored value is re-analyzed and replaced by its highlighted
                // fragment when the highlighter produces one.
                for (int k = 0; k < luceneDocumentValues.length; k++) {
                    Analyzer luceneAnalyzer = AnalyzerFactory.getAnalyzer(liusConfig);
                    TokenStream luceneTokenStream = luceneAnalyzer.tokenStream(configLiusField.getName(),
                            new StringReader(luceneDocumentValues[k]));
                    String fragment = null;
                    if (configLiusField.getFragmenter() != null)
                        fragment = luceneHighlighter.getBestFragments(luceneTokenStream,
                                luceneDocumentValues[k], 5, "...");
                    else {
                        fragment = luceneHighlighter.getBestFragment(luceneTokenStream,

                    if (fragment == null) {
                    } else {
                        luceneDocumentValues[k] = fragment;


            liusHitFieldsMap.put(configLiusField.getName(), hitLiusField);

    return liusHit;

From source file:lucandra.LucandraTests.java

License:Apache License

public void testHighlight() throws Exception {

    // This tests the TermPositionVector classes

    IndexReader indexReader = new IndexReader(indexName, client);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "key", analyzer);

    // check exact
    Query q = qp.parse("+key:\"foobar foobar\"");
    TopDocs docs = searcher.search(q, 10);
    assertEquals(1, docs.totalHits);// ww  w . j  a  va 2  s.  co m

    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
    QueryScorer scorer = new QueryScorer(q, "key", text);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));

    TokenStream tvStream = TokenSources.getTokenStream(indexReader, docs.scoreDocs[0].doc, "key");

    String rv = highlighter.getBestFragment(tvStream, text);

    assertEquals(rv, highlightedText);

From source file:natural.language.qa.LuceneSearch.java

License:Apache License

public List<LuceneSearchResult> search(String queryString, int maxRes) throws Exception {
    IndexSearcher searcher = null;//from  w ww . ja  va  2s  .c  o m
    List<LuceneSearchResult> results = new ArrayList<LuceneSearchResult>();
    try {
        Properties indexConf = new Properties();
        FileInputStream fis = new FileInputStream("index.properties");

        String index = indexConf.getProperty("index");
        String field = "contents";

        Directory indexDir = FSDirectory.open(new File(index));

        searcher = new IndexSearcher(indexDir);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

        QueryParser parser = new QueryParser(Version.LUCENE_31, field, analyzer);

        queryString = queryString.trim();
        if (queryString.length() == 0) {
            return results;

        Query query = parser.parse(queryString);
        System.out.println("Searching for: " + query.toString(field));

        // ================================================
        Formatter f = new SimpleHTMLFormatter("", "");
        Encoder e = new DefaultEncoder();
        QueryScorer fs = new QueryScorer(query);
        Fragmenter fragmenter = new SimpleSpanFragmenter(fs, 50);// new SentenceFragmenter();
        Highlighter h = new Highlighter(f, e, fs);

        // ================================================

        // Collect docs
        TopDocs res = searcher.search(query, maxRes);
        int numTotalHits = res.totalHits;
        ScoreDoc[] scoreDocs = res.scoreDocs;

        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String path = doc.get("path");
            String content = readDocument(path);
            String bestFragment = h.getBestFragment(analyzer, field, content);
            String frag = bestFragment;
            LuceneSearchResult hit = new LuceneSearchResult(scoreDoc.doc, path, frag);
        System.out.println(numTotalHits + " total matching documents");
    } finally {
        if (searcher != null) {
    return results;

From source file:net.chwise.documents.HighlightedFragmentsRetriever.java

License:Open Source License

public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String fieldName,
        String fieldContents, int fragmentNumber, int fragmentSize)
        throws IOException, InvalidTokenOffsetsException {

    TokenStream stream = TokenSources.getTokenStream(fieldName, fieldContents, analyzer);
    QueryScorer scorer = new QueryScorer(query, fieldName);
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, fragmentSize);

    Highlighter highlighter = new Highlighter(scorer);

    String[] fragments = highlighter.getBestFragments(stream, fieldContents, fragmentNumber);

    if (fragments.length == 0) {
        //Return starting piece of fieldContents fragment
        fragments = new String[1];
        fragments[0] = fieldContents.substring(0, Math.min(fragmentSize, fieldContents.length()));
    }//from www. j ava2  s . c  o  m

    return fragments;

From source file:net.sourceforge.docfetcher.model.search.HighlightService.java

License:Open Source License

@NotNull//from ww w.j av  a  2 s. c  o  m
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> ranges = new ArrayList<Range>();
     * A formatter is supposed to return formatted text, but since we're
     * only interested in the start and end offsets of the search terms, we
     * return null and store the offsets in a list.
    Formatter nullFormatter = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                Token token = tokenGroup.getToken(i);
                if (tokenGroup.getScore(i) == 0)
                int start = token.startOffset();
                int end = token.endOffset();
                ranges.add(new Range(start, end - start));
            return null;
    String key = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(nullFormatter, new QueryScorer(query, key));
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
         * This has a return value, but we ignore it since we only want the
         * offsets. Might throw an OutOfMemoryError.
        highlighter.getBestFragment(IndexRegistry.getAnalyzer(), key, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
    return ranges;

From source file:net.sourceforge.docfetcher.view.PreviewPanel.java

License:Open Source License

/**
 * Sets the file to be displayed, using <tt>parser</tt> to extract the
 * text from the file on the disk. This method does nothing if the given
 * file is null. The <tt>force</tt> parameter specifies whether the
 * preview should be updated even if neither the file nor the search terms
 * have changed in the meantime.
 */
// NOTE(review): this listing is truncated -- many statements (early returns,
// closing braces, comment delimiters) are missing, so it does not compile as
// shown. Added comments describe only what the visible lines do.
private void setFile(final File file, final Parser parser, final Query query, boolean force) {
    // Remember the previous file and query before overwriting the fields, so the
    // checks below can compare against them.
    File lastFile = this.file;
    Query lastQuery = this.query;
    this.file = file;
    this.parser = parser;
    this.query = query;

    // Check input
    if (file == null)
    if (parser == null) // Allowed to be null if file is null, too
        throw new IllegalArgumentException();
    if (!isActive)
    if (file.equals(lastFile) && !force)
        if (query != null && query.equals(lastQuery))

    if (file.isDirectory())
        throw new IllegalStateException("File expected for preview, got directory instead."); //$NON-NLS-1$
    if (!file.exists()) {

    // Use the HTML browser
    if (file.getAbsolutePath().equals(Const.HELP_FILE) || Pref.Bool.PreviewHTML.getValue()) {
        final BrowserPanel browser = browserProvider.getBrowser(previewPanel, browserToolBar, parser);
        if (browser != null) {
            browser.addProgressListener(new ProgressAdapter() {
                public void completed(ProgressEvent event) {
                    occurrenceCounter.setText("0"); //$NON-NLS-1$
        // Browser creation failed, go on to next code block

    // Use text renderers

    // Use monospace font for text files
    if (parser instanceof TextParser) {
        org.eclipse.swt.graphics.Font monoFont = Font.PREVIEW_MONO.getFont();
        if (!textViewer.getFont().equals(monoFont))
    } else {
        org.eclipse.swt.graphics.Font previewFont = Font.PREVIEW.getFont();
        if (!textViewer.getFont().equals(previewFont))

    textViewer.setText(Msg.loading.value()); // display loading message

    new Thread() { // run in a thread because parsing the file takes some time
        public void run() {
            // Extract the raw text from the file
            String text;
            boolean fileParsed = true;
            try {
                text = parser.renderText(file);
            } catch (ParseException e) {
                text = Msg.cant_read_file.format(e.getMessage());
                fileParsed = false;
            } catch (OutOfMemoryError e) {
                 * We can get here if the user sets a high java heap space
                 * value during indexing and then sets a lower value for
                 * search only usage.
                text = Msg.out_of_jvm_memory.value();
                fileParsed = false;

            // Identity comparison against the field that setFile assigned above.
            if (PreviewPanel.this.file != file)
                return; // Another preview request had been started while we were parsing

             * Create the message that will be displayed if the character limit
             * is reached. It is appended to the file contents later; if it
             * was appended here, some words in it might get highlighted.
            int maxLength = Pref.Int.PreviewLimit.getValue();
            final String msg = "...\n\n\n[" //$NON-NLS-1$
                    + Msg.preview_limit_hint.format(new Object[] { maxLength, Pref.Int.PreviewLimit.name(),
                            Const.USER_PROPERTIES_FILENAME })
                    + "]"; //$NON-NLS-1$
            final boolean exceeded = text.length() > maxLength;
            // NOTE(review): maxLength - msg.length() is negative when the configured
            // limit is shorter than the hint message; substring would then throw -- confirm.
            if (text.length() > maxLength)
                text = text.substring(0, maxLength - msg.length());
            final String fText = text;

             * Create StyleRange ranges (i.e. start-end integer pairs) for
             * search term highlighting. Only tokenize preview text if we're
             * not displaying any info messages and if there are tokens to
             * highlight.
            ranges = new int[0];
            if (fileParsed && query != null) {
                final List<Integer> rangesList = new ArrayList<Integer>();
                Analyzer analyzer = RootScope.analyzer;

                 * A formatter is supposed to return formatted text, but
                 * since we're only interested in the start and end offsets
                 * of the search terms, we return null and store the offsets
                 * in a list.
                Formatter nullFormatter = new Formatter() {
                    public String highlightTerm(String originalText, TokenGroup tokenGroup) {
                        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                            Token token = tokenGroup.getToken(i);
                            if (tokenGroup.getScore(i) == 0)
                            int start = token.startOffset();
                            int end = token.endOffset();
                            // NOTE(review): only the length is stored here, yet the counter
                            // below divides ranges.length by 2 as if entries were pairs;
                            // an add of 'start' may be missing from this listing -- confirm.
                            rangesList.add(end - start);
                        return null;

                Highlighter highlighter = new Highlighter(nullFormatter,
                        new QueryScorer(query, Document.contents));
                highlighter.setTextFragmenter(new NullFragmenter());
                try {
                     * This has a return value, but we ignore it since we
                     * only want the offsets.
                    highlighter.getBestFragment(analyzer, Document.contents, fText);
                } catch (Exception e) {
                    // We can do without the search term highlighting

                // List to array (will be used by the method 'setHighlighting(..)')
                ranges = new int[rangesList.size()];
                for (int i = 0; i < ranges.length; i++)
                    ranges[i] = rangesList.get(i);

            // Parsing and tokenizing done; display the results
            final boolean fFileParsed = fileParsed;
            // The widget updates are handed to the display via syncExec.
            Display.getDefault().syncExec(new Runnable() {
                public void run() {
                    // Enable or disable up and down buttons
                    upBt.setEnabled(ranges.length != 0);
                    downBt.setEnabled(ranges.length != 0);

                    setHighlighting(fFileParsed && Pref.Bool.HighlightSearchTerms.getValue());
                    occurrenceCounter.setText(Integer.toString(ranges.length / 2));
                    if (exceeded)
                        textViewer.append(msg); // character limit exceeded, append hint

From source file:net.sourceforge.vaticanfetcher.model.search.HighlightService.java

License:Open Source License

@NotNull/*from ww w.j  av a  2 s. co  m*/
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> ranges = new ArrayList<Range>();
     * A formatter is supposed to return formatted text, but since we're
     * only interested in the start and end offsets of the search terms, we
     * return null and store the offsets in a list.
    Formatter nullFormatter = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                Token token = tokenGroup.getToken(i);
                if (tokenGroup.getScore(i) == 0)
                int start = token.startOffset();
                int end = token.endOffset();
                ranges.add(new Range(start, end - start));
            return null;
    String key = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(nullFormatter, new QueryScorer(query, key));
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
         * This has a return value, but we ignore it since we only want the
         * offsets. Might throw an OutOfMemoryError.
        highlighter.getBestFragment(IndexRegistry.analyzer, key, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
    return ranges;

From source file:org.apache.jena.query.text.TextIndexLucene.java

License:Apache License

/**
 * Builds a {@code TextHit} for each scored document, using a highlighted
 * rendering of the document's {@code field} value as the hit literal.
 *
 * <p>NOTE(review): this listing is truncated -- one call is cut off mid-argument
 * list and closing braces are missing, so it does not compile as shown.
 *
 * @param sDocs         scored documents to convert
 * @param indexSearcher searcher used to load each stored document
 * @param query         query handed to the highlighter's scorer
 * @param field         stored field whose value is highlighted
 * @param highlight     option string parsed by {@code HighlightOpts}
 * @return the {@code results} list created at the top of the method
 * @throws IOException                  declared; presumably from index or token-stream access
 * @throws InvalidTokenOffsetsException declared; presumably from the highlighter
 */
private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field,
        String highlight) throws IOException, InvalidTokenOffsetsException {
    List<TextHit> results = new ArrayList<>();

    HighlightOpts opts = new HighlightOpts(highlight);

    // One highlighter is configured from the parsed options and reused for every document.
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end);
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize));

    for (ScoreDoc sd : sDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        log.trace("highlightResults[{}]: {}", sd.doc, doc);
        String entity = doc.get(docDef.getEntityField());

        // Stays null when the document has no stored value for 'field'.
        Node literal = null;
        String lexical = doc.get(field);
        if (lexical != null) {
            String docLang = doc.get(docDef.getLangField());
            // NOTE(review): no close() of tokenStream is visible in this listing -- confirm.
            TokenStream tokenStream = analyzer.tokenStream(field, lexical);
            // NOTE(review): the next statement is cut off after 'opts.joinFrags,';
            // its final argument is missing from this listing.
            TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags,
            String rez = frags2string(frags, opts);

            literal = NodeFactory.createLiteral(rez, docLang);

        // The graph node is optional: only resolved when a graph field is configured and stored.
        String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null;
        Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null;

        Node subject = TextQueryFuncs.stringToNode(entity);
        // NOTE(review): 'hit' is never added to 'results' in the visible lines -- likely lost in truncation.
        TextHit hit = new TextHit(subject, sd.score, literal, graph);
    return results;

From source file:org.apache.maven.index.DefaultIteratorResultSet.java

License:Apache License

protected final List<String> getBestFragments(Query query, Formatter formatter, TokenStream tokenStream,
        String text, int maxNumFragments) throws IOException {
    Highlighter highlighter = new Highlighter(formatter, new CleaningEncoder(), new QueryScorer(query));

    highlighter.setTextFragmenter(new OneLineFragmenter());

    maxNumFragments = Math.max(1, maxNumFragments); // sanity check

    TextFragment[] frag;/*from   w  w  w . j a  va  2s.  c  o m*/
    // Get text
    ArrayList<String> fragTexts = new ArrayList<>(maxNumFragments);

    try {
        frag = highlighter.getBestTextFragments(tokenStream, text, false, maxNumFragments);

        for (int i = 0; i < frag.length; i++) {
            if ((frag[i] != null) && (frag[i].getScore() > 0)) {
    } catch (InvalidTokenOffsetsException e) {
        // empty?

    return fragTexts;

From source file:org.apache.nutch.searcher.Summarizer.java

License:Apache License

public static String getsummary(String queryString, String content, Analyzer analyzer) {
    if (queryString == null && content != null) {
        if (content.length() > SUM_LENGTH)
            return content.substring(0, (SUM_LENGTH) - 1);
        else/*  w  w  w.j a  va  2s.  c om*/
            return content;
    } else if (queryString != null && content == null)
        return "";
    else if (queryString == null && content == null)
        return "";
    SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(cssfront, cssend);

    org.apache.lucene.search.Query summarizerQuery = null;
    QueryParser queryParse = new QueryParser("content", analyzer);
    try {
        summarizerQuery = queryParse.parse(queryString);
    } catch (ParseException ex) {
        if (content.length() > SUM_LENGTH)
            return content.substring(0, (SUM_LENGTH) - 1);
            return content;
    QueryScorer qs = new QueryScorer(summarizerQuery);
    Highlighter highlighter = new Highlighter(sHtmlF, qs);
    highlighter.setTextFragmenter(new SimpleFragmenter(SUM_LENGTH));
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
    String str;
    try {
        str = highlighter.getBestFragment(tokenStream, content);
    } catch (IOException e) {
        str = null;
    if (str == null) {
        if (content.length() > SUM_LENGTH)
            str = content.substring(0, (SUM_LENGTH) - 1);
            str = content;
    return str;