Example usage for the org.apache.lucene.search.highlight.Highlighter(Formatter, Scorer) constructor

Introduction

This page collects open-source examples that use the org.apache.lucene.search.highlight.Highlighter(Formatter, Scorer) constructor.

Prototype

public Highlighter(Formatter formatter, Scorer fragmentScorer) 
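
The constructor pairs a Formatter, which renders each matched term, with a fragment Scorer (typically a QueryScorer built from the search query) that rates candidate fragments for relevance. Below is a minimal sketch of typical use; it assumes an Analyzer named analyzer, an open IndexSearcher named searcher, a parsed Query named query, and a TopDocs named hits already exist, and it omits the IOException and InvalidTokenOffsetsException that getBestFragment declares.

// Minimal sketch; `searcher`, `query`, `hits`, and `analyzer` are assumed from context.
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); // wraps matches in <B>...</B> by default
Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));

Document doc = searcher.doc(hits.scoreDocs[0].doc);
String text = doc.get("content"); // the stored field to highlight
TokenStream tokenStream = analyzer.tokenStream("content", text);

// Returns the best-scoring fragment with matches marked up, or null if nothing matched.
String fragment = highlighter.getBestFragment(tokenStream, text);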

Usage

From source file:Main.WebAPI.Search.java

/**
 * @param args args[0] is a query
 * 
 * @throws IOException
 * @throws ParseException
 * @throws InvalidTokenOffsetsException 
 */

public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
    // Open an existing index; the analyzer here must match the one used at index time.
    Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    Directory index = FSDirectory.open(new File("data/indexing"));
    String querystr = args.length > 0 ? args[0] : "mike lab";
    // the "title" arg specifies the default field to use
    // when no field is explicitly specified in the query.
    Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer)
            .parse(querystr);

    // 3. search
    int hitsPerPage = 10;
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);

    TopDocs hits = searcher.search(query, hitsPerPage);

    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
    String preview;
    for (int i = 0; i < Math.min(hitsPerPage, hits.scoreDocs.length); i++) { // guard against fewer than hitsPerPage hits
        int id = hits.scoreDocs[i].doc;
        Document doc = searcher.doc(id);
        String text;
        Preview = "";
        System.out.println(doc.get("url"));
        System.out.println(doc.get("title"));
        text = doc.get("content");
        TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                analyzer);
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "...");
        int k = 0;
        for (TextFragment frag1 : frag) {
            if ((frag1 != null) && (frag1.getScore() > 0)) {
                preview += frag1.toString() + "...<br>";
                k++;
                // Get 2 Line Preview
                if (k >= 2)
                    break;
            }
        }
        System.out.println(preview); // print the assembled two-fragment preview
        System.out.println("-------------");
    }
}

From source file:net.riezebos.thoth.content.search.Searcher.java

License:Apache License

public PagedList<SearchResult> search(Identity identity, String queryExpression, int pageNumber, int pageSize)
        throws SearchException {
    try {
        IndexReader reader = getIndexReader(contentManager);
        IndexSearcher searcher = getIndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        // We may need to restrict the results to books if the user does not have access to fragments:
        AccessManager accessManager = contentManager.getAccessManager();
        boolean booksOnly = !accessManager.hasPermission(identity, "", Permission.READ_FRAGMENTS);
        if (booksOnly) {
            queryExpression = Indexer.INDEX_TYPE + ":" + Indexer.TYPE_DOCUMENT + " AND (" + queryExpression
                    + ")";
        }

        QueryParser parser = new QueryParser(Indexer.INDEX_CONTENTS, analyzer);
        Query query = parser.parse(queryExpression);

        // We add 1 to determine if there is more to be found after the current page
        int maxResults = pageSize * pageNumber + 1;
        TopDocs results = searcher.search(query, maxResults, Sort.RELEVANCE);
        ScoreDoc[] hits = results.scoreDocs;

        boolean hadMore = (hits.length == maxResults);

        List<SearchResult> searchResults = new ArrayList<>();
        int idx = 0;
        for (ScoreDoc scoreDoc : hits) {
            if (searchResults.size() == pageSize)
                break;
            idx++;
            if (idx >= (pageNumber - 1) * pageSize) {
                Document document = searcher.doc(scoreDoc.doc);
                IndexableField field = document.getField(Indexer.INDEX_PATH);
                String documentPath = field.stringValue();
                SearchResult searchResult = new SearchResult();
                searchResult.setIndexNumber((pageNumber - 1) * pageSize + idx);
                searchResult.setDocument(documentPath);

                String type = document.get(Indexer.INDEX_TYPE);
                if (Indexer.TYPE_DOCUMENT.equals(type) || Indexer.TYPE_FRAGMENT.equals(type)) {
                    searchResult.setResource(false);

                    try {
                        MarkDownDocument markDownDocument = contentManager.getMarkDownDocument(documentPath,
                                true, CriticProcessingMode.DO_NOTHING);
                        String contents = markDownDocument.getMarkdown();

                        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
                        Highlighter highlighter = new Highlighter(htmlFormatter,
                                new QueryScorer(query, Indexer.INDEX_CONTENTS));
                        highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

                        TokenStream tokenStream = analyzer.tokenStream(Indexer.INDEX_CONTENTS, contents);

                        TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, contents, false,
                                99999);
                        for (TextFragment frag : frags) {
                            if ((frag != null) && (frag.getScore() > 0)) {
                                String fragmentText = frag.toString();
                                searchResult.addFragment(
                                        new Fragment(ThothCoreUtil.escapeHtmlExcept("B", fragmentText)));
                            }
                        }
                    } catch (FileNotFoundException e) {
                        LOG.warn(
                                "Index contains an invalid file reference; probably need to reindex to get rid of this. File: "
                                        + e.getMessage());
                    }
                } else {
                    searchResult.setResource(true);
                    String extension = ThothUtil.getExtension(documentPath);
                    searchResult.setImage(getConfiguration().isImageExtension(extension));

                    searchResult.addFragment(new Fragment(document.get(Indexer.INDEX_TITLE)));
                }
                searchResults.add(searchResult);
            }
        }
        reader.close();
        linkBooks(searchResults);
        PagedList<SearchResult> pagedList = new PagedList<>(searchResults, hadMore);
        return pagedList;
    } catch (Exception e) {
        throw new SearchException(e);
    }
}

From source file:net.sf.mmm.search.engine.impl.lucene.LuceneSearchHighlighter.java

License:Apache License

/**
 * The constructor.
 *
 * @param searchAnalyzer is the {@link Analyzer} used by the {@link LuceneSearchEngine search-engine}.
 * @param formatter is the formatter used to highlight terms.
 * @param searchQuery is the {@link Query} of the search. Matching terms of this query shall be highlighted.
 */
public LuceneSearchHighlighter(Analyzer searchAnalyzer, Formatter formatter, Query searchQuery) {

    super();
    this.highlighter = new Highlighter(formatter, new QueryScorer(searchQuery));
    this.analyzer = searchAnalyzer;
}
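
The class keeps the Highlighter and Analyzer in fields for later use. As a hedged illustration only (this method is hypothetical, not part of the original class), applying the stored highlighter to a hit's field text typically looks like this:

// Hypothetical helper, for illustration; the method name and fallback behavior are assumptions.
public String highlightField(String fieldName, String text) {
    try {
        TokenStream tokenStream = this.analyzer.tokenStream(fieldName, text);
        String fragment = this.highlighter.getBestFragment(tokenStream, text);
        return fragment != null ? fragment : text; // fall back to raw text when nothing matches
    } catch (IOException | InvalidTokenOffsetsException e) {
        return text; // highlighting is cosmetic; fail soft
    }
}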

From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java

/**
 * Main search method, for internal use.
 *
 * @param q query string
 * @return a list of highlighted string objects.
 * @throws SearchException
 */
private List<SearchResultItem> internalSearch(String q) throws SearchException {
    IndexSearcher is = null;
    try {
        is = new IndexSearcher(zekrIndexReader.indexReader);

        // analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        // resultTokenStream = new StandardTokenizer(Version.LUCENE_CURRENT, reader);

        QueryParser parser = QueryParserFactory.create(Version.LUCENE_CURRENT, QuranTextIndexer.CONTENTS_FIELD,
                analyzer);

        // allow search terms like "*foo" with leading star
        parser.setAllowLeadingWildcard(true);
        // parser.setFuzzyPrefixLength(10);

        // if this rewrite method is not set, the highlighter doesn't work on wildcard queries once query.rewrite() is done,
        // and sorting also doesn't work correctly for wildcard queries.
        parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);

        logger.debug("Parse query.");
        query = parser.parse(q);
        BooleanQuery.setMaxClauseCount(maxClauseCount);

        logger.debug("Rewrite query.");
        query = query.rewrite(zekrIndexReader.indexReader); // required to expand search terms

        logger.debug("Searching for: " + query.toString());
        // Hits hits;
        TopFieldDocs tops = null;
        is.setDefaultFieldSortScoring(true, true);
        if (searchScope != null && searchScope.getScopeItems().size() > 0) {
            String scopeQuery = makeSearchScope();
            logger.debug("Scope is: " + scopeQuery);
            // hits = is.search(query, new QuranRangeFilter(searchScope), sortResultOrder);
            tops = is.search(query, new QuranRangeFilter(searchScope), maxSearchResult, sortResultOrder);

        } else {
            // hits = is.search(query, new QueryWrapperFilter(query), 20, sortResultOrder);
            tops = is.search(query, new QueryWrapperFilter(query), maxSearchResult, sortResultOrder);
        }

        logger.debug("Highlight search result.");
        Highlighter highlighter = new Highlighter(highlightFormatter, new QueryScorer(query));
        // highlighter.setFragmentScorer(new QueryTermScorer(query));

        int total = Math.min(maxSearchResult, tops.totalHits);
        List<SearchResultItem> res = new ArrayList<SearchResultItem>(total);
        ScoreDoc[] sd = tops.scoreDocs; // hoisted out of the loop; the array doesn't change per hit
        for (int i = 0; i < total; i++) {
            Document doc = is.doc(sd[i].doc);
            final String contents = doc.get(QuranTextIndexer.CONTENTS_FIELD);
            final IQuranLocation location = new QuranLocation(doc.get(QuranTextIndexer.LOCATION_FIELD));
            TokenStream tokenStream = analyzer.tokenStream(QuranTextIndexer.CONTENTS_FIELD,
                    new StringReader(contents));

            // String resultStr = highlighter.getBestFragment(tokenStream, contents);
            String resultStr = highlighter.getBestFragments(tokenStream, contents, 100, "...");
            SearchResultItem sri = new SearchResultItem(resultStr, location);
            res.add(sri);
        }
        matchedItemCount = highlightFormatter.getHighlightCount();
        // highlightedTermList = highlightFormatter.getHighlightedTermList();
        return res;
    } catch (Exception e) {
        throw new SearchException(e);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                // ignore: failing to close the searcher is non-fatal at this point
            }
        }
    }
}

From source file:net.skyatlas.icd.test.AnsegTest.java

private String toHighlighter(Analyzer analyzer, Query query, Document doc) throws InvalidTokenOffsetsException {
    String field = "text";
    try {
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        TokenStream tokenStream1 = analyzer.tokenStream("text", new StringReader(doc.get(field)));
        String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field));
        return highlighterStr == null ? doc.get(field) : highlighterStr;
    } catch (IOException e) {
        e.printStackTrace(); // fall through and return null below
    } catch (InvalidTokenOffsetsException e) {
        e.printStackTrace(); // fall through and return null below
    }
    return null;
}

From source file:net.sourceforge.docfetcher.model.search.HighlightService.java

License:Open Source License

@MutableCopy
@NotNull
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> ranges = new ArrayList<Range>();
    /*
     * A formatter is supposed to return formatted text, but since we're
     * only interested in the start and end offsets of the search terms, we
     * return null and store the offsets in a list.
     */
    Formatter nullFormatter = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                Token token = tokenGroup.getToken(i);
                if (tokenGroup.getScore(i) == 0)
                    continue;
                int start = token.startOffset();
                int end = token.endOffset();
                ranges.add(new Range(start, end - start));
            }
            return null;
        }
    };
    String key = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(nullFormatter, new QueryScorer(query, key));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
        /*
         * This has a return value, but we ignore it since we only want the
         * offsets. Might throw an OutOfMemoryError.
         */
        highlighter.getBestFragment(IndexRegistry.getAnalyzer(), key, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
        Util.printErr(e);
    }
    return ranges;
}

From source file:net.sourceforge.docfetcher.view.PreviewPanel.java

License:Open Source License

/**
 * Sets the file to be displayed, using <tt>parser</tt> to extract the
 * text from the file on the disk. This method does nothing if the given
 * file is null. The <tt>force</tt> parameter specifies whether the
 * preview should be updated even if neither the file nor the search terms
 * have changed in the meantime.
 */
private void setFile(final File file, final Parser parser, final Query query, boolean force) {
    File lastFile = this.file;
    Query lastQuery = this.query;
    this.file = file;
    this.parser = parser;
    this.query = query;

    // Check input
    if (file == null)
        return;
    if (parser == null) // Allowed to be null if file is null, too
        throw new IllegalArgumentException();
    if (!isActive)
        return;
    if (file.equals(lastFile) && !force)
        if (query != null && query.equals(lastQuery))
            return;

    if (file.isDirectory())
        throw new IllegalStateException("File expected for preview, got directory instead."); //$NON-NLS-1$
    if (!file.exists()) {
        textViewer.setText(Msg.file_not_found.value());
        showViewer(textViewerContainer);
        return;
    }

    // Use the HTML browser
    if (file.getAbsolutePath().equals(Const.HELP_FILE) || Pref.Bool.PreviewHTML.getValue()) {
        final BrowserPanel browser = browserProvider.getBrowser(previewPanel, browserToolBar, parser);
        if (browser != null) {
            browser.addProgressListener(new ProgressAdapter() {
                public void completed(ProgressEvent event) {
                    showViewer(browser);
                    upBt.setEnabled(false);
                    downBt.setEnabled(false);
                    occurrenceCounter.setText("0"); //$NON-NLS-1$
                }
            });
            browser.setFile(file);
            return;
        }
        // Browser creation failed, go on to next code block
    }

    // Use text renderers
    showViewer(textViewerContainer);

    // Use monospace font for text files
    if (parser instanceof TextParser) {
        org.eclipse.swt.graphics.Font monoFont = Font.PREVIEW_MONO.getFont();
        if (!textViewer.getFont().equals(monoFont))
            textViewer.setFont(monoFont);
    } else {
        org.eclipse.swt.graphics.Font previewFont = Font.PREVIEW.getFont();
        if (!textViewer.getFont().equals(previewFont))
            textViewer.setFont(previewFont);
    }

    textViewer.setText(Msg.loading.value()); // display loading message

    new Thread() { // run in a thread because parsing the file takes some time
        public void run() {
            // Extract the raw text from the file
            String text;
            boolean fileParsed = true;
            try {
                text = parser.renderText(file);
            } catch (ParseException e) {
                text = Msg.cant_read_file.format(e.getMessage());
                fileParsed = false;
            } catch (OutOfMemoryError e) {
                /*
                 * We can get here if the user sets a high java heap space
                 * value during indexing and then sets a lower value for
                 * search only usage.
                 */
                text = Msg.out_of_jvm_memory.value();
                fileParsed = false;
            }

            if (PreviewPanel.this.file != file)
                return; // Another preview request had been started while we were parsing

            /*
             * Create the message that will be displayed if the character limit
             * is reached. It is appended to the file contents later; if it
             * was appended here, some words in it might get highlighted.
             */
            int maxLength = Pref.Int.PreviewLimit.getValue();
            final String msg = "...\n\n\n[" //$NON-NLS-1$
                    + Msg.preview_limit_hint.format(new Object[] { maxLength, Pref.Int.PreviewLimit.name(),
                            Const.USER_PROPERTIES_FILENAME })
                    + "]"; //$NON-NLS-1$
            final boolean exceeded = text.length() > maxLength;
            if (exceeded)
                text = text.substring(0, maxLength - msg.length());
            final String fText = text;

            /*
             * Create StyleRange ranges (i.e. start-end integer pairs) for
             * search term highlighting. Only tokenize preview text if we're
             * not displaying any info messages and if there are tokens to
             * highlight.
             */
            ranges = new int[0];
            if (fileParsed && query != null) {
                final List<Integer> rangesList = new ArrayList<Integer>();
                Analyzer analyzer = RootScope.analyzer;

                /*
                 * A formatter is supposed to return formatted text, but
                 * since we're only interested in the start and end offsets
                 * of the search terms, we return null and store the offsets
                 * in a list.
                 */
                Formatter nullFormatter = new Formatter() {
                    public String highlightTerm(String originalText, TokenGroup tokenGroup) {
                        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                            Token token = tokenGroup.getToken(i);
                            if (tokenGroup.getScore(i) == 0)
                                continue;
                            int start = token.startOffset();
                            int end = token.endOffset();
                            rangesList.add(start);
                            rangesList.add(end - start);
                        }
                        return null;
                    }
                };

                Highlighter highlighter = new Highlighter(nullFormatter,
                        new QueryScorer(query, Document.contents));
                highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
                highlighter.setTextFragmenter(new NullFragmenter());
                try {
                    /*
                     * This has a return value, but we ignore it since we
                     * only want the offsets.
                     */
                    highlighter.getBestFragment(analyzer, Document.contents, fText);
                } catch (Exception e) {
                    // We can do without the search term highlighting
                }

                // List to array (will be used by the method 'setHighlighting(..)')
                ranges = new int[rangesList.size()];
                for (int i = 0; i < ranges.length; i++)
                    ranges[i] = rangesList.get(i);
            }

            // Parsing and tokenizing done; display the results
            final boolean fFileParsed = fileParsed;
            Display.getDefault().syncExec(new Runnable() {
                public void run() {
                    // Enable or disable up and down buttons
                    upBt.setEnabled(ranges.length != 0);
                    downBt.setEnabled(ranges.length != 0);

                    textViewer.setText(fText);
                    setHighlighting(fFileParsed && Pref.Bool.HighlightSearchTerms.getValue());
                    occurrenceCounter.setText(Integer.toString(ranges.length / 2));
                    if (exceeded)
                        textViewer.append(msg); // character limit exceeded, append hint
                }
            });
        }
    }.start();
}

From source file:net.sourceforge.vaticanfetcher.model.search.HighlightService.java

License:Open Source License

@MutableCopy
@NotNull
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> ranges = new ArrayList<Range>();
    /*
     * A formatter is supposed to return formatted text, but since we're
     * only interested in the start and end offsets of the search terms, we
     * return null and store the offsets in a list.
     */
    Formatter nullFormatter = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                Token token = tokenGroup.getToken(i);
                if (tokenGroup.getScore(i) == 0)
                    continue;
                int start = token.startOffset();
                int end = token.endOffset();
                ranges.add(new Range(start, end - start));
            }
            return null;
        }
    };
    String key = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(nullFormatter, new QueryScorer(query, key));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
        /*
         * This has a return value, but we ignore it since we only want the
         * offsets. Might throw an OutOfMemoryError.
         */
        highlighter.getBestFragment(IndexRegistry.analyzer, key, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
        Util.printErr(e);
    }
    return ranges;
}

From source file:org.apache.blur.utils.HighlightHelper.java

License:Apache License

/**
 * NOTE: This method will not preserve the correct field types.
 *
 * @param preTag the markup inserted before each highlighted term
 * @param postTag the markup inserted after each highlighted term
 */
public static Document highlight(int docId, Document document, Query query, FieldManager fieldManager,
        IndexReader reader, String preTag, String postTag) throws IOException, InvalidTokenOffsetsException {

    String fieldLessFieldName = fieldManager.getFieldLessFieldName();

    Query fixedQuery = fixSuperQuery(query, null, fieldLessFieldName);

    Analyzer analyzer = fieldManager.getAnalyzerForQuery();

    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(preTag, postTag);
    Document result = new Document();
    for (IndexableField f : document) {
        String name = f.name();
        if (fieldLessFieldName.equals(name) || FIELDS_NOT_TO_HIGHLIGHT.contains(name)) {
            result.add(f);
            continue;
        }
        String text = f.stringValue();
        Number numericValue = f.numericValue();

        Query fieldFixedQuery;
        if (fieldManager.isFieldLessIndexed(name)) {
            fieldFixedQuery = fixSuperQuery(query, name, fieldLessFieldName);
        } else {
            fieldFixedQuery = fixedQuery;
        }

        if (numericValue != null) {
            if (shouldNumberBeHighlighted(name, numericValue, fieldFixedQuery)) {
                String numberHighlight = preTag + text + postTag;
                result.add(new StringField(name, numberHighlight, Store.YES));
            }
        } else {
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(fieldFixedQuery, name));
            TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, name, analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            for (int j = 0; j < frag.length; j++) {
                if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                    result.add(new StringField(name, frag[j].toString(), Store.YES));
                }
            }
        }
    }
    return result;
}

From source file:org.apache.jena.query.text.TextIndexLucene.java

License:Apache License

private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field,
        String highlight) throws IOException, InvalidTokenOffsetsException {
    List<TextHit> results = new ArrayList<>();

    HighlightOpts opts = new HighlightOpts(highlight);

    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end);
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize));

    for (ScoreDoc sd : sDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        log.trace("highlightResults[{}]: {}", sd.doc, doc);
        String entity = doc.get(docDef.getEntityField());

        Node literal = null;
        String lexical = doc.get(field);
        if (lexical != null) {
            String docLang = doc.get(docDef.getLangField());
            TokenStream tokenStream = analyzer.tokenStream(field, lexical);
            TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags,
                    opts.maxFrags);
            String rez = frags2string(frags, opts);

            literal = NodeFactory.createLiteral(rez, docLang);
        }

        String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null;
        Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null;

        Node subject = TextQueryFuncs.stringToNode(entity);
        TextHit hit = new TextHit(subject, sd.score, literal, graph);
        results.add(hit);
    }
    return results;
}