public int getMaxDocCharsToAnalyze() 

From source file:com.difference.historybook.index.lucene.LuceneIndex.java

License:Apache License

/**
 * Runs a query against one collection and returns a page of results grouped on
 * {@code IndexDocumentAdapter.FIELD_URL_GROUP}, each carrying a highlighted snippet.
 *
 * NOTE(review): this listing appears to have lost lines (several statements are
 * unterminated and closing braces are missing); the comments here describe only
 * the code that is visible.
 *
 * @param collection value matched against {@code IndexDocumentAdapter.FIELD_COLLECTION}
 * @param query query string handed to {@code parser.parse}
 * @param offset passed to {@code gsearch.search} and also used as the loop's start index
 * @param size passed to {@code gsearch.search}; also bounds the loop and presizes the result list
 * @param includeDebug when true, the {@code Explanation} text for each hit is attached to its result
 * @return a wrapper populated with the query, the offset and the total group count
 *         (0 when that count is null)
 * @throws IndexException wrapping any IOException, ParseException or
 *         InvalidTokenOffsetsException raised while searching or highlighting
 */
public SearchResultWrapper search(String collection, String query, int offset, int size, boolean includeDebug)
        throws IndexException {
    try {
        //TODO: make age be a component in the ranking?
        // Base query: the parsed user query is required, combined with a term clause
        // on the collection field.
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        queryBuilder.add(parser.parse(query), Occur.MUST);
        queryBuilder.add(new TermQuery(new Term(IndexDocumentAdapter.FIELD_COLLECTION, collection)),
        Query baseQuery = queryBuilder.build();

        // Score modifier: a reciprocal function over a DurationValueSource built from the
        // current time in seconds and the document timestamp field.
        // NOTE(review): presumably this favours recent documents — confirm against DurationValueSource.
        FunctionQuery boostQuery = new FunctionQuery(
                new ReciprocalFloatFunction(new DurationValueSource(new Date().getTime() / 1000,
                        new LongFieldSource(IndexDocumentAdapter.FIELD_TIMESTAMP)), RECIP, 1F, 1F));

        Query q = new CustomScoreQuery(baseQuery, boostQuery);

        // Highlighting is scored against the combined query on the search field.
        // NOTE(review): fragmenter is created but never attached to the highlighter in the
        // visible lines — a setTextFragmenter call may have been lost; confirm.
        QueryScorer queryScorer = new QueryScorer(q, IndexDocumentAdapter.FIELD_SEARCH);
        Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        Highlighter highlighter = new Highlighter(queryScorer);

        // Group hits on the URL-group field, keeping one document per group.
        GroupingSearch gsearch = new GroupingSearch(IndexDocumentAdapter.FIELD_URL_GROUP).setGroupDocsLimit(1)
        TopGroups<?> groups = gsearch.search(searcher, q, offset, size);

        ArrayList<SearchResult> results = new ArrayList<>(size);
        // NOTE(review): offset was already passed to gsearch.search above; if the returned
        // groups array is already offset-adjusted, starting at i = offset would skip entries
        // for any offset > 0 — confirm against the GroupingSearch documentation.
        for (int i = offset; i < offset + size && i < groups.groups.length; i++) {
            // Only the first document of each group is used (group doc limit is 1).
            ScoreDoc scoreDoc = groups.groups[i].scoreDocs[0];
            Document luceneDoc = searcher.doc(scoreDoc.doc);
            IndexDocumentAdapter doc = new IndexDocumentAdapter(luceneDoc);

            // Token stream for the search field; the last argument is one less than the
            // highlighter's getMaxDocCharsToAnalyze() value.
            TokenStream tokenStream = TokenSources.getTokenStream(IndexDocumentAdapter.FIELD_SEARCH,
                    reader.getTermVectors(scoreDoc.doc), luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH),
                    analyzer, highlighter.getMaxDocCharsToAnalyze() - 1);

            // Up to 3 best fragments, joined with newlines, then passed through Jsoup.clean
            // with the simpleText whitelist.
            String[] snippets = highlighter.getBestFragments(tokenStream,
                    luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH), 3);
            String snippet = Arrays.asList(snippets).stream().collect(Collectors.joining("\n"));
            snippet = Jsoup.clean(snippet, Whitelist.simpleText());

            // Score explanation is computed only on request; debugInfo stays null otherwise.
            String debugInfo = null;
            if (includeDebug) {
                Explanation explanation = searcher.explain(q, scoreDoc.doc);
                debugInfo = explanation.toString();

            results.add(new SearchResult(doc.getKey(), doc.getCollection(), doc.getTitle(), doc.getUrl(),
                    doc.getDomain(), doc.getTimestampText(), snippet, debugInfo, scoreDoc.score));

        // Result count falls back to 0 when the total group count is null.
        SearchResultWrapper wrapper = new SearchResultWrapper().setQuery(query).setOffset(offset)
                .setResultCount(groups.totalGroupCount != null ? groups.totalGroupCount : 0)

        if (includeDebug) {

        return wrapper;

    // Lucene I/O, query-parse and highlighter failures are all rethrown as IndexException,
    // with the original exception passed to its constructor.
    } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
        throw new IndexException(e);

From source file:com.tripod.lucene.service.AbstractLuceneService.java

License:Apache License

 * Performs highlighting for a given query and a given document.
 * @param indexSearcher the IndexSearcher performing the query
 * @param query the Tripod LuceneQuery
 * @param scoreDoc the Lucene ScoreDoc
 * @param doc the Lucene Document
 * @param highlighter the Highlighter to use
 * @param result the QueryResult to add the highlights to
 * @throws IOException if an error occurs performing the highlighting
 * @throws InvalidTokenOffsetsException if an error occurs performing the highlighting
protected void performHighlighting(final IndexSearcher indexSearcher, final Q query, final ScoreDoc scoreDoc,
        final Document doc, final Highlighter highlighter, final QR result)
        throws IOException, InvalidTokenOffsetsException {

    if (query.getHighlightFields() == null || query.getHighlightFields().isEmpty()) {

    final List<Highlight> highlights = new ArrayList<>();
    final List<String> hlFieldNames = getHighlightFieldNames(query, doc);

    // process each field to highlight on
    for (String hlField : hlFieldNames) {
        final String text = doc.get(hlField);
        if (StringUtils.isEmpty(text)) {

        final List<String> snippets = new ArrayList<>();
        final Fields tvFields = indexSearcher.getIndexReader().getTermVectors(scoreDoc.doc);
        final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;

        // get the snippets for the given field
        final TokenStream tokenStream = TokenSources.getTokenStream(hlField, tvFields, text, analyzer,
        final TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, false, 10);
        for (TextFragment textFragment : textFragments) {
            if (textFragment != null && textFragment.getScore() > 0) {

        // if we have snippets then add a highlight result to the QueryResult
        if (snippets.size() > 0) {
            highlights.add(new Highlight(hlField, snippets));
