Example usage for org.apache.lucene.search.highlight Highlighter DEFAULT_MAX_CHARS_TO_ANALYZE

Introduction

This page shows example usage of org.apache.lucene.search.highlight.Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE, the default maximum number of document characters a Highlighter will analyze when looking for fragments to highlight.

Prototype

public static final int DEFAULT_MAX_CHARS_TO_ANALYZE

To view the source code for org.apache.lucene.search.highlight Highlighter DEFAULT_MAX_CHARS_TO_ANALYZE, click the Source Link.

Usage
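
Before the individual source files, here is a minimal, self-contained sketch of where the constant fits in. It is a sketch only: it assumes a Lucene version (5.x or later) in which StandardAnalyzer has a no-argument constructor, and the class name, the field name "content", and the sample text are all illustrative.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class DefaultMaxCharsExample {
    public static void main(String[] args) throws Exception {
        String text = "Lucene is a full-text search library written in Java.";
        Query query = new TermQuery(new Term("content", "lucene"));

        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        // DEFAULT_MAX_CHARS_TO_ANALYZE is the budget the highlighter uses when
        // setMaxDocCharsToAnalyze is never called; setting it explicitly here
        // just makes the default visible.
        highlighter.setMaxDocCharsToAnalyze(Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("content", text);
        System.out.println(highlighter.getBestFragment(stream, text));
    }
}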

From source file: com.o19s.solr.swan.highlight.SwanHighlighter.java

License: Apache License

private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
        Document doc, String fieldName) throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
        return;
    // END: Hack

    SolrParams params = req.getParams();
    IndexableField[] docFields = doc.getFields(fieldName);
    List<String> listFields = new ArrayList<String>();
    for (IndexableField field : docFields) {
        listFields.add(field.stringValue());
    }

    String[] docTexts = listFields.toArray(new String[listFields.size()]);

    // according to Document javadoc, doc.getValues() never returns null. check empty instead of null
    if (docTexts.length == 0)
        return;

    TokenStream tokenStream;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    try {
        //      TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
        //      if (tvStream != null) {
        //        tots = new TermOffsetsTokenStream(tvStream);
        //      }
    } catch (IllegalArgumentException e) {
        // No problem. But we can't use TermOffsets optimization.
    }

    for (int j = 0; j < docTexts.length; j++) {
        if (tots != null) {
            // if we're using TermOffsets optimization, then get the next
            // field value's TokenStream (i.e. get field j's TokenStream) from tots:
            tokenStream = tots.getMultiValuedTokenStream(docTexts[j].length());
        } else {
            // fall back to analyzer
            tokenStream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
        }

        int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            if (maxCharsToAnalyze < 0) {
                tokenStream = new CachingTokenFilter(tokenStream);
            } else {
                tokenStream = new CachingTokenFilter(
                        new OffsetLimitTokenFilter(tokenStream, maxCharsToAnalyze));
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tokenStream);

            // after highlighter initialization, reset tstream since construction of highlighter already used it
            tokenStream.reset();
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tokenStream, docTexts[j],
                    mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                    frags.add(bestTextFragments[k]);
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // sort such that the fragments with the highest score come first
    Collections.sort(frags, new Comparator<TextFragment>() {
        public int compare(TextFragment arg0, TextFragment arg1) {
            return Float.compare(arg1.getScore(), arg0.getScore());
        }
    });

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    String[] summaries = null;
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if ((fragment != null) && (fragment.getScore() > 0)) {
                fragTexts.add(fragment.toString());
            }
            if (fragTexts.size() >= numFragments)
                break;
        }
        summaries = fragTexts.toArray(new String[0]);
        if (summaries.length > 0)
            docSummaries.add(fieldName, summaries);
    }
    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}
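
The pattern to note in the listing above is the parameter lookup: params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE) falls back to the constant whenever the request does not set hl.maxAnalyzedChars, and a negative value is treated as "analyze the whole field value". A minimal sketch of the per-field override, assuming a field named "text" (the field name and class name are placeholders):

import org.apache.solr.common.params.ModifiableSolrParams;

public class MaxCharsOverrideSketch {
    public static void main(String[] args) {
        ModifiableSolrParams params = new ModifiableSolrParams();
        // Per-field override of hl.maxAnalyzedChars (HighlightParams.MAX_CHARS).
        // Without it, the lookup above falls back to
        // Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE.
        params.set("f.text.hl.maxAnalyzedChars", "100000");
        System.out.println(params.getFieldInt("text", "hl.maxAnalyzedChars", -1)); // 100000
    }
}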

From source file: com.rapidminer.search.GlobalSearchHandler.java

License: Open Source License

/**
 * Creates the search result for search methods.
 *
 * @param searchTerm
 *       the search string
 * @param searcher
 *       the index searcher instance which was used to search
 * @param result
 *       the result of the search
 * @param highlightResult
 *       if {@code true}, the {@link GlobalSearchResult#getBestFragments()} will be created
 * @return the search result instance, never {@code null}
 * @throws IOException
 *       if something goes wrong
 */
private GlobalSearchResult createSearchResult(final String searchTerm, final Query parsedQuery,
        final IndexSearcher searcher, final TopDocs result, final boolean highlightResult) throws IOException {
    int resultNumber = result.scoreDocs.length;
    List<Document> resultList = new ArrayList<>(resultNumber);
    List<String[]> highlights = highlightResult ? new LinkedList<>() : null;
    ScoreDoc lastResult = resultNumber > 0 ? result.scoreDocs[result.scoreDocs.length - 1] : null;
    for (ScoreDoc scoreDoc : result.scoreDocs) {
        Document doc = searcher.doc(scoreDoc.doc);
        resultList.add(doc);

        if (highlightResult) {
            // search result highlighting best match on name field
            QueryScorer scorer = new QueryScorer(parsedQuery);
            Highlighter highlighter = new Highlighter(HIGHLIGHT_FORMATTER, scorer);
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, FRAGMENT_SIZE);
            highlighter.setTextFragmenter(fragmenter);
            try {
                TokenStream stream = TokenSources.getTokenStream(GlobalSearchUtilities.FIELD_NAME,
                        searcher.getIndexReader().getTermVectors(scoreDoc.doc),
                        doc.get(GlobalSearchUtilities.FIELD_NAME), GlobalSearchUtilities.ANALYZER,
                        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE - 1);
                if (stream != null) {
                    highlights.add(highlighter.getBestFragments(stream,
                            doc.get(GlobalSearchUtilities.FIELD_NAME), MAX_NUMBER_OF_FRAGMENTS));
                } else {
                    highlights.add(null);
                }
            } catch (InvalidTokenOffsetsException e) {
                highlights.add(null);
            }
        }
    }
    return new GlobalSearchResult(resultList, searchTerm, lastResult, result.totalHits, highlights);
}
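
Note the Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE - 1 passed to TokenSources.getTokenStream: in that overload the last argument is the largest token start offset that will be admitted, and offsets are zero-based, so subtracting one appears intended to admit exactly the tokens that begin within the default character budget.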

From source file: org.apache.solr.highlight.DefaultSolrHighlighter.java

License: Apache License

private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
        Document doc, String fieldName) throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
        return;
    // END: Hack

    SolrParams params = req.getParams();

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    List<IndexableField> allFields = doc.getFields();
    if (allFields != null && allFields.size() == 0)
        return; // No explicit contract that getFields returns != null,
    // although currently it can't.

    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
    if (tvStream != null) {
        tots = new TermOffsetsTokenStream(tvStream);
    }
    int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
            Integer.toString(Integer.MAX_VALUE)));
    int mvToMatch = Integer.parseInt(
            req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.toString(Integer.MAX_VALUE)));

    for (IndexableField thisField : allFields) {
        if (mvToExamine <= 0 || mvToMatch <= 0)
            break;

        if (!thisField.name().equals(fieldName))
            continue; // Is there a better way to do this?

        --mvToExamine;
        String thisText = thisField.stringValue();

        if (tots != null) {
            // if we're using TermOffsets optimization, then get the next
            // field value's TokenStream (i.e. get field j's TokenStream) from tots:
            tstream = tots.getMultiValuedTokenStream(thisText.length());
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schema, fieldName, thisText);
        }

        int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            if (maxCharsToAnalyze < 0) {
                tstream = new CachingTokenFilter(tstream);
            } else {
                tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

            // after highlighter initialization, reset tstream since construction of highlighter already used it
            tstream.reset();
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(thisText.length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText,
                    mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if (preserveMulti) {
                    if (bestTextFragments[k] != null) {
                        frags.add(bestTextFragments[k]);
                        --mvToMatch;
                    }
                } else {
                    if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                        frags.add(bestTextFragments[k]);
                        --mvToMatch;
                    }
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // sort such that the fragments with the highest score come first
    if (!preserveMulti) {
        Collections.sort(frags, new Comparator<TextFragment>() {
            @Override
            public int compare(TextFragment arg0, TextFragment arg1) {
                return Float.compare(arg1.getScore(), arg0.getScore());
            }
        });
    }

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if (preserveMulti) {
                if (fragment != null) {
                    fragTexts.add(fragment.toString());
                }
            } else {
                if ((fragment != null) && (fragment.getScore() > 0)) {
                    fragTexts.add(fragment.toString());
                }
            }

            if (fragTexts.size() >= numFragments && !preserveMulti)
                break;
        }
        summaries = fragTexts.toArray(new String[0]);
        if (summaries.length > 0)
            docSummaries.add(fieldName, summaries);
    }
    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}
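
The phrase-highlighter branch is worth isolating: the stream is capped with OffsetLimitTokenFilter at maxCharsToAnalyze, then wrapped in CachingTokenFilter so that highlighter construction can consume it once and reset() can replay it for the fragmenting pass. A minimal sketch, assuming StandardAnalyzer and a placeholder field value:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;

public class OffsetLimitSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(); // placeholder analyzer
        String fieldValue = "a long stored field value to be highlighted";

        // Cap analysis at the default budget, then cache the capped stream so
        // it can be consumed once and replayed after reset().
        TokenStream capped = new CachingTokenFilter(new OffsetLimitTokenFilter(
                analyzer.tokenStream("text", fieldValue), Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE));
        capped.reset();
        while (capped.incrementToken()) {
            // only tokens whose offsets fall inside the limit arrive here
        }
        capped.end();
        capped.close();
    }
}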

From source file: org.apache.solr.highlight.ParsedContentSolrHighlighter.java

License: Apache License

/**
 * Generates a list of Highlighted query fragments for each item in a list
 * of documents, or returns null if highlighting is disabled.
 *
 * @param docs
 *            query results
 * @param query
 *            the query
 * @param req
 *            the current request
 * @param defaultFields
 *            default list of fields to summarize
 * @return NamedList containing a NamedList for each document, which in
 *         turns contains sets (field, summary) pairs.
 */
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields)
        throws IOException {
    SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params))
        return null;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    NamedList fragments = new SimpleOrderedMap();
    String[] fieldNames = getHighlightFields(query, req, defaultFields);
    Document[] readDocs = new Document[docs.size()];
    {
        // pre-fetch documents using the Searcher's doc cache
        Set<String> fset = new HashSet<String>();
        for (String f : fieldNames) {
            fset.add(f);
        }
        // fetch unique key if one exists.
        SchemaField keyField = schema.getUniqueKeyField();
        if (null != keyField)
            fset.add(keyField.getName());
        searcher.readDocs(readDocs, docs, fset);
    }

    // Highlight each document
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < docs.size(); i++) {
        int docId = iterator.nextDoc();
        Document doc = readDocs[i];
        NamedList docSummaries = new SimpleOrderedMap();
        for (String fieldName : fieldNames) {
            fieldName = fieldName.trim();

            // begin
            String[] docTexts = doc.getValues(fieldName);
            //Highlight only the parsed content, instead of all fields
            if (IndexField.DEFAULT_SEARCH_FIELD.equals(fieldName)) {
                docTexts = doc.getValues(IndexField.PARSED_CONTENT_FIELD);
            }

            //                IndexFieldServices indexFieldServices = ConstellioSpringUtils.getIndexFieldServices();
            //                String collectionName = params.get(ConstellioSolrQueryParams.COLLECTION_NAME);
            //               RecordCollectionServices collectionServices = ConstellioSpringUtils.getRecordCollectionServices();
            //                RecordCollection collection = collectionServices.get(collectionName);
            //                IndexField defaultSearchField = collection.getDefaultSearchIndexField();
            //
            //                List<String> defaultSearchFieldDocTextsList = new ArrayList<String>();
            //                for (CopyField copyField : defaultSearchField.getCopyFieldsDest()) {
            //               IndexField sourceIndexField = copyField.getIndexFieldSource();
            //               if (sourceIndexField != null) {
            //                  String sourceIndexFieldName = sourceIndexField.getName();
            //                      String[] copyFieldValues = doc.getValues(sourceIndexFieldName);
            //                      if (copyFieldValues != null) {
            //                         for (int k = 0; k < copyFieldValues.length; k++) {
            //                        String copyFieldValue = copyFieldValues[k];
            //                        if (!defaultSearchFieldDocTextsList.contains(copyFieldValue)) {
            //                           defaultSearchFieldDocTextsList.add(copyFieldValue);
            //                        }
            //                     }
            //                      }
            //               }
            //            }
            //                docTexts = defaultSearchFieldDocTextsList.toArray(new String[0]);

            //                if ((docTexts == null || docTexts.length == 0)) {
            //                    RecordServices recordServices = ConstellioSpringUtils.getRecordServices();
            //                    Long recordId = new Long(doc.getField(IndexField.RECORD_ID_FIELD).stringValue());
            //                    Record record;
            //                    try {
            //                       record = recordServices.get(recordId, collection);
            //               } catch (Exception e) {
            //                  record = null;
            //                  e.printStackTrace();
            //               }
            //                    if (record != null) {
            //                        List<Object> fieldValues = indexFieldServices.extractFieldValues(record, defaultSearchField);
            //
            //                        List<String> docTextsList = new ArrayList<String>();
            //                        for (Object fieldValue : fieldValues) {
            //                            String strFieldValue = fieldValue != null ? fieldValue.toString() : null;
            //                            if (StringUtils.isNotBlank(strFieldValue)) {
            //                                docTextsList.add(strFieldValue);
            //                            }
            //                        }
            //
            //                        if (!docTextsList.isEmpty()) {
            //                            docTexts = docTextsList.toArray(new String[0]);
            //                        }
            //                    }
            //                }
            //                // end

            if (docTexts == null)
                continue;

            TokenStream tstream = null;
            int numFragments = getMaxSnippets(fieldName, params);
            boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

            String[] summaries = null;
            List<TextFragment> frags = new ArrayList<TextFragment>();
            for (int j = 0; j < docTexts.length; j++) {
                // create TokenStream
                try {
                    // attempt term vectors
                    tstream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId,
                            fieldName);
                } catch (IllegalArgumentException e) {
                    // fall back to analyzer
                    tstream = new TokenOrderingFilter(
                            schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
                }

                Highlighter highlighter;
                if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
                    // wrap CachingTokenFilter around TokenStream for reuse
                    tstream = new CachingTokenFilter(tstream);

                    // get highlighter
                    highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

                    // after highlighter initialization, reset tstream since construction of highlighter
                    // already used it
                    tstream.reset();
                } else {
                    // use "the old way"
                    highlighter = getHighlighter(query, fieldName, req);
                }

                int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
                if (maxCharsToAnalyze < 0) {
                    highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
                } else {
                    highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
                }

                try {
                    TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, docTexts[j],
                            mergeContiguousFragments, numFragments);
                    for (int k = 0; k < bestTextFragments.length; k++) {
                        if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                            frags.add(bestTextFragments[k]);
                        }
                    }
                } catch (InvalidTokenOffsetsException e) {
                    throw new RuntimeException(e);
                }
            }
            // sort such that the fragments with the highest score come first
            Collections.sort(frags, new Comparator<TextFragment>() {
                public int compare(TextFragment arg0, TextFragment arg1) {
                    return Float.compare(arg1.getScore(), arg0.getScore());
                }
            });

            // convert fragments back into text
            // TODO: we can include score and position information in output as snippet attributes
            if (frags.size() > 0) {
                ArrayList<String> fragTexts = new ArrayList<String>();
                for (TextFragment fragment : frags) {
                    if ((fragment != null) && (fragment.getScore() > 0)) {
                        //                            fragTexts.add(fragment.toString());
                        fragTexts.add(StringEscapeUtils.escapeHtml(fragment.toString()));
                    }
                    if (fragTexts.size() >= numFragments)
                        break;
                }
                summaries = fragTexts.toArray(new String[0]);
                if (summaries.length > 0)
                    docSummaries.add(fieldName, summaries);
            }
            // no summaries made, copy text from alternate field
            if (summaries == null || summaries.length == 0) {
                String alternateField = req.getParams().getFieldParam(fieldName,
                        HighlightParams.ALTERNATE_FIELD);
                if (alternateField != null && alternateField.length() > 0) {
                    String[] altTexts = doc.getValues(alternateField);
                    if (altTexts != null && altTexts.length > 0) {
                        int alternateFieldLen = req.getParams().getFieldInt(fieldName,
                                HighlightParams.ALTERNATE_FIELD_LENGTH, 0);
                        if (alternateFieldLen <= 0) {
                            docSummaries.add(fieldName, altTexts);
                        } else {
                            List<String> altList = new ArrayList<String>();
                            int len = 0;
                            for (String altText : altTexts) {
                                altList.add(len + altText.length() > alternateFieldLen
                                        ? altText.substring(0, alternateFieldLen - len)
                                        : altText);
                                len += altText.length();
                                if (len >= alternateFieldLen)
                                    break;
                            }
                            docSummaries.add(fieldName, altList);
                        }
                    }
                }
            }

        }
        String printId = schema.printableUniqueKey(doc);
        fragments.add(printId == null ? null : printId, docSummaries);
    }
    return fragments;
}
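
Across the three Solr-derived highlighters above, the constant plays the same role: it is the fallback for the per-field hl.maxAnalyzedChars parameter that bounds how much of each stored field value the Highlighter analyzes. The RapidMiner handler instead uses it (minus one) to bound the token stream it hands to the highlighter up front.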