Example usage for org.apache.lucene.search.highlight Highlighter setMaxDocCharsToAnalyze

Introduction

This page collects example usages of org.apache.lucene.search.highlight.Highlighter#setMaxDocCharsToAnalyze, drawn from open-source projects.

Prototype

public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) 
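
A minimal, self-contained sketch of the call (not taken from the projects below; the "body" field, the sample query, and the 10,000-character cap are illustrative):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;

public class SetMaxDocCharsExample {
    public static void main(String[] args) throws Exception {
        String text = "Apache Lucene is a high-performance search library ...";
        Query query = new TermQuery(new Term("body", "lucene"));
        QueryScorer scorer = new QueryScorer(query, "body");
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
        // Only the first 10,000 characters of the text are tokenized and scored;
        // matches beyond the cap are never seen by the highlighter.
        highlighter.setMaxDocCharsToAnalyze(10000);
        try (Analyzer analyzer = new StandardAnalyzer()) {
            // getBestFragment consumes (and closes) the token stream.
            String best = highlighter.getBestFragment(
                    analyzer.tokenStream("body", text), text);
            System.out.println(best);
        }
    }
}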

Usage

From source file: org.apache.solr.handler.component.AlfrescoSolrHighlighter.java

License: Open Source License

/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query,
        IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();

    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
            (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);

    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }

    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) { // e.g. -1
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }

    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    List<TextFragment> frags = new ArrayList<>();

    //Try term vectors, which is faster
    //  note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields,
            maxCharsToAnalyze - 1);
    //  We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }

    for (String thisText : fieldValues) {
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }

        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            tstream = tvStream; // single-valued with term vectors
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }

        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
            // needs to implement reset() efficiently.

            //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
            //  It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(
                            new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);

            // if the CachingTokenFilter was consumed then use it going forward.
            if (tempTokenStream instanceof CachingTokenFilter
                    && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
            //tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        maxCharsToAnalyze -= thisText.length();

        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream,
                    fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment == null) // can happen via mergeContiguousFragments
                    continue;
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0)
                        --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    } //end field value loop

    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }

        // Truncate list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    return null;//no highlights for this field
}
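
In the example above, the hl.maxAnalyzedChars value acts as a budget shared across the values of a multi-valued field: each value is highlighted under whatever budget remains, and the full length of the value is then charged against it. A hypothetical condensation of that bookkeeping (the helper class and its names are not part of AlfrescoSolrHighlighter):

import java.util.List;
import org.apache.lucene.search.highlight.Highlighter;

final class MultiValueCharBudget {
    // Cap each successive value at whatever is left of the shared budget.
    static void highlightWithBudget(Highlighter highlighter, List<String> values, int budget) {
        int remaining = budget;                              // e.g. from hl.maxAnalyzedChars
        for (String value : values) {
            if (remaining <= 0) {
                break;                                       // budget exhausted: skip the rest
            }
            highlighter.setMaxDocCharsToAnalyze(remaining);  // analyze at most what is left
            // ... highlighter.getBestTextFragments(tstream, value, ...) would run here ...
            remaining -= value.length();                     // charge the whole value, matched or not
        }
    }
}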

From source file: org.apache.solr.highlight.DefaultSolrHighlighter.java

License: Apache License

private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
        Document doc, String fieldName) throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
        return;
    // END: Hack

    SolrParams params = req.getParams();

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    List<IndexableField> allFields = doc.getFields();
    if (allFields != null && allFields.size() == 0)
        return; // no explicit contract that getFields returns != null, although currently it can't

    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
    if (tvStream != null) {
        tots = new TermOffsetsTokenStream(tvStream);
    }
    int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
            Integer.toString(Integer.MAX_VALUE)));
    int mvToMatch = Integer.parseInt(
            req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.toString(Integer.MAX_VALUE)));

    for (IndexableField thisField : allFields) {
        if (mvToExamine <= 0 || mvToMatch <= 0)
            break;

        if (!thisField.name().equals(fieldName))
            continue; // Is there a better way to do this?

        --mvToExamine;
        String thisText = thisField.stringValue();

        if (tots != null) {
            // if we're using TermOffsets optimization, then get the next
            // field value's TokenStream (i.e. get field j's TokenStream) from tots:
            tstream = tots.getMultiValuedTokenStream(thisText.length());
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schema, fieldName, thisText);
        }

        int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            if (maxCharsToAnalyze < 0) {
                tstream = new CachingTokenFilter(tstream);
            } else {
                tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

            // after highlighter initialization, reset tstream since construction of highlighter already used it
            tstream.reset();
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(thisText.length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText,
                    mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if (preserveMulti) {
                    if (bestTextFragments[k] != null) {
                        frags.add(bestTextFragments[k]);
                        --mvToMatch;
                    }
                } else {
                    if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                        frags.add(bestTextFragments[k]);
                        --mvToMatch;
                    }
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // sort such that the fragments with the highest score come first
    if (!preserveMulti) {
        Collections.sort(frags, new Comparator<TextFragment>() {
            @Override
            public int compare(TextFragment arg0, TextFragment arg1) {
                // Float.compare avoids Math.round on a float difference, which
                // treats scores less than 0.5 apart as equal and can break the sort.
                return Float.compare(arg1.getScore(), arg0.getScore());
            }
        });
    }

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if (preserveMulti) {
                if (fragment != null) {
                    fragTexts.add(fragment.toString());
                }
            } else {
                if ((fragment != null) && (fragment.getScore() > 0)) {
                    fragTexts.add(fragment.toString());
                }
            }

            if (fragTexts.size() >= numFragments && !preserveMulti)
                break;
        }
        summaries = fragTexts.toArray(new String[0]);
        if (summaries.length > 0)
            docSummaries.add(fieldName, summaries);
    }
    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}
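
A detail worth isolating from the example above: when the phrase highlighter is enabled, the token stream is wrapped in a CachingTokenFilter (behind an OffsetLimitTokenFilter when a character cap is set) so that getPhraseHighlighter can consume it once and getBestTextFragments can replay it after reset(). A minimal sketch of just that wrapping (the helper class is hypothetical):

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;

final class ReplayableStreams {
    // Make a stream replayable for the highlighter's two passes, truncating it
    // at maxCharsToAnalyze first unless the cap is negative ("no limit").
    static CachingTokenFilter wrap(TokenStream tstream, int maxCharsToAnalyze) {
        TokenStream limited = maxCharsToAnalyze < 0
                ? tstream
                : new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze);
        return new CachingTokenFilter(limited);
    }
}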

From source file: org.apache.solr.highlight.ParsedContentSolrHighlighter.java

License: Apache License

/**
 * Generates a list of Highlighted query fragments for each item in a list
 * of documents, or returns null if highlighting is disabled.
 *
 * @param docs
 *            query results
 * @param query
 *            the query
 * @param req
 *            the current request
 * @param defaultFields
 *            default list of fields to summarize
 * @return NamedList containing a NamedList for each document, which in
 *         turns contains sets (field, summary) pairs.
 */
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields)
        throws IOException {
    SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params))
        return null;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    NamedList fragments = new SimpleOrderedMap();
    String[] fieldNames = getHighlightFields(query, req, defaultFields);
    Document[] readDocs = new Document[docs.size()];
    {
        // pre-fetch documents using the Searcher's doc cache
        Set<String> fset = new HashSet<String>();
        for (String f : fieldNames) {
            fset.add(f);
        }
        // fetch unique key if one exists.
        SchemaField keyField = schema.getUniqueKeyField();
        if (null != keyField)
            fset.add(keyField.getName());
        searcher.readDocs(readDocs, docs, fset);
    }

    // Highlight each document
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < docs.size(); i++) {
        int docId = iterator.nextDoc();
        Document doc = readDocs[i];
        NamedList docSummaries = new SimpleOrderedMap();
        for (String fieldName : fieldNames) {
            fieldName = fieldName.trim();

            // begin
            String[] docTexts = doc.getValues(fieldName);
            //Highlight only the parsed content, instead of all fields
            if (IndexField.DEFAULT_SEARCH_FIELD.equals(fieldName)) {
                docTexts = doc.getValues(IndexField.PARSED_CONTENT_FIELD);
            }

            //                IndexFieldServices indexFieldServices = ConstellioSpringUtils.getIndexFieldServices();
            //                String collectionName = params.get(ConstellioSolrQueryParams.COLLECTION_NAME);
            //               RecordCollectionServices collectionServices = ConstellioSpringUtils.getRecordCollectionServices();
            //                RecordCollection collection = collectionServices.get(collectionName);
            //                IndexField defaultSearchField = collection.getDefaultSearchIndexField();
            //
            //                List<String> defaultSearchFieldDocTextsList = new ArrayList<String>();
            //                for (CopyField copyField : defaultSearchField.getCopyFieldsDest()) {
            //               IndexField sourceIndexField = copyField.getIndexFieldSource();
            //               if (sourceIndexField != null) {
            //                  String sourceIndexFieldName = sourceIndexField.getName();
            //                      String[] copyFieldValues = doc.getValues(sourceIndexFieldName);
            //                      if (copyFieldValues != null) {
            //                         for (int k = 0; k < copyFieldValues.length; k++) {
            //                        String copyFieldValue = copyFieldValues[k];
            //                        if (!defaultSearchFieldDocTextsList.contains(copyFieldValue)) {
            //                           defaultSearchFieldDocTextsList.add(copyFieldValue);
            //                        }
            //                     }
            //                      }
            //               }
            //            }
            //                docTexts = defaultSearchFieldDocTextsList.toArray(new String[0]);

            //                if ((docTexts == null || docTexts.length == 0)) {
            //                    RecordServices recordServices = ConstellioSpringUtils.getRecordServices();
            //                    Long recordId = new Long(doc.getField(IndexField.RECORD_ID_FIELD).stringValue());
            //                    Record record;
            //                    try {
            //                       record = recordServices.get(recordId, collection);
            //               } catch (Exception e) {
            //                  record = null;
            //                  e.printStackTrace();
            //               }
            //                    if (record != null) {
            //                        List<Object> fieldValues = indexFieldServices.extractFieldValues(record, defaultSearchField);
            //
            //                        List<String> docTextsList = new ArrayList<String>();
            //                        for (Object fieldValue : fieldValues) {
            //                            String strFieldValue = fieldValue != null ? fieldValue.toString() : null;
            //                            if (StringUtils.isNotBlank(strFieldValue)) {
            //                                docTextsList.add(strFieldValue);
            //                            }
            //                        }
            //
            //                        if (!docTextsList.isEmpty()) {
            //                            docTexts = docTextsList.toArray(new String[0]);
            //                        }
            //                    }
            //                }
            //                // end

            if (docTexts == null)
                continue;

            TokenStream tstream = null;
            int numFragments = getMaxSnippets(fieldName, params);
            boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

            String[] summaries = null;
            List<TextFragment> frags = new ArrayList<TextFragment>();
            for (int j = 0; j < docTexts.length; j++) {
                // create TokenStream
                try {
                    // attempt term vectors
                    tstream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId,
                            fieldName);
                } catch (IllegalArgumentException e) {
                    // fall back to analyzer
                    tstream = new TokenOrderingFilter(
                            schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
                }

                Highlighter highlighter;
                if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
                    // wrap CachingTokenFilter around TokenStream for reuse
                    tstream = new CachingTokenFilter(tstream);

                    // get highlighter
                    highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

                    // after highlighter initialization, reset tstream since construction of highlighter
                    // already used it
                    tstream.reset();
                } else {
                    // use "the old way"
                    highlighter = getHighlighter(query, fieldName, req);
                }

                int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
                if (maxCharsToAnalyze < 0) {
                    highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
                } else {
                    highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
                }

                try {
                    TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, docTexts[j],
                            mergeContiguousFragments, numFragments);
                    for (int k = 0; k < bestTextFragments.length; k++) {
                        if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                            frags.add(bestTextFragments[k]);
                        }
                    }
                } catch (InvalidTokenOffsetsException e) {
                    throw new RuntimeException(e);
                }
            }
            // sort such that the fragments with the highest score come first
            Collections.sort(frags, new Comparator<TextFragment>() {
                public int compare(TextFragment arg0, TextFragment arg1) {
                    // Float.compare avoids the rounding bug of subtracting float scores.
                    return Float.compare(arg1.getScore(), arg0.getScore());
                }
            });

            // convert fragments back into text
            // TODO: we can include score and position information in output as snippet attributes
            if (frags.size() > 0) {
                ArrayList<String> fragTexts = new ArrayList<String>();
                for (TextFragment fragment : frags) {
                    if ((fragment != null) && (fragment.getScore() > 0)) {
                        //                            fragTexts.add(fragment.toString());
                        fragTexts.add(StringEscapeUtils.escapeHtml(fragment.toString()));
                    }
                    if (fragTexts.size() >= numFragments)
                        break;
                }
                summaries = fragTexts.toArray(new String[0]);
                if (summaries.length > 0)
                    docSummaries.add(fieldName, summaries);
            }
            // no summaries made, copy text from alternate field
            if (summaries == null || summaries.length == 0) {
                String alternateField = req.getParams().getFieldParam(fieldName,
                        HighlightParams.ALTERNATE_FIELD);
                if (alternateField != null && alternateField.length() > 0) {
                    String[] altTexts = doc.getValues(alternateField);
                    if (altTexts != null && altTexts.length > 0) {
                        int alternateFieldLen = req.getParams().getFieldInt(fieldName,
                                HighlightParams.ALTERNATE_FIELD_LENGTH, 0);
                        if (alternateFieldLen <= 0) {
                            docSummaries.add(fieldName, altTexts);
                        } else {
                            List<String> altList = new ArrayList<String>();
                            int len = 0;
                            for (String altText : altTexts) {
                                altList.add(len + altText.length() > alternateFieldLen
                                        ? altText.substring(0, alternateFieldLen - len)
                                        : altText);
                                len += altText.length();
                                if (len >= alternateFieldLen)
                                    break;
                            }
                            docSummaries.add(fieldName, altList);
                        }
                    }
                }
            }

        }
        String printId = schema.printableUniqueKey(doc);
        fragments.add(printId == null ? null : printId, docSummaries);
    }
    return fragments;
}
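
Unlike the other examples, this one HTML-escapes each finished fragment with StringEscapeUtils.escapeHtml, which also escapes the <B>...</B> markers the formatter inserted. If the markup should survive, Lucene's Encoder hook escapes only the text between tags; a sketch of that alternative (the helper class is hypothetical):

import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

final class EscapingHighlighter {
    // The Encoder escapes fragment text as it is emitted, so the formatter's
    // highlight tags are left intact.
    static Highlighter build(Query query, String field) {
        QueryScorer scorer = new QueryScorer(query, field);
        return new Highlighter(new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), scorer);
    }
}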

From source file: org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter.java

License: Apache License

@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    FieldMapper mapper = highlighterContext.mapper;

    Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML
            : HighlightUtils.Encoders.DEFAULT;

    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>();
        hitContext.cache().put(CACHE_KEY, mappers);
    }
    @SuppressWarnings("unchecked")
    Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext
            .cache().get(CACHE_KEY);

    org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
    if (entry == null) {
        QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query,
                field.fieldOptions().requireFieldMatch() ? mapper.fieldType().name() : null);
        queryScorer.setExpandMultiTermQuery(true);
        Fragmenter fragmenter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            fragmenter = new NullFragmenter();
        } else if (field.fieldOptions().fragmenter() == null) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else if ("simple".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
        } else if ("span".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else {
            throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter()
                    + "] for the field [" + highlighterContext.fieldName + "]");
        }
        Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0],
                field.fieldOptions().postTags()[0]);

        entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
        entry.setTextFragmenter(fragmenter);
        // always highlight across all data
        entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        cache.put(mapper, entry);
    }

    // a HACK to make highlighter do highlighting, even though it's using the single frag list builder
    int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1
            : field.fieldOptions().numberOfFragments();
    ArrayList<TextFragment> fragsList = new ArrayList<>();
    List<Object> textsToHighlight;
    Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers()
            .indexAnalyzer();

    try {
        textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);

        for (Object textToHighlight : textsToHighlight) {
            String text = textToHighlight.toString();

            try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
                if (!tokenStream.hasAttribute(CharTermAttribute.class)
                        || !tokenStream.hasAttribute(OffsetAttribute.class)) {
                    // can't perform highlighting if the stream has no terms (binary token stream) or no offsets
                    continue;
                }
                TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false,
                        numberOfFragments);
                for (TextFragment bestTextFragment : bestTextFragments) {
                    if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
                        fragsList.add(bestTextFragment);
                    }
                }
            }
        }
    } catch (Exception e) {
        if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) {
            // this can happen if for example a field is not_analyzed and ignore_above option is set.
            // the field will be ignored when indexing but the huge term is still in the source and
            // the plain highlighter will parse the source and try to analyze it.
            return null;
        } else {
            throw new FetchPhaseExecutionException(context,
                    "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
    }
    if (field.fieldOptions().scoreOrdered()) {
        CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() {
            @Override
            public int compare(TextFragment o1, TextFragment o2) {
                // Float.compare avoids the rounding bug of subtracting float scores.
                return Float.compare(o2.getScore(), o1.getScore());
            }
        });
    }
    String[] fragments;
    // number_of_fragments is set to 0 but we have a multivalued field
    if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
        fragments = new String[fragsList.size()];
        for (int i = 0; i < fragsList.size(); i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    } else {
        // refine numberOfFragments if needed
        numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments;
        fragments = new String[numberOfFragments];
        for (int i = 0; i < fragments.length; i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    }

    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }

    int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
    if (noMatchSize > 0 && textsToHighlight.size() > 0) {
        // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
        String fieldContents = textsToHighlight.get(0).toString();
        int end;
        try {
            end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(),
                    fieldContents);
        } catch (Exception e) {
            throw new FetchPhaseExecutionException(context,
                    "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
        if (end > 0) {
            return new HighlightField(highlighterContext.fieldName,
                    new Text[] { new Text(fieldContents.substring(0, end)) });
        }
    }
    return null;
}
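
Both Elasticsearch versions of this highlighter build one org.apache.lucene.search.highlight.Highlighter per field, cache it per hit, and disable the analysis cap outright with setMaxDocCharsToAnalyze(Integer.MAX_VALUE) so that every field value is highlighted in full. A condensed sketch of that per-field setup (the helper class is hypothetical; tag and fragment-size choices are illustrative):

import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;

final class PerFieldHighlighter {
    static Highlighter build(Query query, String fieldName, int fragmentCharSize) {
        QueryScorer scorer = new QueryScorer(query, fieldName);
        scorer.setExpandMultiTermQuery(true);          // rewrite wildcard/prefix queries for scoring
        Highlighter h = new Highlighter(new SimpleHTMLFormatter("<em>", "</em>"), scorer);
        h.setTextFragmenter(new SimpleSpanFragmenter(scorer, fragmentCharSize));
        h.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);  // always highlight across all data
        return h;
    }
}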

From source file: org.elasticsearch.search.highlight.PlainHighlighter.java

License: Apache License

public HighlightField highlight(HighlighterContext highlighterContext) {
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    FieldMapper<?> mapper = highlighterContext.mapper;

    Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML
            : HighlightUtils.Encoders.DEFAULT;

    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter> mappers = Maps.newHashMap();
        hitContext.cache().put(CACHE_KEY, mappers);
    }
    @SuppressWarnings("unchecked")
    Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper<?>, org.apache.lucene.search.highlight.Highlighter>) hitContext
            .cache().get(CACHE_KEY);

    org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
    if (entry == null) {
        Query query = highlighterContext.query.originalQuery();
        QueryScorer queryScorer = new CustomQueryScorer(query,
                field.fieldOptions().requireFieldMatch() ? mapper.names().indexName() : null);
        queryScorer.setExpandMultiTermQuery(true);
        Fragmenter fragmenter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            fragmenter = new NullFragmenter();
        } else if (field.fieldOptions().fragmenter() == null) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else if ("simple".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
        } else if ("span".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else {
            throw new ElasticsearchIllegalArgumentException(
                    "unknown fragmenter option [" + field.fieldOptions().fragmenter() + "] for the field ["
                            + highlighterContext.fieldName + "]");
        }
        Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0],
                field.fieldOptions().postTags()[0]);

        entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
        entry.setTextFragmenter(fragmenter);
        // always highlight across all data
        entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        cache.put(mapper, entry);
    }

    // a HACK to make highlighter do highlighting, even though it's using the single frag list builder
    int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1
            : field.fieldOptions().numberOfFragments();
    ArrayList<TextFragment> fragsList = new ArrayList<TextFragment>();
    List<Object> textsToHighlight;

    try {
        textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);

        for (Object textToHighlight : textsToHighlight) {
            String text = textToHighlight.toString();
            Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers()
                    .indexAnalyzer();
            TokenStream tokenStream = analyzer.tokenStream(mapper.names().indexName(), text);
            if (!tokenStream.hasAttribute(CharTermAttribute.class)
                    || !tokenStream.hasAttribute(OffsetAttribute.class)) {
                // can't perform highlighting if the stream has no terms (binary token stream) or no offsets
                continue;
            }
            TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false,
                    numberOfFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
                    fragsList.add(bestTextFragment);
                }
            }
        }
    } catch (Exception e) {
        throw new FetchPhaseExecutionException(context,
                "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    if (field.fieldOptions().scoreOrdered()) {
        CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() {
            public int compare(TextFragment o1, TextFragment o2) {
                // Float.compare avoids the rounding bug of subtracting float scores.
                return Float.compare(o2.getScore(), o1.getScore());
            }
        });
    }
    String[] fragments;
    // number_of_fragments is set to 0 but we have a multivalued field
    if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
        fragments = new String[fragsList.size()];
        for (int i = 0; i < fragsList.size(); i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    } else {
        // refine numberOfFragments if needed
        numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments;
        fragments = new String[numberOfFragments];
        for (int i = 0; i < fragments.length; i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    }

    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, StringText.convertFromStringArray(fragments));
    }

    int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
    if (noMatchSize > 0 && textsToHighlight.size() > 0) {
        // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
        String fieldContents = textsToHighlight.get(0).toString();
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers()
                .indexAnalyzer();
        int end;
        try {
            end = findGoodEndForNoHighlightExcerpt(noMatchSize,
                    analyzer.tokenStream(mapper.names().indexName(), fieldContents));
        } catch (Exception e) {
            throw new FetchPhaseExecutionException(context,
                    "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
        if (end > 0) {
            return new HighlightField(highlighterContext.fieldName,
                    new Text[] { new StringText(fieldContents.substring(0, end)) });
        }
    }
    return null;
}