On this page you can find example usages of the org.apache.lucene.search.highlight.Highlighter constructor.


public Highlighter(Formatter formatter, Scorer fragmentScorer) 

From source file:org.apache.nutch.searcher.Summarizer.java

License:Apache License

public static String getsummary(String queryString, String content, Analyzer analyzer) {
    if (queryString == null && content != null) {
        if (content.length() > SUM_LENGTH)
            return content.substring(0, (SUM_LENGTH) - 1);
        else/*w ww.j  a  va 2s. c  o m*/
            return content;
    } else if (queryString != null && content == null)
        return "";
    else if (queryString == null && content == null)
        return "";
    SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(cssfront, cssend);

    org.apache.lucene.search.Query summarizerQuery = null;
    QueryParser queryParse = new QueryParser("content", analyzer);
    try {
        summarizerQuery = queryParse.parse(queryString);
    } catch (ParseException ex) {
        if (content.length() > SUM_LENGTH)
            return content.substring(0, (SUM_LENGTH) - 1);
            return content;
    QueryScorer qs = new QueryScorer(summarizerQuery);
    Highlighter highlighter = new Highlighter(sHtmlF, qs);
    highlighter.setTextFragmenter(new SimpleFragmenter(SUM_LENGTH));
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
    String str;
    try {
        str = highlighter.getBestFragment(tokenStream, content);
    } catch (IOException e) {
        str = null;
    if (str == null) {
        if (content.length() > SUM_LENGTH)
            str = content.substring(0, (SUM_LENGTH) - 1);
            str = content;
    return str;

From source file:org.apache.nutch.summary.lucene.LuceneSummarizer.java

License:Apache License

/**
 * Builds a {@code Summary} of {@code text} for {@code query} using the Lucene highlighter.
 *
 * <p>Every query term is wrapped in a {@code WeightedTerm} with weight 1.0 and the highlighter
 * is asked for up to three best fragments. Each fragment is split on {@code SEPARATOR}; the
 * pieces are added alternately as {@code Fragment} and {@code Highlight}, starting with a plain
 * {@code Fragment}, and an {@code Ellipsis} is added after the pieces. When the highlighter
 * yields no fragments, leading tokens of the text are used instead.
 *
 * <p>NOTE(review): this listing looks truncated by extraction. Several closing braces and the
 * bodies of the title-matching branches are missing, so it does not compile as shown. Restore
 * it from the upstream source before relying on it.
 *
 * @param text the text to summarize
 * @param query the query whose terms are highlighted
 * @return the summary built so far; any exception raised while summarizing is swallowed
 */
public Summary getSummary(String text, Query query) {

    // Give every query term the same weight so no term dominates fragment scoring.
    String[] terms = query.getTerms();
    WeightedTerm[] weighted = new WeightedTerm[terms.length];
    for (int i = 0; i < terms.length; i++) {
        weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
        // TODO : The max number of fragments (3) should be configurable
        String[] result = highlighter.getBestFragments(tokens, text, 3);
        for (int i = 0; i < result.length; i++) {
            // Pieces alternate between plain text and highlighted text, plain text first.
            String[] parts = result[i].split(SEPARATOR);
            boolean highlight = false;
            for (int j = 0; j < parts.length; j++) {
                if (highlight) {
                    summary.add(new Highlight(parts[j]));
                } else {
                    summary.add(new Fragment(parts[j]));
                highlight = !highlight;
            summary.add(new Ellipsis());

        /* TODO MC  BUG resolved 0000029 - if query terms do not occur on text, an empty summary is returned. Now it sends the first tokens. */
        // NOTE(review): result.length was already read by the loop above, so the null half of
        // this check cannot be what triggers the fallback here.
        if (result == null || result.length == 0) {
            // Re-tokenize from the start: the first stream was handed to the highlighter.
            tokens = analyzer.tokenStream("content", new StringReader(text));

            Token firstToken = null, lastToken = null;
            Token token = null;
            int maxLen = 100; // the same as defined in SimpleFragmenter but it is private

            ArrayList<Token> titleTokens=new ArrayList<Token>();
            ArrayList<Token> textTokens=new ArrayList<Token>();
            boolean titleMatched=false;
            boolean hasMatched=false; // exit match after match title the first time
            // remove title from text. compares pairs of text
            // NOTE(review): the branch bodies of this loop are missing from the listing.
            while ((titleMatched || !hasMatched) && (token=tokens.next())!=null) {
               if (token.type().equals("<WORD>")) {
                  if (titleTokens.size()==0) {
                  else if (textTokens.size()<titleTokens.size()) {
                  if (textTokens.size()==titleTokens.size()) {
                     // compare
                     for (int i=0;i<textTokens.size() && titleMatched;i++) {
             if (!textTokens.get(i).termText().equals(titleTokens.get(i).termText())) {
                     if (titleMatched) { // try to match a larger pattern
                     else { // remove rest of title from text
             if (hasMatched) {
             else { // add one more token to title
            // Nothing left to summarize once the title tokens were consumed.
            if (textTokens.size()==0) {
               return summary;
            for (int i=0;i<textTokens.size() && textTokens.get(i).endOffset()-firstToken.startOffset()<maxLen;i++) {

            // read tokens until maxLen
            while ((token = tokens.next()) != null) {
                if (token.type().equals("<WORD>")) {
                    if (firstToken == null) {
                        firstToken = token;
                    } else if (token.endOffset() - firstToken.startOffset() < maxLen) {
                        lastToken = token;
                    } else {
            // A single word token means the fragment starts and ends on that token.
            if (lastToken == null) {
                lastToken = firstToken;

            // Cut the fallback fragment out of the original text using the token offsets.
            summary.add(new Fragment(text.substring(firstToken.startOffset(), lastToken.endOffset())));
            summary.add(new Ellipsis());
        /* TODO MC */

    } catch (Exception e) {
        // Nothing to do...
    return summary;

From source file:org.apache.zeppelin.search.LuceneSearch.java

License:Apache License

public List<Map<String, String>> query(String queryStr) {
    if (null == directory) {
        throw new IllegalStateException("Something went wrong on instance creation time, index dir is null");
    }/*from www  .  j  av  a2s .  co  m*/
    List<Map<String, String>> result = Collections.emptyList();
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        Analyzer analyzer = new StandardAnalyzer();
        MultiFieldQueryParser parser = new MultiFieldQueryParser(
                new String[] { SEARCH_FIELD_TEXT, SEARCH_FIELD_TITLE }, analyzer);

        Query query = parser.parse(queryStr);
        logger.debug("Searching for: " + query.toString(SEARCH_FIELD_TEXT));

        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

        result = doSearch(indexSearcher, query, analyzer, highlighter);
    } catch (IOException e) {
        logger.error("Failed to open index dir {}, make sure indexing finished OK", directory, e);
    } catch (ParseException e) {
        logger.error("Failed to parse query " + queryStr, e);
    return result;

From source file:org.carrot2.source.lucene.SimpleFieldMapper.java

License:Open Source License

private void resetHighlighter() {
    if (formatter != null) {
        this.highlighter = new Highlighter(formatter, new QueryScorer(query));
        this.highlighter.setEncoder(new DefaultEncoder());
    } else {/*w w  w . j ava  2 s  .  com*/
        this.highlighter = null;

From source file:org.eclipse.rdf4j.sail.lucene.LuceneIndex.java

License:Open Source License

/**
 * Parse the passed query.
 *
 * @param query
 *        string
 * @return the parsed query
 * @throws ParseException
 *         when the parsing breaks
 */
protected Iterable<? extends DocumentScore> query(Resource subject, String query, URI propertyURI,
        boolean highlight) throws MalformedQueryException, IOException {
    // NOTE(review): this listing looks truncated by extraction. Closing braces are missing and
    // one constructor call is cut off, so it does not compile as shown.
    Query q;
    try {
        q = getQueryParser(propertyURI).parse(query);
    } catch (ParseException e) {
        // A Lucene parse failure is surfaced to callers as MalformedQueryException, keeping the cause.
        throw new MalformedQueryException(e);

    // A highlighter is only built when the caller asked for highlighting; otherwise it stays null.
    final Highlighter highlighter;
    if (highlight) {
        // NOTE(review): the argument list below ends after the pre-tag; the remaining
        // argument(s) are missing from the listing.
        Formatter formatter = new SimpleHTMLFormatter(SearchFields.HIGHLIGHTER_PRE_TAG,
        highlighter = new Highlighter(formatter, new QueryScorer(q));
    } else {
        highlighter = null;

    // Use the subject-specific search overload when a subject is supplied.
    TopDocs docs;
    if (subject != null) {
        docs = search(subject, q);
    } else {
        docs = search(q);
    // Wrap each raw ScoreDoc as a LuceneDocumentScore carrying the (possibly null) highlighter.
    return Iterables.transform(Arrays.asList(docs.scoreDocs), new Function<ScoreDoc, DocumentScore>() {

        public DocumentScore apply(ScoreDoc doc) {
            return new LuceneDocumentScore(doc, highlighter, LuceneIndex.this);

From source file:org.eclipse.rdf4j.sail.lucene.LuceneQuery.java

License:Open Source License

/**
 * Creates the highlighter for this query from a {@code SimpleHTMLFormatter} and a
 * {@code QueryScorer} over {@code query}, and stores it in the {@code highlighter} field.
 *
 * <p>NOTE(review): the formatter's argument list is cut off after the pre-tag in this listing,
 * and the closing brace is missing. The {@code property} parameter is not used in the visible
 * code.
 *
 * @param property not read by the visible code
 */
@Deprecated
public void highlight(URI property) {
    Formatter formatter = new SimpleHTMLFormatter(SearchFields.HIGHLIGHTER_PRE_TAG,
    highlighter = new Highlighter(formatter, new QueryScorer(query));

From source file:org.eclipse.skalli.core.search.LuceneIndex.java

License:Open Source License

/**
 * Searches the index for entities similar to {@code entity} using Lucene's {@code MoreLikeThis}.
 *
 * <p>The entity's own document is looked up by UUID, a similarity query is derived from it,
 * and up to {@code count + 1} hits are collected (capped at {@code entityService.size()}).
 * The base document itself is skipped when the hits are converted.
 *
 * <p>NOTE(review): this listing looks truncated by extraction. Closing braces, the start of the
 * logging call in the catch block, the finally body, and the statements that would copy
 * {@code searchHits}, {@code pagingInfo}, {@code totalHitCount} and the duration into the
 * result are not visible.
 *
 * @param entity the entity to find similar entities for
 * @param fields the fields passed on to {@code getSearchHit}
 * @param count the number of similar entities requested
 * @return the {@code SearchResult} instance created at the start of the method
 */
public synchronized SearchResult<T> moreLikeThis(T entity, String[] fields, int count) {
    long start = System.nanoTime();
    SearchResult<T> moreLikeThis = new SearchResult<T>();
    List<SearchHit<T>> searchHits = new LinkedList<SearchHit<T>>();
    PagingInfo pagingInfo = new PagingInfo(0, 0);
    int totalHitCount = 0;
    if (initialized) {
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(directory);
            searcher = new IndexSearcher(reader);
            ScoreDoc baseDoc = getDocByUUID(searcher, entity.getUuid());
            if (baseDoc != null) {
                MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
                Query query = mlt.like(baseDoc.doc);
                int numHits = Math.min(count + 1, entityService.size()); // count + 1: baseDoc will be one of the hits
                TopScoreDocCollector collector = TopScoreDocCollector.create(numHits, false);
                searcher.search(query, collector);

                List<String> fieldList = Arrays.asList(fields);
                Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                for (ScoreDoc hit : collector.topDocs().scoreDocs) {
                    // Skip the document the similarity query was derived from.
                    if (hit.doc != baseDoc.doc) {
                        Document doc = searcher.doc(hit.doc);
                        SearchHit<T> searchHit = getSearchHit(doc, fieldList, hit.score, highlighter);
                pagingInfo = new PagingInfo(0, count);
                // The base document is counted among the total hits, so subtract it.
                totalHitCount = collector.getTotalHits() - 1;
        } catch (Exception e) {
                    // NOTE(review): the call that consumes this formatted message is missing above.
                    MessageFormat.format("Searching for entities similiar to ''{0}'' failed", entity.getUuid()),
        } finally {

    // Elapsed time, converted from nanoseconds to milliseconds.
    long nanoDuration = System.nanoTime() - start;
    long milliDuration = Math.round(nanoDuration / 1000000d);

    return moreLikeThis;

From source file:org.eclipse.skalli.core.search.LuceneIndex.java

License:Open Source License

/**
 * Runs a paged full-text search and returns the caller-supplied result object {@code ret}.
 *
 * <p>A {@code null} {@code pagingInfo} defaults to start 0 and count 10. An empty query or
 * {@code "*"} takes a sub-list of all entities from {@code entityService} instead of querying
 * the index. Otherwise, if the index is initialized, the query is parsed with a
 * {@code MultiFieldQueryParser} over {@code fields} and executed with either a plain or a
 * faceted collector, depending on whether {@code facetFields} is {@code null}.
 *
 * <p>NOTE(review): this listing looks truncated by extraction. Closing braces, the finally
 * body, and the statements that would copy hits, counts and the duration into {@code ret} are
 * not visible.
 *
 * @param fields the fields to search in and to pass on to {@code getSearchHit}
 * @param facetFields the facet fields, or {@code null} for a non-faceted search
 * @param queryString the query; {@code "*"} or empty selects all entities
 * @param pagingInfo the requested page, or {@code null} for the default page
 * @param ret the result object to return
 * @return {@code ret}
 * @throws QueryParseException declared by the signature; presumably raised by {@code getQuery}
 *         — confirm against that helper
 */
private <R extends SearchResult<T>> R search(final String[] fields, String facetFields[],
        final String queryString, PagingInfo pagingInfo, R ret) throws QueryParseException {
    long start = System.nanoTime();
    List<SearchHit<T>> resultList = new LinkedList<SearchHit<T>>();
    int totalHitCount = 0;
    if (pagingInfo == null) {
        pagingInfo = new PagingInfo(0, 10);
    }
    if (StringUtils.equals("*", queryString) || StringUtils.isEmpty(queryString)) { //$NON-NLS-1$
        // Clamp both page bounds to the list size so subList cannot run past the end.
        List<T> allEntities = entityService.getAll();
        List<T> sublist = allEntities.subList(Math.min(pagingInfo.getStart(), allEntities.size()),
                Math.min(pagingInfo.getStart() + pagingInfo.getCount(), allEntities.size()));
        // NOTE(review): sublist is not used in the visible code; the consuming statements look missing.
        totalHitCount = allEntities.size();
    } else if (initialized) {
        List<String> fieldList = Arrays.asList(fields);
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(directory);
            searcher = new IndexSearcher(reader);
            QueryParser parser = new MultiFieldQueryParser(LUCENE_VERSION, fields, analyzer);
            Query query = getQuery(parser, queryString);

            // it is not possible that we have more hits than projects!
            int maxHits = entityService.size();
            int numHits = pagingInfo.getStart() + pagingInfo.getCount();
            // A negative sum (int overflow of start + count) is treated like "too many".
            if (numHits < 0 || numHits > maxHits) {
                numHits = maxHits;
            if (numHits > 0) {
                TopDocsCollector<ScoreDoc> collector;
                if (facetFields == null) {
                    collector = TopScoreDocCollector.create(numHits, false);
                } else {
                    collector = new FacetedCollector(facetFields, searcher.getIndexReader(), numHits);

                searcher.search(query, collector);
                Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                // Only the requested page of the collected hits is converted.
                TopDocs topDocs = collector.topDocs(pagingInfo.getStart(), pagingInfo.getCount());
                for (ScoreDoc hit : topDocs.scoreDocs) {
                    Document doc = searcher.doc(hit.doc);
                    SearchHit<T> searchHit = getSearchHit(doc, fieldList, hit.score, highlighter);

                totalHitCount = collector.getTotalHits();
                // Facet information is only transferred when both collector and result support it.
                if (collector instanceof FacetedCollector && ret instanceof FacetedSearchResult) {
                    ((FacetedSearchResult<T>) ret).setFacetInfo(((FacetedCollector) collector).getFacetsMap());
        } catch (Exception e) {
            LOG.error(MessageFormat.format("Searching with query ''{0}'' failed", queryString), e);
        } finally {

    // Elapsed time, converted from nanoseconds to milliseconds.
    long nanoDuration = System.nanoTime() - start;
    long milliDuration = Math.round(nanoDuration / 1000000d);
    return ret;

From source file:org.eclipse.smila.search.lucene.index.IndexConnection.java

License:Open Source License

/** {@inheritDoc} */
protected void addHighlightAnnotation(final IQueryExpression dQE, final String recordId, final AnyMap highlight,
        final int fieldNo, final String attributeName, final String indexName) throws IndexException {
    // Purpose: run the Lucene highlighter over the text of field fieldNo of the hit identified
    // by recordId, and store the highlight positions gathered by the formatter in the
    // highlight map under attributeName.
    // NOTE(review): this listing looks truncated by extraction. Closing braces are missing and
    // the getBestTextFragments call is cut off. indexName is not used in the visible code.
    final DIndexStructure dIS = _index.getIndexStructure();
    // NOTE(review): doc is dereferenced below without a null check; confirm _hits always
    // contains recordId.
    final Document doc = _hits.get(recordId);

    final DIndexField field = (DIndexField) dIS.getField(fieldNo);
    if (field == null) {
        throw new IndexException("invalid field no in result [" + fieldNo + "]");

    // Fields without a value in the document are skipped silently.
    final String text = doc.get(field.getName());
    if (text != null) {
        final AnnotationFormatter formatter = new AnnotationFormatter(highlight.getFactory());
        try {
            final Query hlQuery = getHighlightQuery(fieldNo, dQE);
            if (hlQuery != null) {
                final TokenStream tokenStream = _analyzer.tokenStream(attributeName, new StringReader(text));
                final Highlighter highlighter = new Highlighter(formatter, new QueryScorer(hlQuery));
                // this triggers the execution of the Formatter
                highlighter.getBestTextFragments(tokenStream, text, MERGE_CONTIGUOSE_FRAGMENTS,

                // The fragments themselves are not used; only the positions the formatter gathered.
                final AnyMap attributeHighlight = highlight.getFactory().createAnyMap();
                final AnySeq highlightingPositions = formatter.getHighlightPositions();
                attributeHighlight.put(SearchResultConstants.HIGHLIGHT_POSITIONS, highlightingPositions);
                highlight.put(attributeName, attributeHighlight);
        } catch (final Exception ex) {
            // Any failure is wrapped in IndexException with the record id and the original cause.
            throw new IndexException("error getting result value for record with id " + recordId, ex);

From source file:org.haplo.app.SearchResultExcerptHighlighter.java

License:Mozilla Public License

static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) {
    try {/*from  ww w . j  a v  a 2  s  . c  o m*/
        // Scorer selects the terms which need highlighting. Created from a 'query' based on the extracted search terms.
        Scorer scorer;
        Fragmenter fragmenter;
        if (searchTerms != null && searchTerms.length() > 0) {
            QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer());
            Query query = queryParser.parse(searchTerms);
            scorer = new QueryScorer(query);
            fragmenter = new SimpleSpanFragmenter((QueryScorer) scorer, maxExcerptLength);
        } else {
            scorer = new NoHighlightingScorer();
            fragmenter = new SimpleFragmenter(maxExcerptLength);

        // Parse the escaped text into tokens, which retain the positions in the text
        StandardAnalyzer analyser = new StandardAnalyzer();
        TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText));

        // Finally, do the highlighting!
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer);
        return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS);
    } catch (Exception e) {
        Logger.getLogger("org.haplo.app").info("Exception in SearchResultExcerptHighlighter: ", e);
        return null;