Example usage for org.apache.lucene.search.highlight Highlighter Highlighter

List of usage examples for org.apache.lucene.search.highlight Highlighter Highlighter

Introduction

On this page you can find example usage for org.apache.lucene.search.highlight Highlighter Highlighter.

Prototype

public Highlighter(Formatter formatter, Scorer fragmentScorer) 

Source Link

Usage

From source file:org.apache.nutch.searcher.Summarizer.java

License:Apache License

/**
 * Builds a summary of {@code content} with query terms highlighted via Lucene's
 * {@link Highlighter}. Falls back to a plain truncation of the content when the
 * query is null, unparsable, or yields no best fragment.
 *
 * @param queryString the user query; may be {@code null}
 * @param content the text to summarize; may be {@code null}
 * @param analyzer analyzer used both for query parsing and tokenizing the content
 * @return the highlighted fragment, a truncated copy of the content, or "" when
 *         there is no content
 */
public static String getsummary(String queryString, String content, Analyzer analyzer) {
    // No content to summarize: nothing useful to return.
    if (content == null) {
        return "";
    }
    // No query: degrade to a plain truncation of the content.
    if (queryString == null) {
        return truncateToSumLength(content);
    }
    SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(cssfront, cssend);

    org.apache.lucene.search.Query summarizerQuery = null;
    QueryParser queryParse = new QueryParser("content", analyzer);
    try {
        summarizerQuery = queryParse.parse(queryString);
    } catch (ParseException ex) {
        // Unparsable query: degrade to a plain truncation instead of failing.
        return truncateToSumLength(content);
    }
    QueryScorer qs = new QueryScorer(summarizerQuery);
    Highlighter highlighter = new Highlighter(sHtmlF, qs);
    highlighter.setTextFragmenter(new SimpleFragmenter(SUM_LENGTH));
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
    String str;
    try {
        str = highlighter.getBestFragment(tokenStream, content);
    } catch (IOException e) {
        str = null;
    }
    // No best fragment found (or tokenization failed): fall back to truncation.
    if (str == null) {
        str = truncateToSumLength(content);
    }
    return str;
}

/**
 * Caps {@code content} at {@code SUM_LENGTH - 1} characters (preserves the
 * historical cut-off used by the original inline truncation code).
 */
private static String truncateToSumLength(String content) {
    if (content.length() > SUM_LENGTH) {
        return content.substring(0, (SUM_LENGTH) - 1);
    }
    return content;
}

From source file:org.apache.nutch.summary.lucene.LuceneSummarizer.java

License:Apache License

/**
 * Builds a {@link Summary} for the given text by highlighting occurrences of
 * the query terms. When no query term occurs in the text, falls back to the
 * leading tokens of the text so the summary is never empty for non-empty text.
 *
 * @param text the document text to summarize
 * @param query the query whose terms are highlighted
 * @return the assembled summary; possibly partial if tokenization fails
 */
public Summary getSummary(String text, Query query) {

    // Give every query term equal weight for the scorer.
    String[] terms = query.getTerms();
    WeightedTerm[] weighted = new WeightedTerm[terms.length];
    for (int i = 0; i < terms.length; i++) {
        weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
        // TODO : The max number of fragments (3) should be configurable
        String[] result = highlighter.getBestFragments(tokens, text, 3);
        for (int i = 0; i < result.length; i++) {
            // FORMATTER emits SEPARATOR around highlighted terms, so the split
            // alternates plain fragment / highlighted term.
            String[] parts = result[i].split(SEPARATOR);
            boolean highlight = false;
            for (int j = 0; j < parts.length; j++) {
                if (highlight) {
                    summary.add(new Highlight(parts[j]));
                } else {
                    summary.add(new Fragment(parts[j]));
                }
                highlight = !highlight;
            }
            summary.add(new Ellipsis());
        }

        /* MC BUG 0000029: if the query terms do not occur in the text, an empty
         * summary used to be returned. Fall back to the first tokens instead. */
        if (result == null || result.length == 0) {
            tokens = analyzer.tokenStream("content", new StringReader(text));

            Token firstToken = null, lastToken = null;
            Token token = null;
            int maxLen = 100; // the same as defined in SimpleFragmenter but it is private

            // Collect <WORD> tokens until the window spans maxLen characters.
            while ((token = tokens.next()) != null) {
                if (token.type().equals("<WORD>")) {
                    if (firstToken == null) {
                        firstToken = token;
                    } else if (token.endOffset() - firstToken.startOffset() < maxLen) {
                        lastToken = token;
                    } else {
                        break;
                    }
                }
            }
            // Guard: the text may contain no <WORD> tokens at all. Previously
            // this case hit a NullPointerException on the substring below that
            // was silently swallowed by the catch clause; skip the fallback
            // fragment explicitly instead.
            if (firstToken != null) {
                if (lastToken == null) {
                    lastToken = firstToken;
                }
                summary.add(new Fragment(text.substring(firstToken.startOffset(), lastToken.endOffset())));
                summary.add(new Ellipsis());
            }
        }

    } catch (Exception e) {
        // Best effort by design: any tokenization/highlighting failure returns
        // the (possibly partial) summary built so far.
    }
    return summary;
}

From source file:org.apache.zeppelin.search.LuceneSearch.java

License:Apache License

@Override
public List<Map<String, String>> query(String queryStr) {
    if (null == directory) {
        throw new IllegalStateException("Something went wrong on instance creation time, index dir is null");
    }
    List<Map<String, String>> result = Collections.emptyList();
    // Open a fresh reader per query; closed automatically on exit.
    try (IndexReader reader = DirectoryReader.open(directory)) {
        Analyzer analyzer = new StandardAnalyzer();
        IndexSearcher searcher = new IndexSearcher(reader);
        String[] searchFields = new String[] { SEARCH_FIELD_TEXT, SEARCH_FIELD_TITLE };
        MultiFieldQueryParser parser = new MultiFieldQueryParser(searchFields, analyzer);

        Query parsedQuery = parser.parse(queryStr);
        logger.debug("Searching for: " + parsedQuery.toString(SEARCH_FIELD_TEXT));

        // Highlight matches with the default <B>...</B> formatter.
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(),
                new QueryScorer(parsedQuery));

        result = doSearch(searcher, parsedQuery, analyzer, highlighter);
    } catch (IOException e) {
        logger.error("Failed to open index dir {}, make sure indexing finished OK", directory, e);
    } catch (ParseException e) {
        logger.error("Failed to parse query " + queryStr, e);
    }
    return result;
}

From source file:org.carrot2.source.lucene.SimpleFieldMapper.java

License:Open Source License

/** Rebuilds the highlighter from the current formatter/query, or clears it. */
private void resetHighlighter() {
    if (formatter == null) {
        // No formatter configured: highlighting is disabled.
        this.highlighter = null;
        return;
    }
    Highlighter fresh = new Highlighter(formatter, new QueryScorer(query));
    fresh.setEncoder(new DefaultEncoder());
    this.highlighter = fresh;
}

From source file:org.eclipse.rdf4j.sail.lucene.LuceneIndex.java

License:Open Source License

/**
 * Parses the passed query string and runs it against the index, optionally
 * preparing a highlighter for the matching documents.
 *
 * @param subject
 *        when non-null, restricts the search to documents for this subject
 * @param query
 *        the raw query string to parse
 * @param propertyURI
 *        property used to select the query parser/field
 * @param highlight
 *        whether to attach a highlighter to the returned scores
 * @return the scored documents, lazily wrapped as {@link DocumentScore}s
 * @throws MalformedQueryException
 *         when the query string cannot be parsed
 * @throws IOException
 *         when the index search fails
 */
@Override
protected Iterable<? extends DocumentScore> query(Resource subject, String query, URI propertyURI,
        boolean highlight) throws MalformedQueryException, IOException {
    Query q;
    try {
        q = getQueryParser(propertyURI).parse(query);
    } catch (ParseException e) {
        // Surface parse failures as the API's own exception type.
        throw new MalformedQueryException(e);
    }

    // Only build the (relatively expensive) highlighter when requested.
    final Highlighter highlighter;
    if (highlight) {
        Formatter formatter = new SimpleHTMLFormatter(SearchFields.HIGHLIGHTER_PRE_TAG,
                SearchFields.HIGHLIGHTER_POST_TAG);
        highlighter = new Highlighter(formatter, new QueryScorer(q));
    } else {
        highlighter = null;
    }

    TopDocs docs;
    if (subject != null) {
        docs = search(subject, q);
    } else {
        docs = search(q);
    }
    // Wrap lazily; LuceneDocumentScore resolves fields on demand.
    return Iterables.transform(Arrays.asList(docs.scoreDocs), new Function<ScoreDoc, DocumentScore>() {

        @Override
        public DocumentScore apply(ScoreDoc doc) {
            return new LuceneDocumentScore(doc, highlighter, LuceneIndex.this);
        }
    });
}

From source file:org.eclipse.rdf4j.sail.lucene.LuceneQuery.java

License:Open Source License

/** Enables highlighting for subsequent results using the configured tags. */
@Override
@Deprecated
public void highlight(URI property) {
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(SearchFields.HIGHLIGHTER_PRE_TAG,
            SearchFields.HIGHLIGHTER_POST_TAG);
    highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
}

From source file:org.eclipse.skalli.core.search.LuceneIndex.java

License:Open Source License

/**
 * Finds up to {@code count} entities similar to the given entity using
 * Lucene's {@link MoreLikeThis} over the given fields. The entity itself is
 * excluded from the hits.
 *
 * @param entity the reference entity
 * @param fields index fields to compare on
 * @param count maximum number of similar entities to return
 * @return a search result with hits, paging info, hit count and duration;
 *         empty when the index is not initialized or the entity is not indexed
 */
public synchronized SearchResult<T> moreLikeThis(T entity, String[] fields, int count) {
    long start = System.nanoTime();
    SearchResult<T> moreLikeThis = new SearchResult<T>();
    List<SearchHit<T>> searchHits = new LinkedList<SearchHit<T>>();
    PagingInfo pagingInfo = new PagingInfo(0, 0);
    int totalHitCount = 0;
    if (initialized) {
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(directory);
            searcher = new IndexSearcher(reader);
            ScoreDoc baseDoc = getDocByUUID(searcher, entity.getUuid());
            if (baseDoc != null) {
                MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
                mlt.setFieldNames(fields);
                mlt.setMinWordLen(2);
                mlt.setBoost(true);
                // Zero frequency thresholds: include every term, even rare ones.
                mlt.setMinDocFreq(0);
                mlt.setMinTermFreq(0);
                mlt.setAnalyzer(analyzer);
                Query query = mlt.like(baseDoc.doc);
                int numHits = Math.min(count + 1, entityService.size()); // count + 1: baseDoc will be one of the hits
                TopScoreDocCollector collector = TopScoreDocCollector.create(numHits, false);
                searcher.search(query, collector);

                List<String> fieldList = Arrays.asList(fields);
                Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                for (ScoreDoc hit : collector.topDocs().scoreDocs) {
                    // Skip the reference entity itself.
                    if (hit.doc != baseDoc.doc) {
                        Document doc = searcher.doc(hit.doc);
                        SearchHit<T> searchHit = getSearchHit(doc, fieldList, hit.score, highlighter);
                        searchHits.add(searchHit);
                    }
                }
                pagingInfo = new PagingInfo(0, count);
                totalHitCount = collector.getTotalHits() - 1; // -1: exclude baseDoc
            }
        } catch (Exception e) {
            LOG.error(
                    MessageFormat.format("Searching for entities similiar to ''{0}'' failed", entity.getUuid()),
                    e);
        } finally {
            closeQuietly(searcher);
            closeQuietly(reader);
        }
    }

    long nanoDuration = System.nanoTime() - start;
    long milliDuration = Math.round(nanoDuration / 1000000d);
    moreLikeThis.setPagingInfo(pagingInfo);
    moreLikeThis.setResultCount(totalHitCount);
    moreLikeThis.setResult(searchHits);
    moreLikeThis.setDuration(milliDuration);
    return moreLikeThis;
}

From source file:org.eclipse.skalli.core.search.LuceneIndex.java

License:Open Source License

/**
 * Runs a (possibly faceted) multi-field search and fills the given result
 * object with hits, paging info, hit count and duration.
 *
 * A query of "*" or an empty query bypasses Lucene and pages over all
 * entities directly. Search failures are logged and yield an empty result
 * rather than propagating.
 *
 * @param fields index fields to search in
 * @param facetFields fields to facet on, or {@code null} for a plain search
 * @param queryString the raw query string
 * @param pagingInfo requested page; defaults to (0, 10) when {@code null}
 * @param ret the result object to populate and return
 * @return {@code ret}, populated
 * @throws QueryParseException when the query string cannot be parsed
 */
private <R extends SearchResult<T>> R search(final String[] fields, String facetFields[],
        final String queryString, PagingInfo pagingInfo, R ret) throws QueryParseException {
    long start = System.nanoTime();
    List<SearchHit<T>> resultList = new LinkedList<SearchHit<T>>();
    int totalHitCount = 0;
    if (pagingInfo == null) {
        pagingInfo = new PagingInfo(0, 10);
    }
    // Wildcard/empty query: skip Lucene entirely and page over all entities.
    if (StringUtils.equals("*", queryString) || StringUtils.isEmpty(queryString)) { //$NON-NLS-1$
        List<T> allEntities = entityService.getAll();
        List<T> sublist = allEntities.subList(Math.min(pagingInfo.getStart(), allEntities.size()),
                Math.min(pagingInfo.getStart() + pagingInfo.getCount(), allEntities.size()));
        resultList.addAll(entitiesToHit(sublist));
        totalHitCount = allEntities.size();
    } else if (initialized) {
        List<String> fieldList = Arrays.asList(fields);
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(directory);
            searcher = new IndexSearcher(reader);
            QueryParser parser = new MultiFieldQueryParser(LUCENE_VERSION, fields, analyzer);
            Query query = getQuery(parser, queryString);

            // it is not possible that we have more hits than projects!
            int maxHits = entityService.size();
            int numHits = pagingInfo.getStart() + pagingInfo.getCount();
            if (numHits < 0 || numHits > maxHits) {
                // numHits < 0 guards against integer overflow of start + count.
                numHits = maxHits;
            }
            if (numHits > 0) {
                // Facet search uses a dedicated collector that also gathers facets.
                TopDocsCollector<ScoreDoc> collector;
                if (facetFields == null) {
                    collector = TopScoreDocCollector.create(numHits, false);
                } else {
                    collector = new FacetedCollector(facetFields, searcher.getIndexReader(), numHits);
                }

                searcher.search(query, collector);
                Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
                TopDocs topDocs = collector.topDocs(pagingInfo.getStart(), pagingInfo.getCount());
                for (ScoreDoc hit : topDocs.scoreDocs) {
                    Document doc = searcher.doc(hit.doc);
                    SearchHit<T> searchHit = getSearchHit(doc, fieldList, hit.score, highlighter);
                    resultList.add(searchHit);
                }

                totalHitCount = collector.getTotalHits();
                // Propagate facet info only when both sides support it.
                if (collector instanceof FacetedCollector && ret instanceof FacetedSearchResult) {
                    ((FacetedSearchResult<T>) ret).setFacetInfo(((FacetedCollector) collector).getFacetsMap());
                }
            }
        } catch (Exception e) {
            // Best effort: log and fall through with an empty result.
            LOG.error(MessageFormat.format("Searching with query ''{0}'' failed", queryString), e);
        } finally {
            closeQuietly(searcher);
            closeQuietly(reader);
        }
    }

    long nanoDuration = System.nanoTime() - start;
    long milliDuration = Math.round(nanoDuration / 1000000d);
    ret.setPagingInfo(pagingInfo);
    ret.setQueryString(queryString);
    ret.setResultCount(totalHitCount);
    ret.setResult(resultList);
    ret.setDuration(milliDuration);
    return ret;
}

From source file:org.eclipse.smila.search.lucene.index.IndexConnection.java

License:Open Source License

/**
 * {@inheritDoc}
 *
 * Runs the highlighter over the stored field text of the given record and
 * stores the resulting highlight positions and the original text into the
 * {@code highlight} map under {@code attributeName}. Does nothing when the
 * field has no stored text or no highlight query can be derived.
 */
@Override
protected void addHighlightAnnotation(final IQueryExpression dQE, final String recordId, final AnyMap highlight,
        final int fieldNo, final String attributeName, final String indexName) throws IndexException {
    final DIndexStructure dIS = _index.getIndexStructure();
    final Document doc = _hits.get(recordId);

    final DIndexField field = (DIndexField) dIS.getField(fieldNo);
    if (field == null) {
        throw new IndexException("invalid field no in result [" + fieldNo + "]");
    }

    final String text = doc.get(field.getName());
    if (text != null) {
        // AnnotationFormatter records highlight positions rather than
        // producing marked-up text.
        final AnnotationFormatter formatter = new AnnotationFormatter(highlight.getFactory());
        formatter.reset(text);
        try {
            final Query hlQuery = getHighlightQuery(fieldNo, dQE);
            if (hlQuery != null) {
                final TokenStream tokenStream = _analyzer.tokenStream(attributeName, new StringReader(text));
                final Highlighter highlighter = new Highlighter(formatter, new QueryScorer(hlQuery));
                // this triggers the execution of the Formatter
                highlighter.getBestTextFragments(tokenStream, text, MERGE_CONTIGUOSE_FRAGMENTS,
                        MAX_NUMBER_FRAGMENTS);

                final AnyMap attributeHighlight = highlight.getFactory().createAnyMap();
                final AnySeq highlightingPositions = formatter.getHighlightPositions();
                attributeHighlight.put(SearchResultConstants.HIGHLIGHT_POSITIONS, highlightingPositions);
                attributeHighlight.put(SearchResultConstants.HIGHLIGHT_TEXT,
                        highlight.getFactory().createStringValue(text));
                highlight.put(attributeName, attributeHighlight);
            }
        } catch (final Exception ex) {
            throw new IndexException("error getting result value for record with id " + recordId, ex);
        }
    }
}

From source file:org.haplo.app.SearchResultExcerptHighlighter.java

License:Mozilla Public License

/**
 * Produces up to NUMBER_OF_FRAGMENTS highlighted excerpts of the given
 * (already HTML-escaped) text, wrapping matched terms in &lt;b&gt; tags.
 * Returns {@code null} when highlighting fails for any reason.
 */
static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) {
    try {
        // Scorer selects the terms to highlight; fragmenter sizes the excerpts.
        final Scorer scorer;
        final Fragmenter fragmenter;
        if (searchTerms == null || searchTerms.length() == 0) {
            // No terms: highlight nothing, just produce fixed-size fragments.
            scorer = new NoHighlightingScorer();
            fragmenter = new SimpleFragmenter(maxExcerptLength);
        } else {
            QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer());
            QueryScorer queryScorer = new QueryScorer(queryParser.parse(searchTerms));
            scorer = queryScorer;
            fragmenter = new SimpleSpanFragmenter(queryScorer, maxExcerptLength);
        }

        // Tokenize the escaped text; tokens retain their offsets into it.
        StandardAnalyzer analyser = new StandardAnalyzer();
        TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText));

        // Run the highlighter over the token stream.
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS);
    } catch (Exception e) {
        Logger.getLogger("org.haplo.app").info("Exception in SearchResultExcerptHighlighter: ", e);
        return null;
    }
}