Example usage for org.apache.lucene.search.highlight Highlighter getBestFragments

List of usage examples for org.apache.lucene.search.highlight Highlighter getBestFragments

Introduction

In this page you can find the example usage for org.apache.lucene.search.highlight Highlighter getBestFragments.

Prototype

public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments,
        String separator) throws IOException, InvalidTokenOffsetsException 

Source Link

Document

Highlights terms in the text, extracting the most relevant sections and concatenating the chosen fragments with a separator (typically "...").

Usage

From source file:it.eng.spagobi.commons.utilities.indexing.LuceneSearcher.java

License:Mozilla Public License

/**
 * Runs a fuzzy search for {@code queryString} over the given fields, restricted to the
 * current tenant and (optionally) to documents carrying a specific metadata name.
 *
 * @param queryString      term to match fuzzily against each field
 * @param metaDataToSearch metadata name to additionally require, or {@code null} to skip
 * @return map containing the raw {@code ScoreDoc[]} under key {@code "hits"} plus one
 *         highlighted summary string per matched document, keyed by its biobject id
 * @throws IOException    on index access failure
 * @throws ParseException declared for interface compatibility
 */
public static HashMap<String, Object> searchIndexFuzzy(IndexSearcher searcher, String queryString, String index,
        String[] fields, String metaDataToSearch) throws IOException, ParseException {
    logger.debug("IN");
    HashMap<String, Object> objectsToReturn = new HashMap<String, Object>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    BooleanQuery orQuery = new BooleanQuery();
    BooleanQuery andQuery = new BooleanQuery();
    // OR one fuzzy query per field; rewrite() expands the fuzzy terms so the
    // highlighter can later see the concrete terms that matched.
    for (int i = 0; i < fields.length; i++) {
        Query query = new FuzzyQuery(new Term(fields[i], queryString));
        query = query.rewrite(searcher.getIndexReader());
        orQuery.add(query, BooleanClause.Occur.SHOULD);
    }
    andQuery.add(orQuery, BooleanClause.Occur.MUST);
    if (metaDataToSearch != null) {
        // search for query string on metadata name field and content
        // where metadata name = metaDataToSearch
        Query queryMetadata = new TermQuery(new Term(IndexingConstants.METADATA, metaDataToSearch));
        andQuery.add(queryMetadata, BooleanClause.Occur.MUST);
    }

    // Always scope results to the current tenant.
    Query tenantQuery = new TermQuery(new Term(IndexingConstants.TENANT, getTenant()));
    andQuery.add(tenantQuery, BooleanClause.Occur.MUST);

    logger.debug("Searching for: " + andQuery.toString());
    int hitsPerPage = 50;

    // Collect enough docs to show 5 pages
    TopScoreDocCollector collector = TopScoreDocCollector.create(5 * hitsPerPage, false);
    searcher.search(andQuery, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;
    objectsToReturn.put("hits", hits);

    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(andQuery));

    if (hits != null) {
        for (int i = 0; i < hits.length; i++) {
            ScoreDoc hit = hits[i];
            Document doc = searcher.doc(hit.doc);
            String biobjId = doc.get(IndexingConstants.BIOBJ_ID);
            try {
                Integer idobj = Integer.valueOf(biobjId);

                String contentToSearchOn = fillSummaryText(idobj);
                // Reuse the analyzer created above instead of building a new one per hit.
                String[] summaries = highlighter.getBestFragments(analyzer,
                        IndexingConstants.CONTENTS, contentToSearchOn, 3);

                StringBuilder summaryBuffer = new StringBuilder();
                if (summaries.length > 0) {
                    summaryBuffer.append(summaries[0]);
                }
                for (int j = 1; j < summaries.length; j++) {
                    summaryBuffer.append(" ... ");
                    summaryBuffer.append(summaries[j]);
                }
                String summary = summaryBuffer.toString();
                // Show only a bounded portion of the summary.
                if (summary.length() > 101) {
                    summary = summary.substring(0, 100);
                    summary += "...";
                }
                objectsToReturn.put(biobjId, summary);
            } catch (Exception e) {
                // Covers InvalidTokenOffsetsException, NumberFormatException, etc.:
                // a failed summary must not abort the whole result set.
                logger.error(e.getMessage(), e);
            }
        }
    }

    int numTotalHits = collector.getTotalHits();
    logger.info(numTotalHits + " total matching documents");

    logger.debug("OUT");
    return objectsToReturn;

}

From source file:lius.search.LiusHitList.java

License:Apache License

/**
 * Builds a {@link LiusHit} for the hit at {@code index}: copies score/id, then for each
 * configured display field copies the stored values, optionally highlighted.
 *
 * @param index position of the hit in {@code luceneHits}
 * @return populated hit with per-field map and ordered field list
 * @throws IOException on index access failure
 */
private LiusHit buildLiusHit(int index) throws IOException {

    LiusHit liusHit = new LiusHit();
    liusHit.setScore(luceneHits.score(index));
    liusHit.setDocId(luceneHits.id(index));

    Document luceneDocument = luceneHits.doc(index);

    Map liusHitFieldsMap = new HashMap();
    List liusFieldsList = new ArrayList();
    Highlighter luceneHighlighter = null;

    if (liusConfig.getHighlighter() == true) {
        // The reader is only needed to rewrite the query (expand wildcard/fuzzy terms);
        // close it as soon as the highlighter is built to avoid leaking it.
        IndexReader luceneIndexReader = IndexReader.open(indexDirectory);
        try {
            Query rewrittenLuceneQuery = luceneQuery.rewrite(luceneIndexReader);
            QueryScorer luceneScorer = new QueryScorer(rewrittenLuceneQuery);

            SimpleHTMLFormatter luceneFormatter = new SimpleHTMLFormatter("<span class=\"liusHit\">", "</span>");
            luceneHighlighter = new Highlighter(luceneFormatter, luceneScorer);
        } finally {
            luceneIndexReader.close();
        }
    }

    for (int j = 0; j < liusConfig.getDisplayFields().size(); j++) {
        LiusField configLiusField = (LiusField) liusConfig.getDisplayFields().get(j);
        LiusField hitLiusField = new LiusField();
        String fieldName = configLiusField.getName();

        hitLiusField.setName(fieldName);
        hitLiusField.setLabel(configLiusField.getLabel());

        if (luceneHighlighter != null) {
            // Fragment size comes from config when present, otherwise highlight the
            // whole value as a single fragment.
            Fragmenter luceneFragmenter;
            if (configLiusField.getFragmenter() != null) {
                luceneFragmenter = new SimpleFragmenter(Integer.parseInt(configLiusField.getFragmenter()));
            } else {
                luceneFragmenter = new SimpleFragmenter(Integer.MAX_VALUE);
            }
            luceneHighlighter.setTextFragmenter(luceneFragmenter);
        }
        String[] luceneDocumentValues = luceneDocument.getValues(configLiusField.getName());
        if (luceneDocumentValues != null) {
            if (luceneHighlighter != null) {
                // The analyzer does not depend on the value, so fetch it once per field.
                Analyzer luceneAnalyzer = AnalyzerFactory.getAnalyzer(liusConfig);
                for (int k = 0; k < luceneDocumentValues.length; k++) {
                    TokenStream luceneTokenStream = luceneAnalyzer.tokenStream(configLiusField.getName(),
                            new StringReader(luceneDocumentValues[k]));
                    String fragment = null;
                    if (configLiusField.getFragmenter() != null)
                        fragment = luceneHighlighter.getBestFragments(luceneTokenStream,
                                luceneDocumentValues[k], 5, "...");
                    else {
                        fragment = luceneHighlighter.getBestFragment(luceneTokenStream,
                                luceneDocumentValues[k]);
                    }

                    // Keep the original value when nothing could be highlighted.
                    if (fragment != null) {
                        luceneDocumentValues[k] = fragment;
                    }
                }
            }

            hitLiusField.setValue(luceneDocumentValues[0]);
            hitLiusField.setValues(luceneDocumentValues);

            liusHitFieldsMap.put(configLiusField.getName(), hitLiusField);
            liusFieldsList.add(hitLiusField);
        }

    }
    liusHit.setLiusFieldsMap(liusHitFieldsMap);
    liusHit.setLiusFields(liusFieldsList);
    return liusHit;
}

From source file:net.hillsdon.reviki.search.impl.LuceneSearcher.java

License:Apache License

/**
 * Executes {@code query} and converts each hit into a {@link SearchMatch}, attaching a
 * highlighted extract when {@code provideExtracts} is set and the field's text is stored.
 */
private LinkedHashSet<SearchMatch> doQuery(final IndexReader reader, final Analyzer analyzer,
        final Searcher searcher, final String field, final boolean provideExtracts, final Query query)
        throws IOException, CorruptIndexException {
    // Only build a highlighter when the caller actually wants extracts.
    Highlighter highlighter = null;
    if (provideExtracts) {
        highlighter = new Highlighter(new SimpleHTMLFormatter("<strong>", "</strong>"), new SimpleHTMLEncoder(),
                new QueryScorer(query));
    }
    final LinkedHashSet<SearchMatch> results = new LinkedHashSet<SearchMatch>();
    Hits hits = searcher.search(query);
    @SuppressWarnings("unchecked")
    Iterator<Hit> matches = hits.iterator();
    while (matches.hasNext()) {
        final Hit match = matches.next();
        final String text = match.get(field);
        String extract = null;
        // The text is not stored for all fields; with no text there is nothing to highlight.
        if (text != null && highlighter != null) {
            final TokenStream tokens = analyzer.tokenStream(field, new StringReader(text));
            // Up to three best fragments, joined with an ellipsis.
            extract = highlighter.getBestFragments(tokens, text, 3, "...");
        }
        final String wiki = match.get(FIELD_WIKI);
        results.add(new SearchMatch(_wikiName.equals(wiki), wiki, match.get(FIELD_PATH), extract));
    }
    return results;
}

From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java

/**
 * Main search method, for internal use.
 * //from  ww w .ja  v  a2s  .c  om
 * @param q query string
 * @return a list of highlighted string objects.
 * @throws SearchException
 */
/**
 * Main search method, for internal use.
 *
 * @param q query string
 * @return a list of highlighted string objects.
 * @throws SearchException if parsing, searching or highlighting fails
 */
private List<SearchResultItem> internalSearch(String q) throws SearchException {
    IndexSearcher is = null;
    try {
        is = new IndexSearcher(zekrIndexReader.indexReader);

        QueryParser parser = QueryParserFactory.create(Version.LUCENE_CURRENT, QuranTextIndexer.CONTENTS_FIELD,
                analyzer);

        // allow search terms like "*foo" with leading star
        parser.setAllowLeadingWildcard(true);

        // if this line is not set, highlighter doesn't work in wildcard queries while query.rewrite() is done,
        // and sorting also doesn't work correctly for wildcard queries.
        parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);

        logger.debug("Parse query.");
        query = parser.parse(q);
        BooleanQuery.setMaxClauseCount(maxClauseCount);

        logger.debug("Rewrite query.");
        query = query.rewrite(zekrIndexReader.indexReader); // required to expand search terms

        logger.debug("Searching for: " + query.toString());
        TopFieldDocs tops = null;
        is.setDefaultFieldSortScoring(true, true);
        if (searchScope != null && searchScope.getScopeItems().size() > 0) {
            // Restrict results to the configured Quran scope.
            String scopeQuery = makeSearchScope();
            logger.debug("Scope is: " + scopeQuery);
            tops = is.search(query, new QuranRangeFilter(searchScope), maxSearchResult, sortResultOrder);
        } else {
            tops = is.search(query, new QueryWrapperFilter(query), maxSearchResult, sortResultOrder);
        }

        logger.debug("Highlight search result.");
        Highlighter highlighter = new Highlighter(highlightFormatter, new QueryScorer(query));

        int total = Math.min(maxSearchResult, tops.totalHits);
        // Hoisted out of the loop: the doc array does not change per iteration.
        ScoreDoc[] sd = tops.scoreDocs;
        List<SearchResultItem> res = new ArrayList<SearchResultItem>(total);
        for (int i = 0; i < total; i++) {
            Document doc = is.doc(sd[i].doc);
            final String contents = doc.get(QuranTextIndexer.CONTENTS_FIELD);
            final IQuranLocation location = new QuranLocation(doc.get(QuranTextIndexer.LOCATION_FIELD));
            TokenStream tokenStream = analyzer.tokenStream(QuranTextIndexer.CONTENTS_FIELD,
                    new StringReader(contents));

            // Up to 100 fragments joined with "..." — effectively the whole highlighted verse.
            String resultStr = highlighter.getBestFragments(tokenStream, contents, 100, "...");
            SearchResultItem sri = new SearchResultItem(resultStr, location);
            res.add(sri);
        }
        matchedItemCount = highlightFormatter.getHighlightCount();
        return res;
    } catch (Exception e) {
        throw new SearchException(e);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException ignored) {
                // best-effort close; the searcher result has already been produced
            }
        }
    }
}

From source file:org.archive.tnh.servlet.OpenSearchServlet.java

License:Apache License

/**
 * Handles an OpenSearch query: parses parameters, executes the search across the selected
 * indexes, and streams the results as an RSS/OpenSearch XML document with highlighted
 * snippets and per-phase timing logged at the end.
 *
 * @throws ServletException wrapping any failure during query handling
 */
public void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    try {
        long responseTime = System.nanoTime();

        QueryParameters p = (QueryParameters) request.getAttribute(OpenSearchHelper.PARAMS_KEY);
        if (p == null) {
            p = getQueryParameters(request);
        }

        BooleanQuery q = this.translator.translate(p.query, this.foldAccents);

        this.translator.addFilterGroup(q, "site", p.sites);
        this.translator.addFilterGroup(q, "type", p.types);
        this.translator.addFilterGroup(q, "collection", p.collections);
        this.translator.addFilterGroup(q, "date", p.dates);

        long parseQueryTime = System.nanoTime();

        if (Arrays.equals(p.indexNames, QueryParameters.ALL_INDEXES)) {
            if (p.excludes.length > 0) {
                // If there are indexes to exclude, exclude them.
                p.indexNames = removeExcludes(p.excludes);
            }
        } else {
            // There are explicitly named indexes.  Weed out any unknown names.
            p.indexNames = removeUnknownIndexNames(p.indexNames);
        }

        Search.Result result;
        if (p.indexNames.length == 0) {
            result = new Search.Result();
            result.hits = new Hit[0];
        } else {
            result = this.searcher.search(p.indexNames, q, p.start + (p.hitsPerPage * 3), p.hitsPerSite);
        }

        long executeQueryTime = System.nanoTime();

        // The 'end' is usually just the end of the current page
        // (start+hitsPerPage); but if we are on the last page
        // of de-duped results, then the end is hits.getLength().
        int end = Math.min(result.hits.length, p.start + p.hitsPerPage);

        // The length is usually just (end-start), unless the start
        // position is past the end of the results -- which is common when
        // de-duping.  The user could easily jump past the true end of the
        // de-dup'd results.  If the start is past the end, we use a
        // length of '0' to produce an empty results page.
        int length = Math.max(end - p.start, 0);

        // Usually, the total results is the total number of non-de-duped
        // results.  However, if we are on the last page of de-duped results,
        // then we know our de-dup'd total is result.hits.length.
        long totalResults = result.hits.length < (p.start + p.hitsPerPage) ? result.hits.length
                : result.numRawHits;

        Document doc = new Document();

        Element channel = OpenSearchHelper.startResponse(doc, p, request, totalResults);

        // The highlighter and analyzer depend only on the query and servlet config,
        // so build them once rather than once per hit.
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new NonBrokenHTMLEncoder(),
                new QueryScorer(q, "content"));

        CustomAnalyzer analyzer = new CustomAnalyzer();
        analyzer.setFoldAccents(this.foldAccents);

        // Add hits to XML Document
        for (int i = p.start; i < end; i++) {
            org.apache.lucene.document.Document hit = result.searcher.doc(result.hits[i].id);

            Element item = JDOMHelper.add(channel, "item");

            // Replace & and < with their XML entity counterparts to
            // ensure that any HTML markup in the snippet is escaped
            // before we do the highlighting.
            String title = hit.get("title");
            if (title != null) {
                title = title.replaceAll("[&]", "&amp;");
                title = title.replaceAll("[<]", "&lt;");
            }
            JDOMHelper.add(item, "title", title);

            JDOMHelper.add(item, "link", hit.get("url"));
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "docId", String.valueOf(result.hits[i].id));
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "score", String.valueOf(result.hits[i].score));
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "site", result.hits[i].site);
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "length", hit.get("length"));
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "type", hit.get("type"));
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "boost", hit.get("boost"));
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "collection", hit.get("collection"));

            String indexName = this.searcher.resolveIndexName(result.searcher, result.hits[i].id);
            JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "index", indexName);

            for (String date : hit.getValues("date")) {
                JDOMHelper.add(item, "date", date);
            }

            String raw = getContent(hit);

            StringBuilder buf = new StringBuilder(100);

            for (String snippet : highlighter.getBestFragments(analyzer, "content", raw,
                    this.contextSnippetsPerResult)) {
                buf.append(snippet);
                buf.append("...");
            }

            JDOMHelper.add(item, "description", buf.toString());

            // Last, but not least, add a hit explanation, if enabled
            if (explain) {
                JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "explain",
                        result.searcher.explain(q, result.hits[i].id).toHtml());
            }
        }

        OpenSearchHelper.addResponseTime(channel, System.nanoTime() - responseTime);

        long buildResultsTime = System.nanoTime();

        OpenSearchHelper.writeResponse(doc, response, "application/rss+xml");

        long writeResponseTime = System.nanoTime();

        LOG.info("S: " + ((parseQueryTime - responseTime) / 1000 / 1000) + " "
                + ((executeQueryTime - parseQueryTime) / 1000 / 1000) + " "
                + ((buildResultsTime - executeQueryTime) / 1000 / 1000) + " "
                + ((writeResponseTime - buildResultsTime) / 1000 / 1000) + " " + p.query);
    } catch (Exception e) {
        throw new ServletException(e);
    }
}

From source file:org.compass.core.lucene.engine.LuceneSearchEngineHighlighter.java

License:Apache License

/**
 * Returns the best fragments of {@code text} for the given property, concatenated with the
 * configured separator.
 *
 * @throws SearchEngineException if highlighting fails on I/O
 */
public String fragmentsWithSeparator(Resource resource, String propertyName, String text)
        throws SearchEngineException {
    Highlighter highlighter = createHighlighter(propertyName);
    TokenStream tokenStream = createTokenStream(resource, propertyName, text);
    try {
        String actualSeparator = getActualSeparator();
        return highlighter.getBestFragments(tokenStream, text, getMaxNumFragments(), actualSeparator);
    } catch (IOException e) {
        // Preserve the cause: previously the IOException was dropped, losing the stack trace.
        throw new SearchEngineException("Failed to highlight fragments for alias [" + resource.getAlias()
                + "] and property [" + propertyName + "]", e);
    }
}

From source file:org.eclipse.rdf4j.sail.lucene.LuceneIndex.java

License:Open Source License

/**
 * Produces a highlighted snippet (up to two fragments joined by "...") for the given
 * field's text, or {@code null} if highlighting fails for any reason.
 */
public synchronized String getSnippet(String fieldName, String text, Highlighter highlighter) {
    try {
        TokenStream stream = getAnalyzer().tokenStream(fieldName, new StringReader(text));
        return highlighter.getBestFragments(stream, text, 2, "...");
    } catch (Exception e) {
        // Highlighting problems are logged but never propagated to the caller.
        logger.error("Exception while getting snippet for field " + fieldName, e);
        return null;
    }
}

From source file:org.eclipse.skalli.core.search.LuceneIndex.java

License:Open Source License

/**
 * Highlights {@code fieldContents} when the field is one of the requested highlight
 * fields; returns the contents unchanged (possibly {@code null}) otherwise or on failure.
 */
private String doHighlight(final Highlighter highlighter, final List<String> fields, final String fieldName,
        String fieldContents) throws IOException {
    // Guard clause: nothing to do for null content or non-highlighted fields.
    if (fieldContents == null || !fields.contains(fieldName)) {
        return fieldContents;
    }
    String highlighted = fieldContents;
    try {
        final String[] fragments = highlighter.getBestFragments(analyzer, fieldName, fieldContents,
                NUMBER_BEST_FRAGMENTS);
        if (fragments != null && fragments.length > 0) {
            highlighted = LuceneUtil.withEllipsis(fragments, fieldContents);
        }
    } catch (Exception e) {
        // Fall back to the raw contents when highlighting fails.
        LOG.error(MessageFormat.format("Failed to highlight search result ''{0}''", fieldContents), e);
    }
    return highlighted;
}

From source file:org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighterTests.java

License:Apache License

/**
 * Verifies that phrase-query highlighting marks each phrase term individually.
 */
public void testHighlightPhrase() throws Exception {
    Query phrase = new PhraseQuery.Builder().add(new Term("field", "foo")).add(new Term("field", "bar")).build();
    org.apache.lucene.search.highlight.Highlighter highlighter =
            new org.apache.lucene.search.highlight.Highlighter(new CustomQueryScorer(phrase));
    String[] fragments = highlighter.getBestFragments(new MockAnalyzer(random()), "field", "bar foo bar foo", 10);
    assertArrayEquals(new String[] { "bar <B>foo</B> <B>bar</B> foo" }, fragments);
}

From source file:org.jamwiki.search.LuceneSearchEngine.java

License:LGPL

/**
 * Builds a result summary for a document: the highlighter's best fragments when the query
 * matched the plain content, otherwise an HTML-escaped prefix of the raw content.
 *
 * @return highlighted summary, or an escaped content prefix (with "..." when truncated)
 * @throws Exception propagated from tokenization/highlighting
 */
private String retrieveResultSummary(Document document, Highlighter highlighter, StandardAnalyzer analyzer)
        throws Exception {
    String content = document.get(ITYPE_CONTENT_PLAIN);
    TokenStream tokenStream = analyzer.tokenStream(ITYPE_CONTENT_PLAIN, new StringReader(content));
    String summary = highlighter.getBestFragments(tokenStream, content, 3, "...");
    if (StringUtils.isBlank(summary) && !StringUtils.isBlank(content)) {
        // Fallback when no query terms were highlighted: show an escaped content prefix.
        int cutoff = Math.min(200, content.length());
        summary = StringEscapeUtils.escapeHtml(content.substring(0, cutoff));
        // Only mark truncation when content was actually cut (the old check also appended
        // "..." for content of exactly 200 characters).
        if (content.length() > 200) {
            summary += "...";
        }
    }
    return summary;
}