List of usage examples for org.apache.lucene.search.highlight Highlighter getBestFragments
public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments, String separator) throws IOException, InvalidTokenOffsetsException
From source file:it.eng.spagobi.commons.utilities.indexing.LuceneSearcher.java
License:Mozilla Public License
public static HashMap<String, Object> searchIndexFuzzy(IndexSearcher searcher, String queryString, String index, String[] fields, String metaDataToSearch) throws IOException, ParseException { logger.debug("IN"); HashMap<String, Object> objectsToReturn = new HashMap<String, Object>(); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); BooleanQuery orQuery = new BooleanQuery(); BooleanQuery andQuery = new BooleanQuery(); for (int i = 0; i < fields.length; i++) { Query query = new FuzzyQuery(new Term(fields[i], queryString)); query = query.rewrite(searcher.getIndexReader()); orQuery.add(query, BooleanClause.Occur.SHOULD); }//from w w w .j av a 2 s . co m andQuery.add(orQuery, BooleanClause.Occur.MUST); if (metaDataToSearch != null) { //search for query string on metadata name field and content //where metadata name = metaDataToSearch Query queryMetadata = new TermQuery(new Term(IndexingConstants.METADATA, metaDataToSearch)); andQuery.add(queryMetadata, BooleanClause.Occur.MUST); } Query tenantQuery = new TermQuery(new Term(IndexingConstants.TENANT, getTenant())); andQuery.add(tenantQuery, BooleanClause.Occur.MUST); logger.debug("Searching for: " + andQuery.toString()); int hitsPerPage = 50; // Collect enough docs to show 5 pages TopScoreDocCollector collector = TopScoreDocCollector.create(5 * hitsPerPage, false); searcher.search(andQuery, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; objectsToReturn.put("hits", hits); //highlighter //orQuery = orQuery.rewrite(searcher.getIndexReader()); //andQuery = andQuery.rewrite(searcher.getIndexReader()); Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(andQuery)); if (hits != null) { for (int i = 0; i < hits.length; i++) { ScoreDoc hit = hits[i]; Document doc = searcher.doc(hit.doc); String biobjId = doc.get(IndexingConstants.BIOBJ_ID); String summary = " "; if (highlighter != null) { String[] summaries; try { Integer idobj = (Integer.valueOf(biobjId)); String 
contentToSearchOn = fillSummaryText(idobj); summaries = highlighter.getBestFragments(new StandardAnalyzer(Version.LUCENE_CURRENT), IndexingConstants.CONTENTS, contentToSearchOn, 3); StringBuffer summaryBuffer = new StringBuffer(); if (summaries.length > 0) { summaryBuffer.append(summaries[0]); } for (int j = 1; j < summaries.length; j++) { summaryBuffer.append(" ... "); summaryBuffer.append(summaries[j]); } summary = summaryBuffer.toString(); //get only a portion of summary if (summary.length() > 101) { summary = summary.substring(0, 100); summary += "..."; } objectsToReturn.put(biobjId, summary); } catch (InvalidTokenOffsetsException e) { logger.error(e.getMessage(), e); } catch (Exception e) { logger.error(e.getMessage(), e); } } } } int numTotalHits = collector.getTotalHits(); logger.info(numTotalHits + " total matching documents"); logger.debug("OUT"); return objectsToReturn; }
From source file:lius.search.LiusHitList.java
License:Apache License
/**
 * Builds a {@code LiusHit} bean for the search hit at the given position,
 * copying score, doc id and the configured display fields, and optionally
 * wrapping matched terms in {@code <span class="liusHit">} markup when
 * highlighting is enabled in the configuration.
 *
 * @param index position of the hit within {@code luceneHits}
 * @return populated hit bean
 * @throws IOException on index access or highlighting failure
 */
private LiusHit buildLiusHit(int index) throws IOException {
    LiusHit liusHit = new LiusHit();
    liusHit.setScore(luceneHits.score(index));
    liusHit.setDocId(luceneHits.id(index));
    Document luceneDocument = luceneHits.doc(index);
    Map liusHitFieldsMap = new HashMap();
    List liusFieldsList = new ArrayList();

    Highlighter luceneHighlighter = null;
    if (liusConfig.getHighlighter()) {
        // The reader is only needed to rewrite the query (expand wildcards
        // etc.) for the scorer; close it as soon as that is done so it is not
        // leaked on every hit.
        IndexReader luceneIndexReader = IndexReader.open(indexDirectory);
        try {
            Query rewrittenLuceneQuery = luceneQuery.rewrite(luceneIndexReader);
            QueryScorer luceneScorer = new QueryScorer(rewrittenLuceneQuery);
            SimpleHTMLFormatter luceneFormatter = new SimpleHTMLFormatter("<span class=\"liusHit\">",
                    "</span>");
            luceneHighlighter = new Highlighter(luceneFormatter, luceneScorer);
        } finally {
            luceneIndexReader.close();
        }
    }
    for (int j = 0; j < liusConfig.getDisplayFields().size(); j++) {
        LiusField configLiusField = (LiusField) liusConfig.getDisplayFields().get(j);
        LiusField hitLiusField = new LiusField();
        String fieldName = configLiusField.getName();
        hitLiusField.setName(fieldName);
        hitLiusField.setLabel(configLiusField.getLabel());
        if (luceneHighlighter != null) {
            // Fragment size comes from the field configuration when present,
            // otherwise it is effectively unlimited.
            Fragmenter luceneFragmenter;
            if (configLiusField.getFragmenter() != null) {
                luceneFragmenter = new SimpleFragmenter(Integer.parseInt(configLiusField.getFragmenter()));
            } else {
                luceneFragmenter = new SimpleFragmenter(Integer.MAX_VALUE);
            }
            luceneHighlighter.setTextFragmenter(luceneFragmenter);
        }
        String[] luceneDocumentValues = luceneDocument.getValues(configLiusField.getName());
        if (luceneDocumentValues != null) {
            if (luceneHighlighter != null) {
                for (int k = 0; k < luceneDocumentValues.length; k++) {
                    Analyzer luceneAnalyzer = AnalyzerFactory.getAnalyzer(liusConfig);
                    TokenStream luceneTokenStream = luceneAnalyzer.tokenStream(configLiusField.getName(),
                            new StringReader(luceneDocumentValues[k]));
                    String fragment;
                    if (configLiusField.getFragmenter() != null) {
                        fragment = luceneHighlighter.getBestFragments(luceneTokenStream,
                                luceneDocumentValues[k], 5, "...");
                    } else {
                        fragment = luceneHighlighter.getBestFragment(luceneTokenStream,
                                luceneDocumentValues[k]);
                    }
                    // Keep the original value when nothing was highlighted.
                    if (fragment != null) {
                        luceneDocumentValues[k] = fragment;
                    }
                }
            }
            hitLiusField.setValue(luceneDocumentValues[0]);
            hitLiusField.setValues(luceneDocumentValues);
            liusHitFieldsMap.put(configLiusField.getName(), hitLiusField);
            liusFieldsList.add(hitLiusField);
        }
    }
    liusHit.setLiusFieldsMap(liusHitFieldsMap);
    liusHit.setLiusFields(liusFieldsList);
    return liusHit;
}
From source file:net.hillsdon.reviki.search.impl.LuceneSearcher.java
License:Apache License
private LinkedHashSet<SearchMatch> doQuery(final IndexReader reader, final Analyzer analyzer, final Searcher searcher, final String field, final boolean provideExtracts, final Query query) throws IOException, CorruptIndexException { Highlighter highlighter = null; if (provideExtracts) { highlighter = new Highlighter(new SimpleHTMLFormatter("<strong>", "</strong>"), new SimpleHTMLEncoder(), new QueryScorer(query)); }//from w ww .j a va 2 s . c om Hits hits = searcher.search(query); LinkedHashSet<SearchMatch> results = new LinkedHashSet<SearchMatch>(); @SuppressWarnings("unchecked") Iterator<Hit> iter = hits.iterator(); while (iter.hasNext()) { Hit hit = iter.next(); String text = hit.get(field); String extract = null; // The text is not stored for all fields, just provide a null extract. if (highlighter != null && text != null) { TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text)); // Get 3 best fragments and separate with a "..." extract = highlighter.getBestFragments(tokenStream, text, 3, "..."); } results.add(new SearchMatch(_wikiName.equals(hit.get(FIELD_WIKI)), hit.get(FIELD_WIKI), hit.get(FIELD_PATH), extract)); } return results; }
From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java
/** * Main search method, for internal use. * //from ww w .ja v a2s .c om * @param q query string * @return a list of highlighted string objects. * @throws SearchException */ private List<SearchResultItem> internalSearch(String q) throws SearchException { IndexSearcher is = null; try { is = new IndexSearcher(zekrIndexReader.indexReader); // analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); // resultTokenStream = new StandardTokenizer(Version.LUCENE_CURRENT, reader); QueryParser parser = QueryParserFactory.create(Version.LUCENE_CURRENT, QuranTextIndexer.CONTENTS_FIELD, analyzer); // allow search terms like "*foo" with leading star parser.setAllowLeadingWildcard(true); // parser.setFuzzyPrefixLength(10); // if this line is not set, highlighter doesn't work in in wildcard queries while query.rewrite() is done. // and sorting also doesn't work correctly for wildcard queries. parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); logger.debug("Parse query."); query = parser.parse(q); BooleanQuery.setMaxClauseCount(maxClauseCount); logger.debug("Rewrite query."); query = query.rewrite(zekrIndexReader.indexReader); // required to expand search terms logger.debug("Searching for: " + query.toString()); // Hits hits; TopFieldDocs tops = null; is.setDefaultFieldSortScoring(true, true); if (searchScope != null && searchScope.getScopeItems().size() > 0) { String scopeQuery = makeSearchScope(); logger.debug("Scope is: " + scopeQuery); // hits = is.search(query, new QuranRangeFilter(searchScope), sortResultOrder); tops = is.search(query, new QuranRangeFilter(searchScope), maxSearchResult, sortResultOrder); } else { // hits = is.search(query, new QueryWrapperFilter(query), 20, sortResultOrder); tops = is.search(query, new QueryWrapperFilter(query), maxSearchResult, sortResultOrder); } logger.debug("Highlight search result."); Highlighter highlighter = new Highlighter(highlightFormatter, new QueryScorer(query)); // highlighter.setFragmentScorer(new 
QueryTermScorer(query)); int total = Math.min(maxSearchResult, tops.totalHits); List<SearchResultItem> res = new ArrayList<SearchResultItem>(total); for (int i = 0; i < total; i++) { ScoreDoc[] sd = tops.scoreDocs; Document doc = is.doc(sd[i].doc); final String contents = doc.get(QuranTextIndexer.CONTENTS_FIELD); final IQuranLocation location = new QuranLocation(doc.get(QuranTextIndexer.LOCATION_FIELD)); TokenStream tokenStream = analyzer.tokenStream(QuranTextIndexer.CONTENTS_FIELD, new StringReader(contents)); // String resultStr = highlighter.getBestFragment(tokenStream, contents); String resultStr = highlighter.getBestFragments(tokenStream, contents, 100, "..."); SearchResultItem sri = new SearchResultItem(resultStr, location); res.add(sri); } matchedItemCount = highlightFormatter.getHighlightCount(); // highlightedTermList = highlightFormatter.getHighlightedTermList(); return res; } catch (Exception e) { throw new SearchException(e); } finally { if (is != null) { try { is.close(); } catch (IOException e) { } } } }
From source file:org.archive.tnh.servlet.OpenSearchServlet.java
License:Apache License
public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { try {// w ww . ja v a 2s . c o m long responseTime = System.nanoTime(); QueryParameters p = (QueryParameters) request.getAttribute(OpenSearchHelper.PARAMS_KEY); if (p == null) { p = getQueryParameters(request); } BooleanQuery q = this.translator.translate(p.query, this.foldAccents); this.translator.addFilterGroup(q, "site", p.sites); this.translator.addFilterGroup(q, "type", p.types); this.translator.addFilterGroup(q, "collection", p.collections); this.translator.addFilterGroup(q, "date", p.dates); long parseQueryTime = System.nanoTime(); if (Arrays.equals(p.indexNames, QueryParameters.ALL_INDEXES)) { if (p.excludes.length > 0) { // If there are indexes to exclude, exclude them. p.indexNames = removeExcludes(p.excludes); } } else { // There are explicitly named indexes. Weed out any unknown names. p.indexNames = removeUnknownIndexNames(p.indexNames); } Search.Result result; if (p.indexNames.length == 0) { result = new Search.Result(); result.hits = new Hit[0]; } else { result = this.searcher.search(p.indexNames, q, p.start + (p.hitsPerPage * 3), p.hitsPerSite); } long executeQueryTime = System.nanoTime(); // The 'end' is usually just the end of the current page // (start+hitsPerPage); but if we are on the last page // of de-duped results, then the end is hits.getLength(). int end = Math.min(result.hits.length, p.start + p.hitsPerPage); // The length is usually just (end-start), unless the start // position is past the end of the results -- which is common when // de-duping. The user could easily jump past the true end of the // de-dup'd results. If the start is past the end, we use a // length of '0' to produce an empty results page. int length = Math.max(end - p.start, 0); // Usually, the total results is the total number of non-de-duped // results. 
Howerver, if we are on last page of de-duped results, // then we know our de-dup'd total is result.hits.length. long totalResults = result.hits.length < (p.start + p.hitsPerPage) ? result.hits.length : result.numRawHits; Document doc = new Document(); Element channel = OpenSearchHelper.startResponse(doc, p, request, totalResults); // Add hits to XML Document for (int i = p.start; i < end; i++) { org.apache.lucene.document.Document hit = result.searcher.doc(result.hits[i].id); Element item = JDOMHelper.add(channel, "item"); // Replace & and < with their XML entity counterparts to // ensure that any HTML markup in the snippet is escaped // before we do the highlighting. String title = hit.get("title"); if (title != null) { title = title.replaceAll("[&]", "&"); title = title.replaceAll("[<]", "<"); } JDOMHelper.add(item, "title", title); JDOMHelper.add(item, "link", hit.get("url")); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "docId", String.valueOf(result.hits[i].id)); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "score", String.valueOf(result.hits[i].score)); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "site", result.hits[i].site); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "length", hit.get("length")); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "type", hit.get("type")); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "boost", hit.get("boost")); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "collection", hit.get("collection")); String indexName = this.searcher.resolveIndexName(result.searcher, result.hits[i].id); JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "index", indexName); for (String date : hit.getValues("date")) { JDOMHelper.add(item, "date", date); } String raw = getContent(hit); StringBuilder buf = new StringBuilder(100); Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new NonBrokenHTMLEncoder(), new QueryScorer(q, "content")); CustomAnalyzer analyzer = new CustomAnalyzer(); 
analyzer.setFoldAccents(this.foldAccents); for (String snippet : highlighter.getBestFragments(analyzer, "content", raw, this.contextSnippetsPerResult)) { buf.append(snippet); buf.append("..."); } JDOMHelper.add(item, "description", buf.toString()); // Last, but not least, add a hit explanation, if enabled if (explain) { JDOMHelper.add(item, OpenSearchHelper.NS_ARCHIVE, "explain", result.searcher.explain(q, result.hits[i].id).toHtml()); } } OpenSearchHelper.addResponseTime(channel, System.nanoTime() - responseTime); long buildResultsTime = System.nanoTime(); OpenSearchHelper.writeResponse(doc, response, "application/rss+xml"); long writeResponseTime = System.nanoTime(); LOG.info("S: " + ((parseQueryTime - responseTime) / 1000 / 1000) + " " + ((executeQueryTime - parseQueryTime) / 1000 / 1000) + " " + ((buildResultsTime - executeQueryTime) / 1000 / 1000) + " " + ((writeResponseTime - buildResultsTime) / 1000 / 1000) + " " + p.query); } catch (Exception e) { throw new ServletException(e); } }
From source file:org.compass.core.lucene.engine.LuceneSearchEngineHighlighter.java
License:Apache License
public String fragmentsWithSeparator(Resource resource, String propertyName, String text) throws SearchEngineException { Highlighter highlighter = createHighlighter(propertyName); TokenStream tokenStream = createTokenStream(resource, propertyName, text); try {// w ww .jav a2 s .c o m String actualSeparator = getActualSeparator(); return highlighter.getBestFragments(tokenStream, text, getMaxNumFragments(), actualSeparator); } catch (IOException e) { throw new SearchEngineException("Failed to highlight fragments for alias [" + resource.getAlias() + "] and property [" + propertyName + "]"); } }
From source file:org.eclipse.rdf4j.sail.lucene.LuceneIndex.java
License:Open Source License
public synchronized String getSnippet(String fieldName, String text, Highlighter highlighter) { String snippet;//from ww w . j a v a 2 s .c o m try { TokenStream tokenStream = getAnalyzer().tokenStream(fieldName, new StringReader(text)); snippet = highlighter.getBestFragments(tokenStream, text, 2, "..."); } catch (Exception e) { logger.error("Exception while getting snippet for field " + fieldName, e); snippet = null; } return snippet; }
From source file:org.eclipse.skalli.core.search.LuceneIndex.java
License:Open Source License
private String doHighlight(final Highlighter highlighter, final List<String> fields, final String fieldName, String fieldContents) throws IOException { String highlighted = fieldContents; if (fieldContents != null && fields.contains(fieldName)) { try {//w ww . j av a2s .co m String[] fragments = highlighter.getBestFragments(analyzer, fieldName, fieldContents, NUMBER_BEST_FRAGMENTS); if (fragments != null && fragments.length > 0) { highlighted = LuceneUtil.withEllipsis(fragments, fieldContents); } } catch (Exception e) { LOG.error(MessageFormat.format("Failed to highlight search result ''{0}''", fieldContents), e); } } return highlighted; }
From source file:org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighterTests.java
License:Apache License
public void testHighlightPhrase() throws Exception { Query query = new PhraseQuery.Builder().add(new Term("field", "foo")).add(new Term("field", "bar")).build(); QueryScorer queryScorer = new CustomQueryScorer(query); org.apache.lucene.search.highlight.Highlighter highlighter = new org.apache.lucene.search.highlight.Highlighter( queryScorer);// w w w. jav a 2 s .com String[] frags = highlighter.getBestFragments(new MockAnalyzer(random()), "field", "bar foo bar foo", 10); assertArrayEquals(new String[] { "bar <B>foo</B> <B>bar</B> foo" }, frags); }
From source file:org.jamwiki.search.LuceneSearchEngine.java
License:LGPL
/** * *//*from w w w.ja va 2s.c o m*/ private String retrieveResultSummary(Document document, Highlighter highlighter, StandardAnalyzer analyzer) throws Exception { String content = document.get(ITYPE_CONTENT_PLAIN); TokenStream tokenStream = analyzer.tokenStream(ITYPE_CONTENT_PLAIN, new StringReader(content)); String summary = highlighter.getBestFragments(tokenStream, content, 3, "..."); if (StringUtils.isBlank(summary) && !StringUtils.isBlank(content)) { summary = StringEscapeUtils.escapeHtml(content.substring(0, Math.min(200, content.length()))); if (Math.min(200, content.length()) == 200) { summary += "..."; } } return summary; }