List of usage examples for the org.apache.lucene.search.highlight Highlighter constructor
public Highlighter(Formatter formatter, Scorer fragmentScorer)
From source file:Main.WebAPI.Search.java
/** * /*from w w w .jav a2 s .c o m*/ * @param args args[0] is a query * * @throws IOException * @throws ParseException * @throws InvalidTokenOffsetsException */ public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException { //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); String querystr = args.length > 0 ? args[0] : "mike lab"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr); // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); String Preview; for (int i = 0; i < 10; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String text; Preview = ""; System.out.println(doc.get("url")); System.out.println(doc.get("title")); text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); int k = 0; for (TextFragment frag1 : frag) { if ((frag1 != null) && (frag1.getScore() > 0)) { Preview += (frag1.toString()) + "...<br>"; k++; // Get 2 Line Preview if (k >= 2) break; } } //Term vector System.out.println("-------------"); } }
From source file:net.riezebos.thoth.content.search.Searcher.java
License:Apache License
public PagedList<SearchResult> search(Identity identity, String queryExpression, int pageNumber, int pageSize) throws SearchException { try {/*from w w w . j a va 2 s. c om*/ IndexReader reader = getIndexReader(contentManager); IndexSearcher searcher = getIndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(); // We might need to restrict the results to books of the user does not have access to fragments: AccessManager accessManager = contentManager.getAccessManager(); boolean booksOnly = !accessManager.hasPermission(identity, "", Permission.READ_FRAGMENTS); if (booksOnly) { queryExpression = Indexer.INDEX_TYPE + ":" + Indexer.TYPE_DOCUMENT + " AND (" + queryExpression + ")"; } QueryParser parser = new QueryParser(Indexer.INDEX_CONTENTS, analyzer); Query query = parser.parse(queryExpression); // We add 1 to determine if there is more to be found after the current page int maxResults = pageSize * pageNumber + 1; TopDocs results = searcher.search(query, maxResults, Sort.RELEVANCE); ScoreDoc[] hits = results.scoreDocs; boolean hadMore = (hits.length == maxResults); List<SearchResult> searchResults = new ArrayList<>(); int idx = 0; for (ScoreDoc scoreDoc : hits) { if (searchResults.size() == pageSize) break; idx++; if (idx >= (pageNumber - 1) * pageSize) { Document document = searcher.doc(scoreDoc.doc); IndexableField field = document.getField(Indexer.INDEX_PATH); String documentPath = field.stringValue(); SearchResult searchResult = new SearchResult(); searchResult.setIndexNumber((pageNumber - 1) * pageSize + idx); searchResult.setDocument(documentPath); String type = document.get(Indexer.INDEX_TYPE); if (Indexer.TYPE_DOCUMENT.equals(type) || Indexer.TYPE_FRAGMENT.equals(type)) { searchResult.setResource(false); try { MarkDownDocument markDownDocument = contentManager.getMarkDownDocument(documentPath, true, CriticProcessingMode.DO_NOTHING); String contents = markDownDocument.getMarkdown(); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); 
Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query, Indexer.INDEX_CONTENTS)); highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); TokenStream tokenStream = analyzer.tokenStream(Indexer.INDEX_CONTENTS, contents); TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, contents, false, 99999); for (TextFragment frag : frags) { if ((frag != null) && (frag.getScore() > 0)) { String fragmentText = frag.toString(); searchResult.addFragment( new Fragment(ThothCoreUtil.escapeHtmlExcept("B", fragmentText))); } } } catch (FileNotFoundException e) { LOG.warn( "Index contains an invalid file reference); probably need to reindex to get rid of this. File: " + e.getMessage()); } } else { searchResult.setResource(true); String extension = ThothUtil.getExtension(documentPath); searchResult.setImage(getConfiguration().isImageExtension(extension)); searchResult.addFragment(new Fragment(document.get(Indexer.INDEX_TITLE))); } searchResults.add(searchResult); } } reader.close(); linkBooks(searchResults); PagedList<SearchResult> pagedList = new PagedList<>(searchResults, hadMore); return pagedList; } catch (Exception e) { throw new SearchException(e); } }
From source file:net.sf.mmm.search.engine.impl.lucene.LuceneSearchHighlighter.java
License:Apache License
/** * The constructor.//from w ww. j a v a 2 s .co m * * @param searchAnalyzer is the {@link Analyzer} used by the {@link LuceneSearchEngine search-engine}. * @param formatter is the formatter used to highlight terms. * @param searchQuery is the {@link Query} of the search. Matching terms of this query shall be highlighted. */ public LuceneSearchHighlighter(Analyzer searchAnalyzer, Formatter formatter, Query searchQuery) { super(); this.highlighter = new Highlighter(formatter, new QueryScorer(searchQuery)); this.analyzer = searchAnalyzer; }
From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java
/**
 * Main search method, for internal use.
 *
 * Parses the query (with leading wildcards allowed), rewrites it to expand
 * multi-term/wildcard queries, runs the search (optionally filtered to a
 * scope), and returns highlighted result items.
 *
 * @param q query string
 * @return a list of highlighted string objects.
 * @throws SearchException if parsing, searching or highlighting fails
 */
private List<SearchResultItem> internalSearch(String q) throws SearchException {
    IndexSearcher is = null;
    try {
        is = new IndexSearcher(zekrIndexReader.indexReader);
        // analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        // resultTokenStream = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
        QueryParser parser = QueryParserFactory.create(Version.LUCENE_CURRENT, QuranTextIndexer.CONTENTS_FIELD,
                analyzer);
        // allow search terms like "*foo" with leading star
        parser.setAllowLeadingWildcard(true);
        // parser.setFuzzyPrefixLength(10);
        // If this rewrite method is not set, the highlighter doesn't work on wildcard
        // queries after query.rewrite(), and sorting is also incorrect for them.
        parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
        logger.debug("Parse query.");
        query = parser.parse(q);
        BooleanQuery.setMaxClauseCount(maxClauseCount);
        logger.debug("Rewrite query.");
        query = query.rewrite(zekrIndexReader.indexReader); // required to expand search terms
        logger.debug("Searching for: " + query.toString());
        // Hits hits;
        TopFieldDocs tops = null;
        is.setDefaultFieldSortScoring(true, true);
        if (searchScope != null && searchScope.getScopeItems().size() > 0) {
            // Restrict the search to the configured scope (range filter)
            String scopeQuery = makeSearchScope();
            logger.debug("Scope is: " + scopeQuery);
            // hits = is.search(query, new QuranRangeFilter(searchScope), sortResultOrder);
            tops = is.search(query, new QuranRangeFilter(searchScope), maxSearchResult, sortResultOrder);
        } else {
            // hits = is.search(query, new QueryWrapperFilter(query), 20, sortResultOrder);
            tops = is.search(query, new QueryWrapperFilter(query), maxSearchResult, sortResultOrder);
        }
        logger.debug("Highlight search result.");
        Highlighter highlighter = new Highlighter(highlightFormatter, new QueryScorer(query));
        // highlighter.setFragmentScorer(new QueryTermScorer(query));
        int total = Math.min(maxSearchResult, tops.totalHits);
        List<SearchResultItem> res = new ArrayList<SearchResultItem>(total);
        for (int i = 0; i < total; i++) {
            ScoreDoc[] sd = tops.scoreDocs;
            Document doc = is.doc(sd[i].doc);
            final String contents = doc.get(QuranTextIndexer.CONTENTS_FIELD);
            final IQuranLocation location = new QuranLocation(doc.get(QuranTextIndexer.LOCATION_FIELD));
            // Re-tokenize the stored contents so the highlighter can mark matches
            TokenStream tokenStream = analyzer.tokenStream(QuranTextIndexer.CONTENTS_FIELD,
                    new StringReader(contents));
            // String resultStr = highlighter.getBestFragment(tokenStream, contents);
            String resultStr = highlighter.getBestFragments(tokenStream, contents, 100, "...");
            SearchResultItem sri = new SearchResultItem(resultStr, location);
            res.add(sri);
        }
        matchedItemCount = highlightFormatter.getHighlightCount();
        // highlightedTermList = highlightFormatter.getHighlightedTermList();
        return res;
    } catch (Exception e) {
        throw new SearchException(e);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                // ignored: best-effort close of the searcher
            }
        }
    }
}
From source file:net.skyatlas.icd.test.AnsegTest.java
private String toHighlighter(Analyzer analyzer, Query query, Document doc) throws InvalidTokenOffsetsException { String field = "text"; try {/*from w w w.j a v a 2s. co m*/ SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>"); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); TokenStream tokenStream1 = analyzer.tokenStream("text", new StringReader(doc.get(field))); String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field)); return highlighterStr == null ? doc.get(field) : highlighterStr; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (InvalidTokenOffsetsException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; }
From source file:net.sourceforge.docfetcher.model.search.HighlightService.java
License:Open Source License
@MutableCopy @NotNull//from w ww. j a v a2 s .c o m private static List<Range> highlight(@NotNull Query query, @NotNull String text) throws CheckedOutOfMemoryError { final List<Range> ranges = new ArrayList<Range>(); /* * A formatter is supposed to return formatted text, but since we're * only interested in the start and end offsets of the search terms, we * return null and store the offsets in a list. */ Formatter nullFormatter = new Formatter() { public String highlightTerm(String originalText, TokenGroup tokenGroup) { for (int i = 0; i < tokenGroup.getNumTokens(); i++) { Token token = tokenGroup.getToken(i); if (tokenGroup.getScore(i) == 0) continue; int start = token.startOffset(); int end = token.endOffset(); ranges.add(new Range(start, end - start)); } return null; } }; String key = Fields.CONTENT.key(); Highlighter highlighter = new Highlighter(nullFormatter, new QueryScorer(query, key)); highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); highlighter.setTextFragmenter(new NullFragmenter()); try { /* * This has a return value, but we ignore it since we only want the * offsets. Might throw an OutOfMemoryError. */ highlighter.getBestFragment(IndexRegistry.getAnalyzer(), key, text); } catch (OutOfMemoryError e) { throw new CheckedOutOfMemoryError(e); } catch (Exception e) { Util.printErr(e); } return ranges; }
From source file:net.sourceforge.docfetcher.view.PreviewPanel.java
License:Open Source License
/**
 * Sets the file to be displayed, using <tt>parser</tt> to extract the
 * text from the file on the disk. This method does nothing if the given
 * file is null. The <tt>force</tt> parameter specifies whether the
 * preview should be updated even if neither the file nor the search terms
 * have changed in the meantime.
 *
 * Parsing runs on a background thread; PreviewPanel.this.file is re-checked
 * afterwards so stale results from superseded requests are discarded.
 */
private void setFile(final File file, final Parser parser, final Query query, boolean force) {
    File lastFile = this.file;
    Query lastQuery = this.query;
    this.file = file;
    this.parser = parser;
    this.query = query;

    // Check input
    if (file == null)
        return;
    if (parser == null) // Allowed to be null if file is null, too
        throw new IllegalArgumentException();
    if (!isActive)
        return;
    if (file.equals(lastFile) && !force)
        if (query != null && query.equals(lastQuery))
            return;
    if (file.isDirectory())
        throw new IllegalStateException("File expected for preview, got directory instead."); //$NON-NLS-1$
    if (!file.exists()) {
        textViewer.setText(Msg.file_not_found.value());
        showViewer(textViewerContainer);
        return;
    }

    // Use the HTML browser
    if (file.getAbsolutePath().equals(Const.HELP_FILE) || Pref.Bool.PreviewHTML.getValue()) {
        final BrowserPanel browser = browserProvider.getBrowser(previewPanel, browserToolBar, parser);
        if (browser != null) {
            browser.addProgressListener(new ProgressAdapter() {
                public void completed(ProgressEvent event) {
                    showViewer(browser);
                    upBt.setEnabled(false);
                    downBt.setEnabled(false);
                    occurrenceCounter.setText("0"); //$NON-NLS-1$
                }
            });
            browser.setFile(file);
            return;
        }
        // Browser creation failed, go on to next code block
    }

    // Use text renderers
    showViewer(textViewerContainer);

    // Use monospace font for text files
    if (parser instanceof TextParser) {
        org.eclipse.swt.graphics.Font monoFont = Font.PREVIEW_MONO.getFont();
        if (!textViewer.getFont().equals(monoFont))
            textViewer.setFont(monoFont);
    } else {
        org.eclipse.swt.graphics.Font previewFont = Font.PREVIEW.getFont();
        if (!textViewer.getFont().equals(previewFont))
            textViewer.setFont(previewFont);
    }

    textViewer.setText(Msg.loading.value()); // display loading message

    new Thread() { // run in a thread because parsing the file takes some time
        public void run() {
            // Extract the raw text from the file
            String text;
            boolean fileParsed = true;
            try {
                text = parser.renderText(file);
            } catch (ParseException e) {
                text = Msg.cant_read_file.format(e.getMessage());
                fileParsed = false;
            } catch (OutOfMemoryError e) {
                /*
                 * We can get here if the user sets a high java heap space
                 * value during indexing and then sets a lower value for
                 * search only usage.
                 */
                text = Msg.out_of_jvm_memory.value();
                fileParsed = false;
            }

            if (PreviewPanel.this.file != file)
                return; // Another preview request had been started while we were parsing

            /*
             * Create the message that will be displayed if the character limit
             * is reached. It is appended to the file contents later; if it
             * was appended here, some words in it might get highlighted.
             */
            int maxLength = Pref.Int.PreviewLimit.getValue();
            final String msg = "...\n\n\n[" //$NON-NLS-1$
                    + Msg.preview_limit_hint.format(new Object[] { maxLength, Pref.Int.PreviewLimit.name(),
                            Const.USER_PROPERTIES_FILENAME })
                    + "]"; //$NON-NLS-1$
            final boolean exceeded = text.length() > maxLength;
            if (text.length() > maxLength)
                text = text.substring(0, maxLength - msg.length());
            final String fText = text;

            /*
             * Create StyleRange ranges (i.e. start-end integer pairs) for
             * search term highlighting. Only tokenize preview text if we're
             * not displaying any info messages and if there are tokens to
             * highlight.
             */
            ranges = new int[0];
            if (fileParsed && query != null) {
                final List<Integer> rangesList = new ArrayList<Integer>();
                Analyzer analyzer = RootScope.analyzer;
                /*
                 * A formatter is supposed to return formatted text, but
                 * since we're only interested in the start and end offsets
                 * of the search terms, we return null and store the offsets
                 * in a list.
                 */
                Formatter nullFormatter = new Formatter() {
                    public String highlightTerm(String originalText, TokenGroup tokenGroup) {
                        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                            Token token = tokenGroup.getToken(i);
                            if (tokenGroup.getScore(i) == 0)
                                continue;
                            int start = token.startOffset();
                            int end = token.endOffset();
                            rangesList.add(start);
                            rangesList.add(end - start);
                        }
                        return null;
                    }
                };
                Highlighter highlighter = new Highlighter(nullFormatter,
                        new QueryScorer(query, Document.contents));
                highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
                highlighter.setTextFragmenter(new NullFragmenter());
                try {
                    /*
                     * This has a return value, but we ignore it since we
                     * only want the offsets.
                     */
                    highlighter.getBestFragment(analyzer, Document.contents, fText);
                } catch (Exception e) {
                    // We can do without the search term highlighting
                }
                // List to array (will be used by the method 'setHighlighting(..)')
                ranges = new int[rangesList.size()];
                for (int i = 0; i < ranges.length; i++)
                    ranges[i] = rangesList.get(i);
            }

            // Parsing and tokenizing done; display the results
            final boolean fFileParsed = fileParsed;
            Display.getDefault().syncExec(new Runnable() {
                public void run() {
                    // Enable or disable up and down buttons
                    upBt.setEnabled(ranges.length != 0);
                    downBt.setEnabled(ranges.length != 0);
                    textViewer.setText(fText);
                    setHighlighting(fFileParsed && Pref.Bool.HighlightSearchTerms.getValue());
                    occurrenceCounter.setText(Integer.toString(ranges.length / 2));
                    if (exceeded)
                        textViewer.append(msg); // character limit exceeded, append hint
                }
            });
        }
    }.start();
}
From source file:net.sourceforge.vaticanfetcher.model.search.HighlightService.java
License:Open Source License
/**
 * Computes the character ranges of all query matches in {@code text} for the
 * CONTENT field. The highlighter's formatted output is discarded; a no-op
 * formatter records (start, length) pairs as a side effect.
 */
@MutableCopy
@NotNull
private static List<Range> highlight(@NotNull Query query, @NotNull String text) throws CheckedOutOfMemoryError {
    final List<Range> ranges = new ArrayList<Range>();
    // Collects match offsets instead of producing formatted text.
    Formatter offsetCollector = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            int tokenCount = tokenGroup.getNumTokens();
            for (int i = 0; i < tokenCount; i++) {
                if (tokenGroup.getScore(i) == 0)
                    continue; // non-matching token
                Token token = tokenGroup.getToken(i);
                int start = token.startOffset();
                ranges.add(new Range(start, token.endOffset() - start));
            }
            return null;
        }
    };
    String key = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(offsetCollector, new QueryScorer(query, key));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
        // Return value deliberately ignored; only the recorded offsets matter.
        // Might throw an OutOfMemoryError on very large inputs.
        highlighter.getBestFragment(IndexRegistry.analyzer, key, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
        Util.printErr(e);
    }
    return ranges;
}
From source file:org.apache.blur.utils.HighlightHelper.java
License:Apache License
/**
 * Builds a copy of {@code document} in which field values that match the
 * query are wrapped in {@code preTag}/{@code postTag}. Field-less and
 * excluded fields are passed through untouched.
 *
 * NOTE: This method will not preserve the correct field types.
 *
 * @param preTag opening highlight tag
 * @param postTag closing highlight tag
 */
public static Document highlight(int docId, Document document, Query query, FieldManager fieldManager,
        IndexReader reader, String preTag, String postTag) throws IOException, InvalidTokenOffsetsException {
    String fieldLessFieldName = fieldManager.getFieldLessFieldName();
    Query fixedQuery = fixSuperQuery(query, null, fieldLessFieldName);
    Analyzer analyzer = fieldManager.getAnalyzerForQuery();
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(preTag, postTag);

    Document result = new Document();
    for (IndexableField field : document) {
        String name = field.name();
        // Fields that must never be highlighted are copied through verbatim.
        if (fieldLessFieldName.equals(name) || FIELDS_NOT_TO_HIGHLIGHT.contains(name)) {
            result.add(field);
            continue;
        }
        String text = field.stringValue();
        Number numericValue = field.numericValue();
        // Field-less indexed fields need the super-query rewritten per field name.
        Query fieldFixedQuery = fieldManager.isFieldLessIndexed(name)
                ? fixSuperQuery(query, name, fieldLessFieldName)
                : fixedQuery;
        if (numericValue != null) {
            // Numeric fields cannot go through the text highlighter; wrap manually.
            if (shouldNumberBeHighlighted(name, numericValue, fieldFixedQuery)) {
                String numberHighlight = preTag + text + postTag;
                result.add(new StringField(name, numberHighlight, Store.YES));
            }
        } else {
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(fieldFixedQuery, name));
            TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, name, analyzer);
            TextFragment[] fragments = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            for (TextFragment fragment : fragments) {
                if (fragment != null && fragment.getScore() > 0) {
                    result.add(new StringField(name, fragment.toString(), Store.YES));
                }
            }
        }
    }
    return result;
}
From source file:org.apache.jena.query.text.TextIndexLucene.java
License:Apache License
private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field, String highlight) throws IOException, InvalidTokenOffsetsException { List<TextHit> results = new ArrayList<>(); HighlightOpts opts = new HighlightOpts(highlight); SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end); Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize)); for (ScoreDoc sd : sDocs) { Document doc = indexSearcher.doc(sd.doc); log.trace("highlightResults[{}]: {}", sd.doc, doc); String entity = doc.get(docDef.getEntityField()); Node literal = null;//from w w w .j a v a2 s .c o m String lexical = doc.get(field); if (lexical != null) { String docLang = doc.get(docDef.getLangField()); TokenStream tokenStream = analyzer.tokenStream(field, lexical); TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags, opts.maxFrags); String rez = frags2string(frags, opts); literal = NodeFactory.createLiteral(rez, docLang); } String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null; Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null; Node subject = TextQueryFuncs.stringToNode(entity); TextHit hit = new TextHit(subject, sd.score, literal, graph); results.add(hit); } return results; }