public final String[] getBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException 

Highlights chosen terms in a text, extracting the most relevant sections.


From source file:ca.uhn.fhir.jpa.dao.FhirSearchDao.java

License:Apache License

/**
 * Produces keyword suggestions for {@code theText} within a patient "$everything" context.
 *
 * <p>The context must split on '/' into exactly three parts, the first being {@code "Patient"}
 * and the third {@code "$everything"}; the middle part is used as the patient id. A full-text
 * query is run against {@code ResourceTable}, and for every result row the first column (a
 * String) is passed through a Lucene {@code Highlighter} whose formatter is a
 * {@code MySuggestionFormatter} constructed with the {@code suggestions} list.
 *
 * <p>NOTE(review): this listing is truncated - several statements are cut off mid-expression
 * and closing braces are missing (e.g. the {@code textQuery} / {@code query} builders and the
 * duplicate-removal loop). The missing parts are left untouched here and must be restored from
 * the upstream source.
 *
 * @param theContext context string of the form {@code Patient/<id>/$everything}; must not be blank
 * @param theSearchParam search parameter name; must not be blank
 * @param theText text to produce suggestions for; must not be blank
 * @return the list of suggestions that was handed to the formatter
 * @throws InvalidRequestException if {@code theContext} does not have the expected shape
 * @throws InternalErrorException wrapping any exception raised while highlighting
 */
public List<Suggestion> suggestKeywords(String theContext, String theSearchParam, String theText) {
    Validate.notBlank(theContext, "theContext must be provided");
    Validate.notBlank(theSearchParam, "theSearchParam must be provided");
    // Fixed: the message previously named theSearchParam although theText is being validated.
    Validate.notBlank(theText, "theText must be provided");

    long start = System.currentTimeMillis();

    String[] contextParts = StringUtils.split(theContext, '/');
    if (contextParts.length != 3 || "Patient".equals(contextParts[0]) == false
            || "$everything".equals(contextParts[2]) == false) {
        throw new InvalidRequestException("Invalid context: " + theContext);
    }
    IdDt contextId = new IdDt(contextParts[0], contextParts[1]);
    Long pid = BaseHapiFhirDao.translateForcedIdToPid(contextId, myEntityManager);

    FullTextEntityManager em = org.hibernate.search.jpa.Search.getFullTextEntityManager(myEntityManager);

    QueryBuilder qb = em.getSearchFactory().buildQueryBuilder().forEntity(ResourceTable.class).get();

    // NOTE(review): the next two builder chains are truncated in this listing.
    Query textQuery = qb.phrase().withSlop(2).onField("myContentText").boostedTo(4.0f)

    Query query = qb.bool()

    FullTextQuery ftq = em.createFullTextQuery(query, ResourceTable.class);

    List<?> resultList = ftq.getResultList();
    List<Suggestion> suggestions = Lists.newArrayList();
    for (Object next : resultList) {
        // Each result row is an Object[] whose first element is the text to highlight.
        Object[] nextAsArray = (Object[]) next;
        String nextValue = (String) nextAsArray[0];

        try {
            MySuggestionFormatter formatter = new MySuggestionFormatter(theText, suggestions);
            Scorer scorer = new QueryScorer(textQuery);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            Analyzer analyzer = em.getSearchFactory().getAnalyzer(ResourceTable.class);

            // The return values are discarded; presumably the formatter collects the matches
            // into the suggestions list as a side effect - confirm in MySuggestionFormatter.
            highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue,

            highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);

            highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue,

            // formatter.setAnalyzer("myContentText");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentText", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextNGram");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextEdgeNGram");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextPhonetic");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue, 10);
        } catch (Exception e) {
            throw new InternalErrorException(e);



    // Track lower-cased terms already seen; the branch taken for a repeated term is missing
    // from this listing (presumably it removes the duplicate via the iterator - confirm).
    Set<String> terms = Sets.newHashSet();
    for (Iterator<Suggestion> iter = suggestions.iterator(); iter.hasNext();) {
        String nextTerm = iter.next().getTerm().toLowerCase();
        if (!terms.add(nextTerm)) {

    long delay = System.currentTimeMillis() - start;
    ourLog.info("Provided {} suggestions for term {} in {} ms", new Object[] { terms.size(), theText, delay });

    return suggestions;

From source file:com.bluedragon.search.search.QueryRun.java

License:Open Source License

/**
 * Appends one result row to {@code queryResultData} for the Lucene document {@code docid}.
 *
 * <p>Columns 1-6 are always filled (id, name, score, searchCount, recordsSearched, rank + 1).
 * The document's fields are then walked and written to per-field columns that are created on
 * first use, and, when {@code queryAttributes.getContextPassages() > 0}, highlighted fragments
 * of the document contents are written to a "CONTEXT" column.
 *
 * <p>NOTE(review): this listing is truncated - several statements are cut off and closing
 * braces are missing, so the control flow below is incomplete as shown.
 *
 * @param searcher searcher used to load the document
 * @param docid Lucene document id of the hit
 * @param score score of the hit; stored in column 3
 * @param rank rank of the hit; stored as {@code rank + 1} in column 6
 * @param searchCount stored in column 4
 * @param recordsSearched stored in column 5
 * @throws CorruptIndexException as declared by the signature
 * @throws Exception as declared by the signature
 */
private void addRow(IndexSearcher searcher, int docid, float score, int rank, int searchCount,
        int recordsSearched) throws CorruptIndexException, Exception {
    DocumentWrap document = new DocumentWrap(searcher.doc(docid));

    queryResultData.addRow(1);

    // Add in the standard columns that we know we have for every search
    queryResultData.setCell(1, new cfStringData(document.getId()));
    queryResultData.setCell(2, new cfStringData(document.getName()));
    queryResultData.setCell(3, new cfNumberData(score));
    queryResultData.setCell(4, new cfNumberData(searchCount));
    queryResultData.setCell(5, new cfNumberData(recordsSearched));
    queryResultData.setCell(6, new cfNumberData(rank + 1));

    String uC = queryAttributes.getUniqueColumn();

    // Now we do the custom ones
    List<IndexableField> fields = document.getDocument().getFields();
    Iterator<IndexableField> it = fields.iterator();
    while (it.hasNext()) {
        IndexableField fieldable = it.next();

        // Column lookup is case-insensitive: keys in activeColumns are lower-cased field names.
        String fieldName = fieldable.name().toLowerCase();

        // Check for the unique
        // NOTE(review): both branches of the contains() check are missing from this listing.
        if (uniqueSet != null && fieldName.equals(uC)) {
            if (uniqueSet.contains(fieldable.stringValue())) {
            } else

        // Check to see if we have this column
        // NOTE(review): the statement guarded by the next condition is missing from this listing.
        if (fieldName.equals("contents") && !queryAttributes.getContentFlag())

        // First time this field is seen: create an upper-cased column and remember its index.
        if (!activeColumns.containsKey(fieldName)) {
            int newcolumn = queryResultData.addColumnData(fieldable.name().toUpperCase(),
                    cfArrayData.createArray(1), null);
            activeColumns.put(fieldName, newcolumn);

        int column = activeColumns.get(fieldName);
        // Columns 1-6 are the standard ones filled above; the guarded statement is missing here.
        if (column <= 6)

        queryResultData.setCell(column, new cfStringData(fieldable.stringValue()));

    // Do the context stuff if enable
    if (queryAttributes.getContextPassages() > 0) {

        Scorer scorer = new QueryScorer(queryAttributes.getQuery());
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(queryAttributes.getContextHighlightStart(),
        Highlighter highlighter = new Highlighter(formatter, scorer);
        // NOTE(review): no call attaching this fragmenter to the highlighter is visible here.
        Fragmenter fragmenter = new SimpleFragmenter(queryAttributes.getContextBytes());

        String nextContext = "";
        String contents = document.getAttribute(DocumentWrap.CONTENTS);

        if (contents != null) {
            TokenStream tokenStream = AnalyzerFactory.get("simple").tokenStream(DocumentWrap.CONTENTS,
                    new StringReader(contents));
            String[] fragments = null;
            try {
                fragments = highlighter.getBestFragments(tokenStream, contents,
                // A single fragment gets a trailing ellipsis; multiple fragments are joined
                // through a StringBuilder (the loop body is missing from this listing).
                if (fragments.length == 1) {
                    nextContext = fragments[0] + "...";
                } else {
                    StringBuilder context = new StringBuilder();
                    for (int f = 0; f < fragments.length; f++) {
                    nextContext = context.toString();
            } catch (Exception e) {

            // Add in the context
            if (!activeColumns.containsKey("context")) {
                int newcolumn = queryResultData.addColumnData("CONTEXT", cfArrayData.createArray(1), null);
                activeColumns.put("context", newcolumn);

            queryResultData.setCell(activeColumns.get("context"), new cfStringData(nextContext));

From source file:com.difference.historybook.index.lucene.LuceneIndex.java

License:Apache License

/**
 * Searches one collection and returns grouped, highlighted results.
 *
 * <p>The parsed user query (required) is combined with a term query on the collection field,
 * wrapped in a {@code CustomScoreQuery} with a reciprocal function of a
 * {@code DurationValueSource} built from the current time in seconds and the timestamp field,
 * and executed through a {@code GroupingSearch} on the URL-group field with one document per
 * group. For each group, up to three highlighted fragments of the search field are joined with
 * newlines and cleaned with Jsoup's {@code simpleText} whitelist.
 *
 * <p>NOTE(review): this listing is truncated - some statements are cut off and closing braces
 * are missing.
 *
 * @param collection collection to restrict the search to
 * @param query query string handed to {@code parser}
 * @param offset offset passed to the grouping search
 * @param size maximum number of groups requested
 * @param includeDebug when true, the score explanation is attached to each result
 * @return wrapper holding the query, offset, result count and (per the visible code) results
 * @throws IndexException wrapping IOException, ParseException or InvalidTokenOffsetsException
 */
public SearchResultWrapper search(String collection, String query, int offset, int size, boolean includeDebug)
        throws IndexException {
    try {
        //TODO: make age be a component in the ranking?
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        queryBuilder.add(parser.parse(query), Occur.MUST);
        // NOTE(review): the Occur argument of this clause is cut off in this listing.
        queryBuilder.add(new TermQuery(new Term(IndexDocumentAdapter.FIELD_COLLECTION, collection)),
        Query baseQuery = queryBuilder.build();

        // Score boost computed from the document timestamp relative to "now" (epoch seconds).
        FunctionQuery boostQuery = new FunctionQuery(
                new ReciprocalFloatFunction(new DurationValueSource(new Date().getTime() / 1000,
                        new LongFieldSource(IndexDocumentAdapter.FIELD_TIMESTAMP)), RECIP, 1F, 1F));

        Query q = new CustomScoreQuery(baseQuery, boostQuery);

        QueryScorer queryScorer = new QueryScorer(q, IndexDocumentAdapter.FIELD_SEARCH);
        // NOTE(review): no call attaching this fragmenter to the highlighter is visible here.
        Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        Highlighter highlighter = new Highlighter(queryScorer);

        GroupingSearch gsearch = new GroupingSearch(IndexDocumentAdapter.FIELD_URL_GROUP).setGroupDocsLimit(1)
        TopGroups<?> groups = gsearch.search(searcher, q, offset, size);

        ArrayList<SearchResult> results = new ArrayList<>(size);
        // NOTE(review): offset is already passed to gsearch.search(...) above; starting the
        // index at offset again may skip groups when offset > 0 - verify against the
        // GroupingSearch contract.
        for (int i = offset; i < offset + size && i < groups.groups.length; i++) {
            // Only the top document of each group is used (group docs limit is 1).
            ScoreDoc scoreDoc = groups.groups[i].scoreDocs[0];
            Document luceneDoc = searcher.doc(scoreDoc.doc);
            IndexDocumentAdapter doc = new IndexDocumentAdapter(luceneDoc);

            TokenStream tokenStream = TokenSources.getTokenStream(IndexDocumentAdapter.FIELD_SEARCH,
                    reader.getTermVectors(scoreDoc.doc), luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH),
                    analyzer, highlighter.getMaxDocCharsToAnalyze() - 1);

            // Up to three best fragments, joined by newline, then cleaned via Jsoup.
            String[] snippets = highlighter.getBestFragments(tokenStream,
                    luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH), 3);
            String snippet = Arrays.asList(snippets).stream().collect(Collectors.joining("\n"));
            snippet = Jsoup.clean(snippet, Whitelist.simpleText());

            String debugInfo = null;
            if (includeDebug) {
                Explanation explanation = searcher.explain(q, scoreDoc.doc);
                debugInfo = explanation.toString();

            results.add(new SearchResult(doc.getKey(), doc.getCollection(), doc.getTitle(), doc.getUrl(),
                    doc.getDomain(), doc.getTimestampText(), snippet, debugInfo, scoreDoc.score));

        // NOTE(review): the remainder of this builder chain is missing from this listing.
        SearchResultWrapper wrapper = new SearchResultWrapper().setQuery(query).setOffset(offset)
                .setResultCount(groups.totalGroupCount != null ? groups.totalGroupCount : 0)

        if (includeDebug) {

        return wrapper;

    } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
        throw new IndexException(e);

From source file:com.ecyrd.jspwiki.search.LuceneSearchProvider.java

License:Apache License

/**
 *  Searches pages using a particular combination of flags.
 *
 *  @param query The query to perform in Lucene query language
 *  @param flags A set of flags
 *  @return A Collection of SearchResult instances
 *  @throws ProviderException if there is a problem with the backend
 */
public Collection findPages(String query, int flags) throws ProviderException {
    // NOTE(review): this listing is truncated - closing braces and some statements are missing.
    Searcher searcher = null;
    ArrayList<SearchResult> list = null;
    // Stays null unless FLAG_CONTEXTS is set; a null highlighter means no fragments are produced.
    Highlighter highlighter = null;

    try {
        QueryParser qp = new MultiFieldQueryParser(queryfields, getLuceneAnalyzer());

        //QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
        Query luceneQuery = qp.parse(query);

        if ((flags & FLAG_CONTEXTS) != 0) {
            highlighter = new Highlighter(new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                    new SimpleHTMLEncoder(), new QueryScorer(luceneQuery));

        // If the index cannot be opened the method logs and returns null rather than throwing.
        try {
            searcher = new IndexSearcher(m_luceneDirectory);
        } catch (Exception ex) {
            log.info("Lucene not yet ready; indexing not started", ex);
            return null;

        Hits hits = searcher.search(luceneQuery);

        list = new ArrayList<SearchResult>(hits.length());
        for (int curr = 0; curr < hits.length(); curr++) {
            Document doc = hits.doc(curr);
            String pageName = doc.get(LUCENE_ID);
            WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

            if (page != null) {
                if (page instanceof Attachment) {
                    // Currently attachments don't look nice on the search-results page
                    // When the search-results are cleaned up this can be enabled again.

                // Lucene score scaled by 100 and truncated to int.
                int score = (int) (hits.score(curr) * 100);

                // Get highlighted search contexts
                String text = doc.get(LUCENE_PAGE_CONTENTS);

                String[] fragments = new String[0];
                if (text != null && highlighter != null) {
                    TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS,
                            new StringReader(text));
                    fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);


                // NOTE(review): the statement adding this result to the list is not visible here.
                SearchResult result = new SearchResultImpl(page, score, fragments);
            } else {
                // The index references a page that can no longer be loaded: report it as removed.
                log.error("Lucene found a result page '" + pageName
                        + "' that could not be loaded, removing from Lucene cache");
                pageRemoved(new WikiPage(m_engine, pageName));
    } catch (IOException e) {
        log.error("Failed during lucene search", e);
    } catch (InstantiationException e) {
        log.error("Unable to get a Lucene analyzer", e);
    } catch (IllegalAccessException e) {
        log.error("Unable to get a Lucene analyzer", e);
    } catch (ClassNotFoundException e) {
        log.error("Specified Lucene analyzer does not exist", e);
    } catch (ParseException e) {
        log.info("Broken query; cannot parse", e);

        // Only a malformed query is surfaced to the caller; the other failures above are logged.
        throw new ProviderException("You have entered a query Lucene cannot process: " + e.getMessage());
    } finally {
        // NOTE(review): the body of this try (presumably closing the searcher) is missing here.
        if (searcher != null) {
            try {
            } catch (IOException e) {

    return list;

From source file:com.gauronit.tagmata.core.Indexer.java

License:Open Source License

/**
 * Searches the named indexes for {@code searchText} and returns a snapshot per hit.
 *
 * <p>A single {@code BooleanQuery} is populated through {@code IndexerUtil.getTokenizedQuery}
 * for each enabled field ("title", "tags", "text" and "analyzedText"), then run against every
 * searcher with a collector limited to 10000 hits. Each snapshot carries up to five highlighted
 * fragments joined with "..."; when there are none, the text itself is used, cut to its first
 * 100 characters when longer.
 *
 * <p>NOTE(review): this listing is truncated - closing braces and some statements are missing
 * (for example, nothing visible adds a searcher to {@code searchers}).
 *
 * @param searchText text to search for
 * @param indexNames names of the index directories under {@code indexDir}
 * @param searchInTitle whether to query the "title" field
 * @param searchInTags whether to query the "tags" field
 * @param searchInText whether to query the "text" and "analyzedText" fields
 * @param superFuzzy passed through to {@code IndexerUtil.getTokenizedQuery}
 * @return the collected card snapshots
 */
public ArrayList<CardSnapshot> search(String searchText, ArrayList<String> indexNames, boolean searchInTitle,
        boolean searchInTags, boolean searchInText, boolean superFuzzy) {
    // NOTE(review): raw ArrayList; the declared element type is CardSnapshot.
    ArrayList<CardSnapshot> cardSnaps = new ArrayList();
    try {
        ArrayList<IndexSearcher> searchers = new ArrayList<IndexSearcher>();

        for (String indexName : indexNames) {
            IndexReader reader = IndexReader
                    .open(FSDirectory.open(new File(indexDir + File.separator + indexName),
                            new SimpleFSLockFactory(indexDir + File.separator + indexName)));
            IndexSearcher searcher = new IndexSearcher(reader);

        BooleanQuery query = new BooleanQuery();
        if (searchInTitle) {
            IndexerUtil.getTokenizedQuery(query, "title", searchText, superFuzzy);
        if (searchInTags) {
            IndexerUtil.getTokenizedQuery(query, "tags", searchText, superFuzzy);
        if (searchInText) {
            IndexerUtil.getTokenizedQuery(query, "text", searchText, superFuzzy);
            IndexerUtil.getTokenizedQuery(query, "analyzedText", searchText, superFuzzy);

        for (IndexSearcher searcher : searchers) {
            TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            for (ScoreDoc hit : hits) {
                Document doc = searcher.doc(hit.doc);

                // Tokens come from the "analyzedText" value while the fragments are cut from
                // the "text" value. NOTE(review): a new StandardAnalyzer is built per hit.
                TokenStream stream = TokenSources.getTokenStream("text", doc.get("analyzedText"),
                        new StandardAnalyzer(Version.LUCENE_20.LUCENE_35));
                QueryScorer scorer = new QueryScorer(query, "analyzedText");
                // NOTE(review): no call attaching this fragmenter to the highlighter is visible here.
                Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 20);
                Highlighter highlighter = new Highlighter(scorer);
                String[] fragments = highlighter.getBestFragments(stream, doc.get("text"), 5);
                String highlights = "";

                for (String fragment : fragments) {
                    highlights += fragment + "...";

                // No fragments: fall back to the raw text, limited to 100 characters.
                // NOTE(review): text is dereferenced without a null check.
                if (highlights.equals("")) {
                    String text = doc.get("text");
                    if (text.length() > 100) {
                        highlights += doc.get("text").substring(0, 100);
                    } else {
                        highlights += doc.get("text");

                cardSnaps.add(new CardSnapshot(highlights, doc));
            searcher = null;

    // NOTE(review): the handler body is not visible in this listing.
    } catch (Exception ex) {
    return cardSnaps;

From source file:com.main.Searcher.java

/**
 * Runs a two-term boolean search over the "contents" field and highlights the top ten hits.
 *
 * <p>{@code radioBtn} selects how the two parsed queries are combined: "conjunction" requires
 * both, "disjunction" adds both as optional clauses, and any other value requires {@code s1}
 * while excluding {@code s2}.
 *
 * <p>NOTE(review): this listing is truncated - closing braces and statements are missing, and
 * {@code beanList} is never declared in the visible lines.
 *
 * @param s1 first query string
 * @param s2 second query string
 * @param radioBtn combination mode, see above
 * @return {@code beanList}
 * @throws IOException as declared by the signature
 * @throws ParseException as declared by the signature
 * @throws InvalidTokenOffsetsException as declared by the signature
 */
public List<Bean> searching(String s1, String s2, String radioBtn)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    //getting reference of directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));

    //Index reader - an interface for accessing a point-in-time view of a lucene index
    // NOTE(review): no close() of this reader is visible in this listing.
    IndexReader reader = DirectoryReader.open(dir);

    IndexSearcher searcher = new IndexSearcher(reader);
    //analyzer with the default stop words, takes out the stop words
    Analyzer analyzer = new StandardAnalyzer();

    String contents = "contents";

    QueryParser parser = new QueryParser(contents, analyzer);

    int numOfDoc = reader.numDocs();

    // NOTE(review): every document is loaded here but the result is not used in the visible code.
    for (int i = 0; i < numOfDoc; i++) {

        Document d = reader.document(i);

    }

    Query q1 = parser.parse(s1);
    Query q2 = parser.parse(s2);

    //conjunction, disjunction and negation
    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    //occur.must : both queries required in a doc
    if (radioBtn.equals("conjunction")) {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST);
    } //occur.should: both queries are optional clauses
    else if (radioBtn.equals("disjunction")) {
        bq.add(q1, BooleanClause.Occur.SHOULD);
        bq.add(q2, BooleanClause.Occur.SHOULD);
    } //negation: first should be present, second should not
    else {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST_NOT);

    TopDocs hits = searcher.search(bq.build(), 10);

    Formatter formatter = new SimpleHTMLFormatter();

    QueryScorer scorer = new QueryScorer(bq.build());

    //used to markup highlighted terms found in the best sections of a cont
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //It breaks cont up into same-size texts but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
    //breaks cont up into same-size fragments with no concerns over spotting sentence boundaries.

    //set fragmenter to highlighter
    // NOTE(review): the statement this comment describes is missing from this listing.

    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();

        int outResult = hits.scoreDocs.length;
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        Document doc = searcher.doc(docid);

        String name = doc.get("name");
        String title = doc.get("title");

        String path = doc.get("path");

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted cont fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {



    // }
    return beanList;

From source file:com.main.Searcher.java

/**
 * Runs a single-term search over the "contents" field and highlights the top ten hits.
 *
 * <p>NOTE(review): this listing is truncated - closing braces and statements are missing, and
 * {@code beanList} is never declared in the visible lines.
 *
 * @param s1 query string parsed against the "contents" field
 * @return {@code beanList}
 * @throws IOException as declared by the signature
 * @throws ParseException as declared by the signature
 * @throws InvalidTokenOffsetsException as declared by the signature
 */
public List<Bean> searching(String s1) throws IOException, ParseException, InvalidTokenOffsetsException {
    //Get directory reference
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    //Index reader - an interface for accessing a point-in-time view of a lucene index
    // NOTE(review): no close() of this reader is visible in this listing.
    IndexReader reader = DirectoryReader.open(dir);
    //Create lucene searcher. It searches over a single IndexReader.
    IndexSearcher searcher = new IndexSearcher(reader);
    //analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer();
    //Query parser to be used for creating TermQuery

    String queries = null;
    String queryString = null; //regular search
    String contents = "contents";
    // queries is always null at this point, so the System.in branch is the one taken.
    // NOTE(review): the reader "in" is not used or closed anywhere in the visible code.
    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    QueryParser parser = new QueryParser(contents, analyzer);

    int numOfDoc = reader.numDocs();

    // NOTE(review): every document is loaded here but the result is not used in the visible code.
    for (int i = 0; i < numOfDoc; i++) {

        Document d = reader.document(i);


    Query q1 = parser.parse(s1);

    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    bq.add(q1, BooleanClause.Occur.MUST);
    //Search the lucene documents
    TopDocs hits = searcher.search(bq.build(), 10);
    // TopScoreDocCollector collector = TopScoreDocCollector.create(5);
    // NOTE(review): the next line is the remnant of a block comment whose delimiters were lost.
     * Highlighter Code Start ***
    //Uses HTML &lt;B&gt;&lt;/B&gt; tag to highlight the searched terms
    Formatter formatter = new SimpleHTMLFormatter();
    //It scores cont fragments by the number of unique q1 terms found
    //Basically the matching score in layman terms
    QueryScorer scorer = new QueryScorer(bq.build());
    //used to markup highlighted terms found in the best sections of a cont
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //It breaks cont up into same-size texts but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
    //breaks cont up into same-size fragments with no concerns over spotting sentence boundaries.

    //set fragmenter to highlighter
    // NOTE(review): the statement this comment describes is missing from this listing.
    //Iterate over found results
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        //int rank = hits.scoreDocs.length;
        int outResult = hits.scoreDocs.length;
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        Document doc = searcher.doc(docid);
        // String title = doc.get("title");
        String name = doc.get("name");
        String title = doc.get("title");

        String path = doc.get("path");

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted cont fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {



    // }
    return beanList;

From source file:com.oneis.app.SearchResultExcerptHighlighter.java

License:Mozilla Public License

/**
 * Returns the best excerpts of {@code escapedText}, with matches of {@code searchTerms}
 * wrapped in {@code <b>...</b>}.
 *
 * <p>When {@code searchTerms} is null or empty, a {@code NoHighlightingScorer} and a plain
 * {@code SimpleFragmenter} are used instead of a query-based scorer. Any exception is logged
 * at info level and {@code null} is returned.
 *
 * <p>NOTE(review): closing braces are missing from this listing, and no call attaching
 * {@code fragmenter} to the highlighter is visible.
 *
 * @param escapedText text to excerpt; it is both tokenised and used as the fragment source
 * @param searchTerms query string parsed with a {@code StandardAnalyzer}; may be null or empty
 * @param maxExcerptLength fragment size handed to the fragmenter
 * @return up to {@code NUMBER_OF_FRAGMENTS} fragments, or {@code null} if an exception occurred
 */
static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) {
    try {
        // Scorer selects the terms which need highlighting. Created from a 'query' based on the extracted search terms.
        Scorer scorer;
        Fragmenter fragmenter;
        if (searchTerms != null && searchTerms.length() > 0) {
            QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer());
            Query query = queryParser.parse(searchTerms);
            scorer = new QueryScorer(query);
            fragmenter = new SimpleSpanFragmenter((QueryScorer) scorer, maxExcerptLength);
        } else {
            scorer = new NoHighlightingScorer();
            fragmenter = new SimpleFragmenter(maxExcerptLength);

        // Parse the escaped text into tokens, which retain the positions in the text
        StandardAnalyzer analyser = new StandardAnalyzer();
        TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText));

        // Finally, do the highlighting!
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer);
        return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS);
    } catch (Exception e) {
        // Failures are logged and reported to the caller as null rather than rethrown.
        Logger.getLogger("com.oneis.app").info("Exception in SearchResultExcerptHighlighter: ", e);
        return null;

From source file:com.pongasoft.kiwidoc.index.impl.keyword.impl.KeywordIndexImpl.java

License:Apache License

/**
 * Highlights the provided results obtained using the provided query.
 *
 * @param query  the query from which the results were computed
 * @param models the models to highlight
 * @return a map representing for each entry in the model its associated resource and highlight
 * @throws MalformedQueryException if the query cannot be parsed
 * @throws InternalException if there is an internal problem
 */
public <R extends Resource> Map<R, String[]> highlightResults(KeywordQuery query, Collection<Model<R>> models)
        throws InternalException, MalformedQueryException {
    // NOTE(review): this listing is truncated - a statement is cut off and closing braces are missing.
    // LinkedHashMap keeps the entries in the iteration order of the models collection.
    Map<R, String[]> res = new LinkedHashMap<R, String[]>();

    Query parsedQuery = parseQuery(query);

    // A null parsed query yields an empty map.
    if (parsedQuery != null) {
        Highlighter highlighter = new Highlighter(_highlighterFormatter, HTML_ENCODER,
                new QueryScorer(parsedQuery));

        for (Model<R> model : models) {
            // Build a throwaway in-memory document holding only the body text so that a token
            // stream can be obtained for it (the field is analyzed but not stored).
            Document document = new Document();
            String bodyText = buildBody(model);
            document.add(new Field(DocumentFactory.BODY_FIELD, bodyText, Field.Store.NO, Field.Index.ANALYZED));
            TokenStream tokenStream = TokenSources.getTokenStream(document, DocumentFactory.BODY_FIELD,
            try {
                // At most two fragments are kept per resource.
                res.put(model.getResource(), highlighter.getBestFragments(tokenStream, bodyText, 2));
            } catch (IOException e) {
                // An I/O failure is logged and that model is simply left out of the result.
                log.warn("exception while computing highlight... [ignored]", e);

    return res;