Example usage for org.apache.lucene.search.highlight Highlighter getBestFragments

Introduction

On this page you can find example usages of org.apache.lucene.search.highlight Highlighter getBestFragments, collected from open-source projects.

Prototype

public final String[] getBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException 

Document

Highlights chosen terms in a text, extracting the most relevant sections.
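Before the project-specific examples below, here is a minimal, self-contained sketch of a typical call. It assumes a recent Lucene (5.x or later); the field name "contents", the sample text, and the query string are hypothetical stand-ins:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class GetBestFragmentsSketch {
    public static void main(String[] args) throws Exception {
        String text = "Lucene is a full-text search library. Lucene highlights matched terms.";

        Analyzer analyzer = new StandardAnalyzer();
        Query query = new QueryParser("contents", analyzer).parse("lucene");

        // QueryScorer selects the query terms worth highlighting;
        // SimpleHTMLFormatter wraps each hit in <B>...</B> by default
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));

        // Re-analyze the plain text so token offsets line up with it
        TokenStream tokenStream = analyzer.tokenStream("contents", text);

        // Extract up to 3 of the best-scoring fragments
        for (String fragment : highlighter.getBestFragments(tokenStream, text, 3)) {
            System.out.println(fragment);
        }
    }
}

Note that the TokenStream must come from the same analysis chain that indexed (or stored) the text, otherwise the reported offsets will not line up with the plain text being highlighted.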

Usage

From source file:ca.uhn.fhir.jpa.dao.FhirSearchDao.java

License:Apache License

@Override
public List<Suggestion> suggestKeywords(String theContext, String theSearchParam, String theText) {
    Validate.notBlank(theContext, "theContext must be provided");
    Validate.notBlank(theSearchParam, "theSearchParam must be provided");
    Validate.notBlank(theText, "theText must be provided");

    long start = System.currentTimeMillis();

    String[] contextParts = StringUtils.split(theContext, '/');
    if (contextParts.length != 3 || !"Patient".equals(contextParts[0])
            || !"$everything".equals(contextParts[2])) {
        throw new InvalidRequestException("Invalid context: " + theContext);
    }
    IdDt contextId = new IdDt(contextParts[0], contextParts[1]);
    Long pid = BaseHapiFhirDao.translateForcedIdToPid(contextId, myEntityManager);

    FullTextEntityManager em = org.hibernate.search.jpa.Search.getFullTextEntityManager(myEntityManager);

    QueryBuilder qb = em.getSearchFactory().buildQueryBuilder().forEntity(ResourceTable.class).get();

    //@formatter:off
    Query textQuery = qb.phrase().withSlop(2).onField("myContentText").boostedTo(4.0f)
            .andField("myContentTextEdgeNGram").boostedTo(2.0f).andField("myContentTextNGram").boostedTo(1.0f)
            .andField("myContentTextPhonetic").boostedTo(0.5f).sentence(theText.toLowerCase()).createQuery();

    Query query = qb.bool()
            .must(qb.keyword().onField("myResourceLinks.myTargetResourcePid").matching(pid).createQuery())
            .must(textQuery).createQuery();
    //@formatter:on

    FullTextQuery ftq = em.createFullTextQuery(query, ResourceTable.class);
    ftq.setProjection("myContentText");
    ftq.setMaxResults(20);

    List<?> resultList = ftq.getResultList();
    List<Suggestion> suggestions = Lists.newArrayList();
    for (Object next : resultList) {
        Object[] nextAsArray = (Object[]) next;
        String nextValue = (String) nextAsArray[0];

        try {
            MySuggestionFormatter formatter = new MySuggestionFormatter(theText, suggestions);
            Scorer scorer = new QueryScorer(textQuery);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            Analyzer analyzer = em.getSearchFactory().getAnalyzer(ResourceTable.class);

            formatter.setAnalyzer("myContentTextPhonetic");
            highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue,
                    10);

            formatter.setAnalyzer("myContentTextNGram");
            highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);

            formatter.setFindPhrasesWith();
            formatter.setAnalyzer("myContentTextEdgeNGram");
            highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue,
                    10);

            // formatter.setAnalyzer("myContentText");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentText", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextNGram");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextEdgeNGram");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextPhonetic");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue, 10);
        } catch (Exception e) {
            throw new InternalErrorException(e);
        }

    }

    Collections.sort(suggestions);

    Set<String> terms = Sets.newHashSet();
    for (Iterator<Suggestion> iter = suggestions.iterator(); iter.hasNext();) {
        String nextTerm = iter.next().getTerm().toLowerCase();
        if (!terms.add(nextTerm)) {
            iter.remove();
        }
    }

    long delay = System.currentTimeMillis() - start;
    ourLog.info("Provided {} suggestions for term {} in {} ms", new Object[] { terms.size(), theText, delay });

    return suggestions;
}

From source file:ca.uhn.fhir.jpa.dao.FulltextSearchSvcImpl.java

License:Apache License

@Override
public List<Suggestion> suggestKeywords(String theContext, String theSearchParam, String theText) {
    Validate.notBlank(theContext, "theContext must be provided");
    Validate.notBlank(theSearchParam, "theSearchParam must be provided");
    Validate.notBlank(theText, "theText must be provided");

    long start = System.currentTimeMillis();

    String[] contextParts = StringUtils.split(theContext, '/');
    if (contextParts.length != 3 || !"Patient".equals(contextParts[0])
            || !"$everything".equals(contextParts[2])) {
        throw new InvalidRequestException("Invalid context: " + theContext);
    }
    Long pid = BaseHapiFhirDao.translateForcedIdToPid(contextParts[0], contextParts[1], myForcedIdDao);

    FullTextEntityManager em = org.hibernate.search.jpa.Search.getFullTextEntityManager(myEntityManager);

    QueryBuilder qb = em.getSearchFactory().buildQueryBuilder().forEntity(ResourceTable.class).get();

    //@formatter:off
    Query textQuery = qb.phrase().withSlop(2).onField("myContentText").boostedTo(4.0f)
            .andField("myContentTextEdgeNGram").boostedTo(2.0f).andField("myContentTextNGram").boostedTo(1.0f)
            .andField("myContentTextPhonetic").boostedTo(0.5f).sentence(theText.toLowerCase()).createQuery();

    Query query = qb.bool()
            .must(qb.keyword().onField("myResourceLinks.myTargetResourcePid").matching(pid).createQuery())
            .must(textQuery).createQuery();
    //@formatter:on

    FullTextQuery ftq = em.createFullTextQuery(query, ResourceTable.class);
    ftq.setProjection("myContentText");
    ftq.setMaxResults(20);

    List<?> resultList = ftq.getResultList();
    List<Suggestion> suggestions = Lists.newArrayList();
    for (Object next : resultList) {
        Object[] nextAsArray = (Object[]) next;
        String nextValue = (String) nextAsArray[0];

        try {
            MySuggestionFormatter formatter = new MySuggestionFormatter(theText, suggestions);
            Scorer scorer = new QueryScorer(textQuery);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            Analyzer analyzer = em.getSearchFactory().getAnalyzer(ResourceTable.class);

            formatter.setAnalyzer("myContentTextPhonetic");
            highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue,
                    10);

            formatter.setAnalyzer("myContentTextNGram");
            highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);

            formatter.setFindPhrasesWith();
            formatter.setAnalyzer("myContentTextEdgeNGram");
            highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue,
                    10);

            // formatter.setAnalyzer("myContentText");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentText", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextNGram");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextEdgeNGram");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue, 10);
            // formatter.setAnalyzer("myContentTextPhonetic");
            // highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue, 10);
        } catch (Exception e) {
            throw new InternalErrorException(e);
        }

    }

    Collections.sort(suggestions);

    Set<String> terms = Sets.newHashSet();
    for (Iterator<Suggestion> iter = suggestions.iterator(); iter.hasNext();) {
        String nextTerm = iter.next().getTerm().toLowerCase();
        if (!terms.add(nextTerm)) {
            iter.remove();
        }
    }

    long delay = System.currentTimeMillis() - start;
    ourLog.info("Provided {} suggestions for term {} in {} ms", new Object[] { terms.size(), theText, delay });

    return suggestions;
}

From source file:com.bluedragon.search.search.QueryRun.java

License:Open Source License

private void addRow(IndexSearcher searcher, int docid, float score, int rank, int searchCount,
        int recordsSearched) throws CorruptIndexException, Exception {
    DocumentWrap document = new DocumentWrap(searcher.doc(docid));

    queryResultData.addRow(1);
    queryResultData.setCurrentRow(queryResultData.getSize());

    // Add in the standard columns that we know we have for every search
    queryResultData.setCell(1, new cfStringData(document.getId()));
    queryResultData.setCell(2, new cfStringData(document.getName()));
    queryResultData.setCell(3, new cfNumberData(score));
    queryResultData.setCell(4, new cfNumberData(searchCount));
    queryResultData.setCell(5, new cfNumberData(recordsSearched));
    queryResultData.setCell(6, new cfNumberData(rank + 1));

    String uC = queryAttributes.getUniqueColumn();

    // Now we do the custom ones
    List<IndexableField> fields = document.getDocument().getFields();
    Iterator<IndexableField> it = fields.iterator();
    while (it.hasNext()) {
        IndexableField fieldable = it.next();

        String fieldName = fieldable.name().toLowerCase();

        // Check for the unique
        if (uniqueSet != null && fieldName.equals(uC)) {
            if (uniqueSet.contains(fieldable.stringValue())) {
                queryResultData.deleteRow(queryResultData.getSize());
                return;
            } else
                uniqueSet.add(fieldable.stringValue());
        }

        // Check to see if we have this column
        if (fieldName.equals("contents") && !queryAttributes.getContentFlag())
            continue;

        if (!activeColumns.containsKey(fieldName)) {
            int newcolumn = queryResultData.addColumnData(fieldable.name().toUpperCase(),
                    cfArrayData.createArray(1), null);
            activeColumns.put(fieldName, newcolumn);
        }

        int column = activeColumns.get(fieldName);
        if (column <= 6)
            continue;

        queryResultData.setCell(column, new cfStringData(fieldable.stringValue()));
    }

    // Do the context stuff if enabled
    if (queryAttributes.getContextPassages() > 0) {

        Scorer scorer = new QueryScorer(queryAttributes.getQuery());
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(queryAttributes.getContextHighlightStart(),
                queryAttributes.getContextHighlightEnd());
        Highlighter highlighter = new Highlighter(formatter, scorer);
        Fragmenter fragmenter = new SimpleFragmenter(queryAttributes.getContextBytes());
        highlighter.setTextFragmenter(fragmenter);

        String nextContext = "";
        String contents = document.getAttribute(DocumentWrap.CONTENTS);

        if (contents != null) {
            TokenStream tokenStream = AnalyzerFactory.get("simple").tokenStream(DocumentWrap.CONTENTS,
                    new StringReader(contents));
            String[] fragments = null;
            try {
                fragments = highlighter.getBestFragments(tokenStream, contents,
                        queryAttributes.getContextPassages());
                if (fragments.length == 1) {
                    nextContext = fragments[0] + "...";
                } else {
                    StringBuilder context = new StringBuilder();
                    for (int f = 0; f < fragments.length; f++) {
                        context.append("...");
                        context.append(fragments[f]);
                    }
                    context.append("...");
                    nextContext = context.toString();
                }
            } catch (Exception e) {
                // ignore highlighting failures and leave the context empty
            }

            // Add in the context
            if (!activeColumns.containsKey("context")) {
                int newcolumn = queryResultData.addColumnData("CONTEXT", cfArrayData.createArray(1), null);
                activeColumns.put("context", newcolumn);
            }

            queryResultData.setCell(activeColumns.get("context"), new cfStringData(nextContext));
        }
    }
}

From source file:com.difference.historybook.index.lucene.LuceneIndex.java

License:Apache License

@Override
public SearchResultWrapper search(String collection, String query, int offset, int size, boolean includeDebug)
        throws IndexException {
    try {
        //TODO: make age be a component in the ranking?
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        queryBuilder.add(parser.parse(query), Occur.MUST);
        queryBuilder.add(new TermQuery(new Term(IndexDocumentAdapter.FIELD_COLLECTION, collection)),
                Occur.FILTER);
        Query baseQuery = queryBuilder.build();

        FunctionQuery boostQuery = new FunctionQuery(
                new ReciprocalFloatFunction(new DurationValueSource(new Date().getTime() / 1000,
                        new LongFieldSource(IndexDocumentAdapter.FIELD_TIMESTAMP)), RECIP, 1F, 1F));

        Query q = new CustomScoreQuery(baseQuery, boostQuery);

        QueryScorer queryScorer = new QueryScorer(q, IndexDocumentAdapter.FIELD_SEARCH);
        Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        Highlighter highlighter = new Highlighter(queryScorer);
        highlighter.setTextFragmenter(fragmenter);

        GroupingSearch gsearch = new GroupingSearch(IndexDocumentAdapter.FIELD_URL_GROUP).setGroupDocsLimit(1)
                .setAllGroups(true).setIncludeMaxScore(true);
        TopGroups<?> groups = gsearch.search(searcher, q, offset, size);

        ArrayList<SearchResult> results = new ArrayList<>(size);
        for (int i = offset; i < offset + size && i < groups.groups.length; i++) {
            ScoreDoc scoreDoc = groups.groups[i].scoreDocs[0];
            Document luceneDoc = searcher.doc(scoreDoc.doc);
            IndexDocumentAdapter doc = new IndexDocumentAdapter(luceneDoc);

            TokenStream tokenStream = TokenSources.getTokenStream(IndexDocumentAdapter.FIELD_SEARCH,
                    reader.getTermVectors(scoreDoc.doc), luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH),
                    analyzer, highlighter.getMaxDocCharsToAnalyze() - 1);

            String[] snippets = highlighter.getBestFragments(tokenStream,
                    luceneDoc.get(IndexDocumentAdapter.FIELD_SEARCH), 3);
            String snippet = Arrays.asList(snippets).stream().collect(Collectors.joining("\n"));
            snippet = Jsoup.clean(snippet, Whitelist.simpleText());

            String debugInfo = null;
            if (includeDebug) {
                Explanation explanation = searcher.explain(q, scoreDoc.doc);
                debugInfo = explanation.toString();
            }

            results.add(new SearchResult(doc.getKey(), doc.getCollection(), doc.getTitle(), doc.getUrl(),
                    doc.getDomain(), doc.getTimestampText(), snippet, debugInfo, scoreDoc.score));
        }

        SearchResultWrapper wrapper = new SearchResultWrapper().setQuery(query).setOffset(offset)
                .setMaxResultsRequested(size)
                .setResultCount(groups.totalGroupCount != null ? groups.totalGroupCount : 0)
                .setResults(results);

        if (includeDebug) {
            wrapper.setDebugInfo(q.toString());
        }

        return wrapper;

    } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
        LOG.error(e.getLocalizedMessage());
        throw new IndexException(e);
    }
}

From source file:com.ecyrd.jspwiki.search.LuceneSearchProvider.java

License:Apache License

/**
 *  Searches pages using a particular combination of flags.
 *
 *  @param query The query to perform in Lucene query language
 *  @param flags A set of flags
 *  @return A Collection of SearchResult instances
 *  @throws ProviderException if there is a problem with the backend
 */
public Collection findPages(String query, int flags) throws ProviderException {
    Searcher searcher = null;
    ArrayList<SearchResult> list = null;
    Highlighter highlighter = null;

    try {
        String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS };
        QueryParser qp = new MultiFieldQueryParser(queryfields, getLuceneAnalyzer());

        //QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
        Query luceneQuery = qp.parse(query);

        if ((flags & FLAG_CONTEXTS) != 0) {
            highlighter = new Highlighter(new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                    new SimpleHTMLEncoder(), new QueryScorer(luceneQuery));
        }

        try {
            searcher = new IndexSearcher(m_luceneDirectory);
        } catch (Exception ex) {
            log.info("Lucene not yet ready; indexing not started", ex);
            return null;
        }

        Hits hits = searcher.search(luceneQuery);

        list = new ArrayList<SearchResult>(hits.length());
        for (int curr = 0; curr < hits.length(); curr++) {
            Document doc = hits.doc(curr);
            String pageName = doc.get(LUCENE_ID);
            WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

            if (page != null) {
                if (page instanceof Attachment) {
                    // Currently attachments don't look nice on the search-results page
                    // When the search-results are cleaned up this can be enabled again.
                }

                int score = (int) (hits.score(curr) * 100);

                // Get highlighted search contexts
                String text = doc.get(LUCENE_PAGE_CONTENTS);

                String[] fragments = new String[0];
                if (text != null && highlighter != null) {
                    TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS,
                            new StringReader(text));
                    fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);

                }

                SearchResult result = new SearchResultImpl(page, score, fragments);
                list.add(result);
            } else {
                log.error("Lucene found a result page '" + pageName
                        + "' that could not be loaded, removing from Lucene cache");
                pageRemoved(new WikiPage(m_engine, pageName));
            }
        }
    } catch (IOException e) {
        log.error("Failed during lucene search", e);
    } catch (InstantiationException e) {
        log.error("Unable to get a Lucene analyzer", e);
    } catch (IllegalAccessException e) {
        log.error("Unable to get a Lucene analyzer", e);
    } catch (ClassNotFoundException e) {
        log.error("Specified Lucene analyzer does not exist", e);
    } catch (ParseException e) {
        log.info("Broken query; cannot parse", e);

        throw new ProviderException("You have entered a query Lucene cannot process: " + e.getMessage());
    } finally {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                // ignore failures while closing the searcher
            }
        }
    }

    return list;
}

From source file:com.gauronit.tagmata.core.Indexer.java

License:Open Source License

public ArrayList<CardSnapshot> search(String searchText, ArrayList<String> indexNames, boolean searchInTitle,
        boolean searchInTags, boolean searchInText, boolean superFuzzy) {
    ArrayList<CardSnapshot> cardSnaps = new ArrayList<CardSnapshot>();
    try {
        ArrayList<IndexSearcher> searchers = new ArrayList<IndexSearcher>();

        for (String indexName : indexNames) {
            IndexReader reader = IndexReader
                    .open(FSDirectory.open(new File(indexDir + File.separator + indexName),
                            new SimpleFSLockFactory(indexDir + File.separator + indexName)));
            IndexSearcher searcher = new IndexSearcher(reader);
            searchers.add(searcher);
        }

        BooleanQuery query = new BooleanQuery();
        if (searchInTitle) {
            IndexerUtil.getTokenizedQuery(query, "title", searchText, superFuzzy);
        }
        if (searchInTags) {
            IndexerUtil.getTokenizedQuery(query, "tags", searchText, superFuzzy);
        }
        if (searchInText) {
            IndexerUtil.getTokenizedQuery(query, "text", searchText, superFuzzy);
            IndexerUtil.getTokenizedQuery(query, "analyzedText", searchText, superFuzzy);
        }

        for (IndexSearcher searcher : searchers) {
            TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            for (ScoreDoc hit : hits) {
                Document doc = searcher.doc(hit.doc);

                TokenStream stream = TokenSources.getTokenStream("text", doc.get("analyzedText"),
                        new StandardAnalyzer(Version.LUCENE_35));
                QueryScorer scorer = new QueryScorer(query, "analyzedText");
                Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 20);
                Highlighter highlighter = new Highlighter(scorer);
                highlighter.setTextFragmenter(fragmenter);
                String[] fragments = highlighter.getBestFragments(stream, doc.get("text"), 5);
                String highlights = "";

                for (String fragment : fragments) {
                    highlights += fragment + "...";
                }

                if (highlights.equals("")) {
                    String text = doc.get("text");
                    if (text.length() > 100) {
                        highlights += doc.get("text").substring(0, 100);
                    } else {
                        highlights += doc.get("text");
                    }
                }

                cardSnaps.add(new CardSnapshot(highlights, doc));
            }
            searcher.getIndexReader().close();
            searcher.close();
            searcher = null;
        }

    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return cardSnaps;
}

From source file:com.main.Searcher.java

public List<Bean> searching(String s1, String s2, String radioBtn)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    //Get a reference to the index directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));

    //IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);

    IndexSearcher searcher = new IndexSearcher(reader);
    //analyzer with the default stop word set; it removes stop words during analysis
    Analyzer analyzer = new StandardAnalyzer();

    String contents = "contents";

    QueryParser parser = new QueryParser(contents, analyzer);

    int numOfDoc = reader.numDocs();

    for (int i = 0; i < numOfDoc; i++) {

        Document d = reader.document(i);

    }

    Query q1 = parser.parse(s1);
    Query q2 = parser.parse(s2);

    //conjunction, disjunction and negation
    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    //occur.must : both queries required in a doc
    if (radioBtn.equals("conjunction")) {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST);
        bq.build();
    } //occur.should: at least one of the queries should be present in the doc
    else if (radioBtn.equals("disjunction")) {
        bq.add(q1, BooleanClause.Occur.SHOULD);
        bq.add(q2, BooleanClause.Occur.SHOULD);
        bq.build();
    } //negation: the first must be present, the second must not
    else {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST_NOT);
        bq.build();
    }

    TopDocs hits = searcher.search(bq.build(), 10);

    Formatter formatter = new SimpleHTMLFormatter();

    QueryScorer scorer = new QueryScorer(bq.build());

    //used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //SimpleSpanFragmenter breaks the content into same-size fragments without splitting up spans
    //(a plain SimpleFragmenter would ignore span and sentence boundaries entirely)
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);

    //set fragmenter to highlighter
    highlighter.setTextFragmenter(fragmenter);

    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();

        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);

        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);

        String path = doc.get("path");
        bean.setPath(path);

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted content fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {

            dummy.add(frag);
        }

        bean.setContent(dummy);
        beanList.add(bean);
    }

    dir.close();
    return beanList;
}

From source file:com.main.Searcher.java

public List<Bean> searching(String s1) throws IOException, ParseException, InvalidTokenOffsetsException {
    //Get directory reference
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    //IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);
    //IndexSearcher - the Lucene searcher; it searches over a single IndexReader
    IndexSearcher searcher = new IndexSearcher(reader);
    //analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer();
    //Query parser to be used for creating TermQuery

    String queries = null;
    String queryString = null; //regular search
    String contents = "contents";
    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(contents, analyzer);

    int numOfDoc = reader.numDocs();

    for (int i = 0; i < numOfDoc; i++) {

        Document d = reader.document(i);

    }

    Query q1 = parser.parse(s1);

    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    bq.add(q1, BooleanClause.Occur.MUST);
    //Search the lucene documents
    TopDocs hits = searcher.search(bq.build(), 10);
    // TopScoreDocCollector collector = TopScoreDocCollector.create(5);
    /**
     * Highlighter Code Start ***
     */
    //Uses HTML <B></B> tags to highlight the searched terms
    Formatter formatter = new SimpleHTMLFormatter();
    //It scores content fragments by the number of unique query terms found
    //Basically the matching score in layman terms
    QueryScorer scorer = new QueryScorer(bq.build());
    //used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //SimpleSpanFragmenter breaks the content into same-size fragments without splitting up spans
    //(a plain SimpleFragmenter would ignore span and sentence boundaries entirely)
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);

    //set fragmenter to highlighter
    highlighter.setTextFragmenter(fragmenter);
    //Iterate over found results
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        //int rank = hits.scoreDocs.length;
        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);
        // String title = doc.get("title");
        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);

        String path = doc.get("path");
        bean.setPath(path);

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted content fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {

            dummy.add(frag);
        }

        bean.setContent(dummy);
        beanList.add(bean);
    }

    dir.close();
    return beanList;
}

From source file:com.oneis.app.SearchResultExcerptHighlighter.java

License:Mozilla Public License

static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) {
    try {
        // Scorer selects the terms which need highlighting. Created from a 'query' based on the extracted search terms.
        Scorer scorer;
        Fragmenter fragmenter;
        if (searchTerms != null && searchTerms.length() > 0) {
            QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer());
            Query query = queryParser.parse(searchTerms);
            scorer = new QueryScorer(query);
            fragmenter = new SimpleSpanFragmenter((QueryScorer) scorer, maxExcerptLength);
        } else {
            scorer = new NoHighlightingScorer();
            fragmenter = new SimpleFragmenter(maxExcerptLength);
        }

        // Parse the escaped text into tokens, which retain the positions in the text
        StandardAnalyzer analyser = new StandardAnalyzer();
        TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText));

        // Finally, do the highlighting!
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS);
    } catch (Exception e) {
        Logger.getLogger("com.oneis.app").info("Exception in SearchResultExcerptHighlighter: ", e);
        return null;
    }
}

From source file:com.pongasoft.kiwidoc.index.impl.keyword.impl.KeywordIndexImpl.java

License:Apache License

/**
 * Highlights the provided results obtained using the provided query.
 *
 * @param query  the query from which the results were computed
 * @param models the models to highlight
 * @return a map representing for each entry in the model its associated resource and highlight
 * @throws MalformedQueryException if the query cannot be parsed
 * @throws InternalException if there is an internal problem
 */
public <R extends Resource> Map<R, String[]> highlightResults(KeywordQuery query, Collection<Model<R>> models)
        throws InternalException, MalformedQueryException {
    Map<R, String[]> res = new LinkedHashMap<R, String[]>();

    Query parsedQuery = parseQuery(query);

    if (parsedQuery != null) {
        Highlighter highlighter = new Highlighter(_highlighterFormatter, HTML_ENCODER,
                new QueryScorer(parsedQuery));

        for (Model<R> model : models) {
            Document document = new Document();
            String bodyText = buildBody(model);
            document.add(new Field(DocumentFactory.BODY_FIELD, bodyText, Field.Store.NO, Field.Index.ANALYZED));
            TokenStream tokenStream = TokenSources.getTokenStream(document, DocumentFactory.BODY_FIELD,
                    _analyzer);
            try {
                res.put(model.getResource(), highlighter.getBestFragments(tokenStream, bodyText, 2));
            } catch (IOException e) {
                log.warn("exception while computing highlight... [ignored]", e);
            }
        }
    }

    return res;
}