public final String getBestFragments(TokenStream tokenStream, String text, int maxNumFragments,
        String separator) throws IOException, InvalidTokenOffsetsException 

Highlights terms in the text, extracting the most relevant sections and concatenating the chosen fragments with a separator (typically "...").


From source file:aos.lucene.tools.HighlightIt.java

License:Apache License

public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        System.err.println("Usage: HighlightIt <filename-out>");
        String filename = args[0];

    String filename = args[0];

    String searchText = "term"; //
    QueryParser parser = new QueryParser(Version.LUCENE_46, //
            "f", //
            new StandardAnalyzer(Version.LUCENE_46));// #1
    Query query = parser.parse(searchText); //

    SimpleHTMLFormatter formatter = //
            new SimpleHTMLFormatter("<span class=\"highlight\">", //
                    "</span>"); //

    TokenStream tokens = new StandardAnalyzer(Version.LUCENE_46) //
            .tokenStream("f", new StringReader(text)); //

    QueryScorer scorer = new QueryScorer(query, "f"); //

    Highlighter highlighter = new Highlighter(formatter, scorer); //
    highlighter.setTextFragmenter( //
            new SimpleSpanFragmenter(scorer)); //

    String result = //
            highlighter.getBestFragments(tokens, text, 3, "..."); //

    FileWriter writer = new FileWriter(filename); //
    writer.write("<html>"); //
    writer.write("<style>\n" + //
            ".highlight {\n" + //
            " background: yellow;\n" + //
            "}\n" + //
            "</style>"); //
    writer.write("<body>"); //
    writer.write(result); //
    writer.write("</body></html>"); //
    writer.close(); //

From source file:blackbelt.lucene.testHighlight.MainHighlight.java

License:Open Source License

/**
 * Demo entry point: highlights the keyword "hibernate" in a hard-coded paragraph, prints the
 * length of the highlighted result, and prepares an HTML file for it.
 *
 * NOTE(review): this excerpt is incomplete - several closing braces and the statements that
 * would write to and close the PrintWriter are not visible here.
 *
 * @param args not used by the visible code
 * @throws ParseException if the keyword cannot be parsed into a query
 * @throws IOException declared by the method; presumably raised by the highlighting calls
 */
public static void main(String[] args) throws ParseException, IOException {

    String keyWord = "hibernate";
    // NOTE(review): "language" is not read anywhere in the visible code.
    String language = "en";
    // NOTE(review): some of the concatenated pieces join without a space
    // (e.g. "language," + "providing"), so words run together in the sample text.
    String text = "Hibernate is an object-relational mapping (ORM) library for the Java language,"
            + "providing a framework for mapping an object-oriented domain model to a traditional relational"
            + "database. Hibernate solves object-relational impedance mismatch problems by replacing direct "
            + "persistence-related database accesses with high-level object handling functions. "
            + "Hibernate is free software that is distributed under the GNU Lesser General Public License. "
            + "Hibernate's primary feature is mapping from Java classes to database tables "
            + "(and from Java data types to SQL data types). Hibernate also provides data query"
            + " and retrieval facilities. Hibernate generates the SQL calls and attempts to relieve"
            + " the developer from manual result set handling and object conversion and keep the application"
            + " portable to all supported SQL databases with little performance overhead.";
    String result;

    // The query and the token stream both use field "title" and a StandardAnalyzer.
    QueryParser parser = new QueryParser(Version.LUCENE_30, "title", new StandardAnalyzer(Version.LUCENE_30));
    Query query = parser.parse(keyWord);

    // Matching terms are wrapped in <b>...</b>.
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
    TokenStream tokens = new StandardAnalyzer(Version.LUCENE_30).tokenStream("title", new StringReader(text));

    QueryScorer scorer = new QueryScorer(query, "title");
    Highlighter highlighter = new Highlighter(formatter, scorer);
    // The second constructor argument (85) is presumably the target fragment size - confirm
    // against the SimpleSpanFragmenter API.
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 85));

    try {
        // Up to 4 fragments, joined by "<BR/>...".
        result = highlighter.getBestFragments(tokens, text, 4, "<BR/>...");
        System.out.println("\n" + result.length());
    } catch (InvalidTokenOffsetsException e) {
        // Surface offset problems as an unchecked exception, keeping the cause.
        throw new RuntimeException(e);
    // NOTE(review): the closing brace of the catch block above is missing from this excerpt.

    result = "<html><body>" + result + "</body></html>";
    // Hard-coded, machine-specific output path.
    File file = new File("C:\\Users\\forma702\\Desktop\\testHighlight.html");
    try {
        // NOTE(review): the writer is opened but no write or close is visible in this excerpt.
        PrintWriter pw = new PrintWriter(file);
    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block

From source file:ca.dracode.ais.indexer.FileSearcher.java

License:Open Source License

/**
 * Takes a list of Documents and highlights information relevant to a given Query
 * @param docs The documents to highlight
 * @param qry The query used to highlight the documents
 * @param type The type of the search, one of QUERY_BOOLEAN,
 *             which just notes the page on which the term exists or QUERY_STANDARD,
 *             which gives highlighted fragments and the page on which they exist.
 * @param term The term that created the query
 * @param maxResults The maximum number of results that will be returned
 * @return A SearchResult containing the results sorted by relevance and page
 */
private SearchResult getHighlightedResults(List<Document> docs, Query qry, int type, String term,
        int maxResults) {
    try {
        int numResults = 0;
        LinkedHashMap<String, LinkedHashMap<Integer, List<String>>> results = new LinkedHashMap<String, LinkedHashMap<Integer, List<String>>>();
        for (int i = 0; i < docs.size() && numResults < maxResults; i++) {
            Document d = docs.get(i);
            int docPage = Integer.parseInt(d.get("page"));
            String name = d.get("path");
            LinkedHashMap<Integer, List<String>> docResult = results.get(name);
            if (docResult == null) {
                docResult = new LinkedHashMap<Integer, List<String>>();
                results.put(name, docResult);
            if (type != FileSearcher.QUERY_BOOLEAN) {
                String contents = d.get("text");
                Highlighter highlighter = new Highlighter(new QueryScorer(qry));

                String[] frag = null;
                try {
                    frag = highlighter.getBestFragments(new SimpleAnalyzer(Version.LUCENE_47), "text", contents,
                            maxResults - numResults);
                    numResults += frag.length;
                } catch (IOException e) {
                    Log.e(TAG, "Error while reading index", e);
                } catch (InvalidTokenOffsetsException e) {
                    Log.e(TAG, "Error while highlighting", e);
                if (frag != null) {
                    Log.i(TAG, "Frags: " + frag.length + " " + frag + " " + frag[0]);
                ArrayList<String> tmpList = new ArrayList<String>(
                        Arrays.asList(frag != null ? frag : new String[0]));
                Log.i(TAG, "list " + tmpList.getClass().getName());
                docResult.put(docPage, tmpList);
            } else {
                ArrayList<String> tmp = new ArrayList<String>();
                docResult.put(docPage, tmp);

        Log.i(TAG, "" + results.size());
        return new SearchResult(results);
    } catch (Exception e) {
        Log.e("TAG", "Error while Highlighting", e);
        return null;

From source file:cn.hbu.cs.esearch.service.impl.EsearchSearchServiceImpl.java

License:Apache License

/**
 * Runs the query described by the request against the current index readers and returns the
 * timing and hits, with one highlighted fragment of the queried field per hit.
 *
 * NOTE(review): this excerpt is incomplete - several closing braces and statements (for
 * example the bodies of two try blocks and whatever adds each hit to the hit list) are not
 * visible here.
 *
 * @param sResquest the search request; its query, field, size and search type are read
 * @return the search result; for COUNT requests it is returned before any hits are collected
 * @throws EsearchException wrapping any exception raised while searching or highlighting
 */
public SearchResult search(SearchRequest sResquest) throws EsearchException {
    // NOTE(review): the body of this try block is missing from this excerpt; judging by the
    // log message it presumably flushed pending events - confirm.
    try {
    } catch (EsearchException e) {
        LOGGER.error("Esearch flush events error. \n{}", e);
    String queryString = sResquest.getQuery();
    String queryField = sResquest.getField();
    LOGGER.info("The search request coming: queryField:{},queryString:{}", queryField, queryString);

    Analyzer analyzer = esearchSystem.getAnalyzer();
    QueryParser queryParser = new QueryParser(Version.LUCENE_43, queryField, analyzer);
    SearchResult result = new SearchResult();

    List<EsearchMultiReader<R>> readers = null;
    MultiReader multiReader = null;
    IndexSearcher searcher = null;
    try {
        // A null or empty query string matches every document.
        Query query = null;
        if (Strings.isNullOrEmpty(queryString)) {
            query = new MatchAllDocsQuery();
        } else {
            query = queryParser.parse(queryString);
        // Search across all current readers through a single MultiReader.
        readers = esearchSystem.getIndexReaders();
        multiReader = new MultiReader(readers.toArray(new IndexReader[readers.size()]), false);
        searcher = new IndexSearcher(multiReader);
        // Time only the search call itself.
        long start = System.currentTimeMillis();
        TopDocs docs = searcher.search(query, null, sResquest.getSize());
        long end = System.currentTimeMillis();

        result.setTime(end - start);

        LOGGER.info("Got {} hits. Cost:{} ms", docs.totalHits, end - start);

        // COUNT requests return before any hit is loaded or highlighted.
        if (sResquest.getSearchType() == SearchRequest.SearchType.COUNT) {
            return result;

        ScoreDoc[] scoreDocs = docs.scoreDocs;
        ArrayList<SearchHit> hitList = new ArrayList<SearchHit>(scoreDocs.length);
        for (ScoreDoc scoreDoc : scoreDocs) {
            SearchHit hit = new SearchHit();
            int docID = scoreDoc.doc;

            Document doc = multiReader.document(docID);
            String content = doc.get(queryField);

            Scorer qs = new QueryScorer(query);

            // Matching terms are wrapped in <span class="hl">; only one fragment is requested.
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"hl\">", "</span>");
            Highlighter hl = new Highlighter(formatter, qs);
            String[] fragments = hl.getBestFragments(analyzer, queryField, content, 1);

            Map<String, String[]> fields = convert(doc, sResquest.getSearchType());
            fields.put("fragment", fragments);
        // NOTE(review): no statement populating "hit" or adding it to hitList is visible in
        // this excerpt.
        result.setHits(hitList.toArray(new SearchHit[hitList.size()]));
        return result;
    } catch (Exception e) {
        // Log, then rethrow as the service's own exception type with the cause attached.
        LOGGER.error(e.getMessage(), e);
        throw new EsearchException(e.getMessage(), e);
    } finally {
        if (multiReader != null) {
            // NOTE(review): the body of this try block is missing from this excerpt;
            // presumably it closed multiReader - confirm.
            try {
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);

From source file:com.appeligo.alerts.KeywordAlertThread.java

License:Apache License

/**
 * @param searchExecutor callback to get the set of hits for the given query. This can be
 * executed in different ways.
 * @return true if we hit too many consecutive exceptions so we broke out of the loop
 */
private boolean executeKeywordSearch(SearchExecutor searchExecutor, String messagePrefix,
        boolean groupQueries) {
    // Walks every keyword alert in normalized-query order, runs its search through
    // searchExecutor, and sends messages (with highlighted fragments) for new matches.
    // NOTE(review): this excerpt is incomplete - many closing braces and several statements
    // are not visible here; the notes below mark the spots.
    ChunkedResults<KeywordAlert> results = KeywordAlert.getAllInNormalizedQueryOrder();
    Hits hits = null;
    String lastNormalizedQuery = null;
    Query lastLuceneQuery = null;
    int consecutiveExceptions = 0;
    while (results.next() && isActive()) {
        KeywordAlert keywordAlert = results.get();
        try {
            if (keywordAlert.isDeleted() || keywordAlert.isDisabled()) {
                if (log.isDebugEnabled())
                    log.debug("keyword alert is deleted or disabled");
            // NOTE(review): no statement skipping this alert is visible here.
            User user = keywordAlert.getUser();
            if (user == null) {
                if (log.isDebugEnabled())
                    log.debug("keyword alert is implicitly deleted (user is null)");

            // NOTE(review): the body of this branch is missing from this excerpt.
            if (helper.maxAlertsExceeded(keywordAlert)) {

            if (groupQueries) {
                // Reuse the previous hits when consecutive alerts share a normalized query.
                if ((hits == null) || (!keywordAlert.getNormalizedQuery().equals(lastNormalizedQuery))) {
                    hits = searchExecutor.search(null, keywordAlert.getNormalizedQuery());
                    lastLuceneQuery = searchExecutor.getLuceneQuery();
                } else if (log.isDebugEnabled())
                    log.debug("Not searching on " + keywordAlert.getNormalizedQuery() + " again");
            } else {
                // NOTE(review): the remaining argument(s) of this call are missing from this
                // excerpt.
                hits = searchExecutor.search(keywordAlert.getUser().getLineupId(),
                // Note that I'm searching with the lineup from the user, which will
                // only ensure that the liveIndex doesn't return shows that don't ever
                // play for this lineup.  However, it does not guarantee that the show
                // on this user's lineup is playing at the same time (meaning alerts
                // might tell the user of a show that is only in the future).
                lastLuceneQuery = searchExecutor.getLuceneQuery();
            lastNormalizedQuery = keywordAlert.getNormalizedQuery();
            // The highlighter scores fragments against the Lucene query of the latest search.
            Highlighter highlighter = new Highlighter(new TermFormatter(), new QueryScorer(lastLuceneQuery));
            PorterStemAnalyzer analyzer = new PorterStemAnalyzer(LuceneIndexer.STOP_WORDS);

            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);

                // NOTE(review): the body of this branch is missing from this excerpt.
                if (!isActive()) {

                //                 if (groupQueries && (!"true".equals(doc.get("lineup-"+keywordAlert.getUser().getLineupId())))) {
                if (groupQueries
                        && (doc.get("lineup-" + keywordAlert.getUser().getLineupId() + "-startTime") == null)) {
                    // This "if" statement checks to make sure the program is or did play on the user's
                    // lineup, which might be on a different station, a different time, past or future.
                    if (log.isDebugEnabled())
                        log.debug(doc.get("programTitle") + " matched on " + keywordAlert.getNormalizedQuery()
                                + " but it isn't airing on this user's lineup anytime soon.");

                // Each candidate match is processed inside its own transaction.
                Transaction transaction = HibernateUtil.currentSession().beginTransaction();
                try {
                    if ((!helper.maxAlertsExceeded(keywordAlert)) && helper.isNewMatch(keywordAlert, doc)) {
                        if (log.isDebugEnabled())
                            log.debug("KeywordAlertThread found match in " + doc.get("programTitle") + " for "
                                    + keywordAlert.getNormalizedQuery() + "... sending messages");
                        String text = doc.get("text");
                        // fragments stays null when the document has no "text" field.
                        String fragments = null;
                        if (text != null) {
                            TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(text));
                            // Up to 3 fragments, joined by "...".
                            fragments = highlighter.getBestFragments(tokenStream, text, 3, "...");

                        helper.sendMessages(keywordAlert, fragments, doc, messagePrefix);
                    } else if (log.isDebugEnabled())
                        log.debug("KeywordAlertThread found match in " + doc.get("programTitle") + " for "
                                + keywordAlert.getNormalizedQuery()
                                + " but max exceeded or we already matched this one");
                } catch (Throwable t) {
                            // NOTE(review): the statement this string literal belongs to is
                            // missing from this excerpt.
                            "Error processing keyword alerts when searching live lucene index. Rolling back transaction.",
                } finally {
                    // NOTE(review): the body of this branch is missing from this excerpt.
                    if (!transaction.wasRolledBack()) {
            // Reaching this point means the alert was processed without a throwable.
            consecutiveExceptions = 0;
        } catch (Throwable t) {
            User user = keywordAlert.getUser();
            log.error("Caught throwable on keyword " + keywordAlert.getId() + ", " + keywordAlert.getUserQuery()
                    + ", user " + ((user == null) ? null : user.getUsername()), t);
            // NOTE(review): no increment of consecutiveExceptions is visible in this excerpt.
            if (consecutiveExceptions >= maxConsecutiveExceptions) {
                return true;
    return false;

From source file:com.appeligo.search.actions.SearchResults.java

License:Apache License

private void addDocument(Document doc, float score, EPGProvider epgProvider, Highlighter highlighter,
        Analyzer analyzer, ScheduledProgram next, ScheduledProgram last, Program programInfo)
        throws IOException {
    String text = doc.get("text");
    String fragments = null;/* ww w . ja  v  a2  s. c  o m*/
    if (text != null) {
        TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(text));
        fragments = highlighter.getBestFragments(tokenStream, text, 3, "...");
    SearchResult searchResult = new SearchResult(lineup, new DocumentWrapper(doc, score, fragments),
            programInfo, last, next);
    programToSearchResult.put(doc.get("programID"), searchResult);

From source file:com.bewsia.script.safe.lucene.SEntity.java

License:Open Source License

public String highlight(Query query, String text, String field, int fragmentSize, int maxNumFragments,
        String separator) throws Exception {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    CachingTokenFilter tokenStream = new CachingTokenFilter(
            analyzer.tokenStream(field, new StringReader(text)));
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
    Scorer scorer = new org.apache.lucene.search.highlight.QueryScorer(query);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(fragmentSize));
    tokenStream.reset();//from www.ja v  a2s .  c o m
    String rv = highlighter.getBestFragments(tokenStream, text, maxNumFragments, separator);
    return rv.length() == 0 ? text : rv;

From source file:com.bugull.mongo.lucene.BuguHighlighter.java

License:Apache License

public String getResult(String fieldName, String fieldValue) throws Exception {
    BuguIndex index = BuguIndex.getInstance();
    QueryParser parser = new QueryParser(index.getVersion(), fieldName, index.getAnalyzer());
    Query query = parser.parse(keywords);
    TokenStream tokens = index.getAnalyzer().tokenStream(fieldName, new StringReader(fieldValue));
    QueryScorer scorer = new QueryScorer(query, fieldName);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
    return highlighter.getBestFragments(tokens, fieldValue, maxFragments, "...");

From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java

License:Open Source License

/**
 * Highlights the terms of a Lucene query in a string, using the French analyzer and the
 * query rewritten against the index in indexDir.
 *
 * NOTE(review): this excerpt is incomplete - several closing braces, the end of the
 * getBestFragments call, and any statement installing the fragmenter or closing the
 * reader/directory are not visible here.
 *
 * @param strToHighlight the text to highlight
 * @param fieldName the field name used when tokenizing the text
 * @param luceneQuery the query whose terms are highlighted
 * @return the highlighted text
 */
public String highlight(String strToHighlight, String fieldName, Query luceneQuery) {
    String highlightedText;
    Analyzer analyzer = analyzerProvider.getAnalyzer(Locale.FRENCH);
    try {
        // The query is rewritten against the index before it is handed to the scorer.
        Directory directory = FSDirectory.open(indexDir);
        IndexReader indexReader = DirectoryReader.open(directory);
        Query rewrittenLuceneQuery = luceneQuery.rewrite(indexReader);
        QueryScorer luceneScorer = new QueryScorer(rewrittenLuceneQuery);
        SimpleHTMLFormatter luceneFormatter = new SimpleHTMLFormatter("<span class=\"hit\">", "</span>");
        Highlighter luceneHighlighter = new Highlighter(luceneFormatter, luceneScorer);

        Fragmenter luceneFragmenter;
        // If the string to highlight is longer than 250 characters
        if (strToHighlight.length() > TAILLE_CHAINE_NON_FRAGMENTEE) {
            // Create best fragments of 100 characters each
            luceneFragmenter = new SimpleFragmenter(TAILLE_FRAGMENT);
        } else {
            // The whole string is highlighted
            luceneFragmenter = new SimpleFragmenter(Integer.MAX_VALUE);
        // NOTE(review): luceneFragmenter is never passed to the highlighter in the visible code.

        TokenStream luceneTokenStream = analyzer.tokenStream(fieldName, new StringReader(strToHighlight));
        String fragment = null;
        if (strToHighlight.length() > TAILLE_CHAINE_NON_FRAGMENTEE) {
            // NOTE(review): the remaining argument(s) of this call are missing from this excerpt.
            fragment = luceneHighlighter.getBestFragments(luceneTokenStream, strToHighlight, NB_BEST_FRAGMENT,
        } else {
            fragment = luceneHighlighter.getBestFragment(luceneTokenStream, strToHighlight);

        // For the "titre" field, fall back to the raw text when nothing was highlighted.
        if (StringUtils.isBlank(fragment) && fieldName.equalsIgnoreCase("titre")) {
            fragment = strToHighlight;

        highlightedText = fragment;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (InvalidTokenOffsetsException e) {
        throw new RuntimeException(e);
    return highlightedText;

From source file:com.edgenius.wiki.search.service.AbstractSearchService.java

License:Open Source License

/**
 * Match all given name-value pairs, return combined fragment. For example, if spaceUname and space desc have matched
 * fragments, then these 2 pieces are merged into one String fragment and returned.
 * @param namedValues
 * @return
 * @throws IOException
 */
private String createFragment(Highlighter hl, String content) throws IOException {
    if (content == null)
        return "";

    if (hl == null)
        return content;

    TokenStream tokenStream = searcherFactory.getAnalyzer().tokenStream(FieldName.CONTENT,
            new StringReader(content));
    String frag;
    try {
        frag = hl.getBestFragments(tokenStream, content, 3, "...");
    } catch (InvalidTokenOffsetsException e) {
        log.error("Highlight fragment error", e);
        frag = StringUtils.abbreviate(content, FRAGMENT_LEN);

    return frag;