Example usage for org.apache.lucene.search.highlight Highlighter Highlighter

List of usage examples for org.apache.lucene.search.highlight Highlighter Highlighter

Introduction

In this page you can find the example usage for org.apache.lucene.search.highlight Highlighter Highlighter.

Prototype

public Highlighter(Scorer fragmentScorer) 

Source Link

Usage

From source file:de.hsmannheim.ss15.alr.searchengine.DefaultLuceneController.java

/**
 * Runs {@code queryString} against the index and returns every matching stored
 * document, each augmented with a "highlight" field when a best fragment of the
 * original file content matches the query.
 *
 * @param queryString raw user query, parsed against the "contents" field
 * @return all matching documents (possibly empty)
 * @throws IOException    on index or file access errors
 * @throws ParseException if the query string cannot be parsed
 */
public List<StoredDocument> doSearch(String queryString) throws IOException, ParseException {
    final String field = "contents";

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(queryString);

        Highlighter highlighter = new Highlighter(new QueryScorer(query));

        // First pass: count the total hits so the second search can retrieve
        // every matching document instead of a fixed page size.
        TotalHitCountCollector collector = new TotalHitCountCollector();
        searcher.search(query, collector);
        TopDocs topDocs = searcher.search(query, Math.max(1, collector.getTotalHits()));

        List<StoredDocument> results = new ArrayList<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            StoredDocument doc = searcher.doc(scoreDoc.doc);
            try {
                File file = new File(doc.get("path"));
                List<String> lines = new ArrayList<>();
                // try-with-resources: the original leaked this reader.
                try (BufferedReader docReader = new BufferedReader(
                        new InputStreamReader(Files.newInputStream(file.toPath()), StandardCharsets.UTF_8))) {
                    while (docReader.ready()) {
                        lines.add(docReader.readLine());
                    }
                }
                // Drop up to three header lines preceding the body; guarded so
                // short files no longer raise IndexOutOfBoundsException.
                for (int i = 0; i < 3 && !lines.isEmpty(); i++) {
                    lines.remove(0);
                }

                StringBuilder content = new StringBuilder();
                for (String s : lines) {
                    content.append(s);
                }
                String highLight = highlighter.getBestFragment(analyzer, null, content.toString());
                if (highLight == null) {
                    LOGGER.warn("No Highlight found");
                } else {
                    doc.add(new TextField("highlight", highLight, Field.Store.YES));
                }
            } catch (InvalidTokenOffsetsException ex) {
                LOGGER.warn("No Highlight found");
            }

            results.add(doc);
        }
        return results;
    } finally {
        // Close the reader even when parsing or highlighting throws.
        reader.close();
    }
}

From source file:de.spartusch.nasfvi.server.NSearcher.java

License:Apache License

/**
 * Extracts a field's values from a document. This method is aware of
 * <i>collapsed</i> or <i>merged</i> fields and handles them properly. 
 * @param nquery NQuery used for searching
 * @param doc Document to extract the field's values from
 * @param field Name of the field to extract values for
 * @return Set of extracted values/*from w  w  w. ja  v  a2s  .co  m*/
 */
private Set<String> extractValues(final NQuery nquery, final Document doc, final String field) {
    Set<String> values = new HashSet<String>();

    if (NQuery.isFieldToCollapse(field)) {
        // process merged field
        String mfield = NQuery.getMergedField();
        QueryScorer scorer = new QueryScorer(nquery.getQuery(), mfield);
        Highlighter highlighter = new Highlighter(scorer);
        highlighter.setTextFragmenter(new NullFragmenter());

        try {
            Set<String> buffer = new HashSet<String>();

            for (Fieldable f : doc.getFieldables(mfield)) {
                String content = f.stringValue();
                String value = normalizeValue(NQuery.extractValue(field, content));

                // Test if the field was matched by the query
                TokenStream ts = TokenSources.getTokenStream(mfield, content, nquery.getAnalyzer());
                if (highlighter.getBestFragment(ts, content) != null) {
                    values.add(value);
                } else {
                    // Buffer the value - in case no field matches
                    buffer.add(value);
                }
            }

            if (values.isEmpty()) {
                // No field was matched by the query
                values.addAll(buffer);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (InvalidTokenOffsetsException e) {
            throw new RuntimeException(e);
        }
    } else {
        for (String v : doc.getValues(field)) {
            values.add(normalizeValue(v));
        }
    }

    return values;
}

From source file:docet.engine.SimpleDocetDocSearcher.java

License:Apache License

@Override
public List<DocetPage> searchForMatchingDocuments(final String searchText, final String lang,
        final int maxNumResults) throws DocetDocumentSearchException {
    // Searches the language-specific content field for searchText and returns
    // up to maxNumResults pages, each carrying highlighted excerpts and a
    // relevance score normalized against the best-scoring hit.
    final List<DocetPage> results = new ArrayList<>();
    // Prefer a configured fallback language (if any) over the requested one.
    final String fallbackLang = this.getFallbackLangForLang(lang);
    final String actualSearchLang;
    if (fallbackLang.isEmpty()) {
        actualSearchLang = lang;
    } else {
        actualSearchLang = fallbackLang;
    }
    try {
        final IndexSearcher searcher = new IndexSearcher(reader);
        final Analyzer analyzer = new AnalyzerBuilder().language(actualSearchLang).build();
        QueryParser queryParser = new QueryParser(LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang, analyzer);
        final Query query = queryParser.parse(constructLucenePhraseTermSearchQuery(searchText));
        final QueryScorer queryScorer = new QueryScorer(query, LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang);

        final Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        final Highlighter highlighter = new Highlighter(queryScorer);
        // Analyze the full document text when extracting fragments.
        highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
        highlighter.setTextFragmenter(fragmenter);

        final TopDocs res = searcher.search(query, maxNumResults);
        final float maxScore = res.getMaxScore();
        final List<ScoreDoc> scoreDocs = Arrays.asList(res.scoreDocs);
        // docs: document -> assembled excerpt text; scoresForDocs: doc id ->
        // its ScoreDoc, used later to compute the relevance percentage.
        Map<org.apache.lucene.document.Document, String> docs = new HashMap<>();
        Map<String, ScoreDoc> scoresForDocs = new HashMap<>();
        for (final ScoreDoc sd : scoreDocs) {
            final org.apache.lucene.document.Document doc = searcher.doc(sd.doc);
            final String contents = doc.get(LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang);
            final String docId = doc.get("id");
            final String[] fragments = highlighter.getBestFragments(analyzer,
                    LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang, contents, MAX_NUM_FRAGMENTS);
            List<String> fragmentList = Arrays.asList(fragments);
            // Per fragment: split into lines, drop blank lines, and rejoin the
            // remainder with the excerpt separator.
            fragmentList = fragmentList.stream().map(s1 -> s1.trim().split("\n"))
                    .map(s1 -> Arrays.asList(s1).stream().filter(s -> !s.trim().isEmpty())
                            .reduce((sa, sb) -> sa + MACHING_EXCERPTS_SEPARATOR + sb)
                            .orElse(MACHING_EXCERPTS_SEPARATOR))
                    .collect(Collectors.toList());
            // Join all non-empty fragments with "..." and wrap in separators.
            docs.put(doc,
                    MACHING_EXCERPTS_SEPARATOR
                            + fragmentList.stream().filter(s -> !s.isEmpty())
                                    .reduce((s1, s2) -> s1 + "..." + s2).orElse("")
                            + MACHING_EXCERPTS_SEPARATOR);
            scoresForDocs.putIfAbsent(docId, sd);
        }
        docs.entrySet().stream().forEach(e -> {
            // Relevance: this document's score relative to the best hit, 0-100.
            final int relevance = Math.round((scoresForDocs.get(e.getKey().get("id")).score / maxScore) * 100);
            results.add(DocetPage.toDocetDocument(e.getKey(), e.getValue(), relevance));
        });
        return results;
    } catch (ParseException | IOException | InvalidTokenOffsetsException ex) {
        throw new DocetDocumentSearchException(
                "Error on searching query " + searchText + " for lang " + actualSearchLang, ex);
    }
}

From source file:drakkar.mast.retrieval.LuceneContext.java

/**
 * Para la sumarizacin/*from  www.j a  v a 2s  .com*/
 *
 * @return
 */
private String getHighlighter(Query q, Analyzer a, String text, String field) {

    String summary = null;

    this.hg = new Highlighter(new QueryTermScorer(q));
    this.hg.setTextFragmenter(new SimpleFragmenter(20));
    this.hg.setMaxDocCharsToAnalyze(600);

    try {
        try {
            this.tokens = TokenSources.getTokenStream(field, text, a);
            summary = this.hg.getBestFragments(this.tokens, text, 20, "...");
            // summary = this.hg.getBestFragments(this.tokens, text, 10).toString();
        } catch (IOException ex) {
            OutputMonitor.printStream("IO", ex);
        }
    } catch (InvalidTokenOffsetsException ex) {
        OutputMonitor.printStream("", ex);
    }

    if (summary == null) {
        summary = " ";
    }
    return summary;
}

From source file:engine.easy.search.EasySearchEngine.java

License:Apache License

/**
 * Demo of highlighting documents that match a hard-coded phrase query
 * ("KENNEDY ADMINISTRATION" in the CONTENT field).
 *
 * <p>NOTE(review): the highlighting step itself appears unfinished - the
 * token stream and documents fetched in the loop are never consumed, and the
 * method always returns {@code null}.
 *
 * @return always {@code null}
 */
public String highlightedText() {

    try {
        Analyzer analyzer = new EasySearchAnalyzer();

        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("CONTENT", "KENNEDY"));
        phraseQuery.add(new Term("CONTENT", "ADMINISTRATION"));

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        try {
            Query query = getQuery(phraseQuery.toString());
            QueryScorer scorer = new QueryScorer(query, AppConstants.CONTENT_FIELD);
            Highlighter highlighter = new Highlighter(scorer);

            Set<Term> terms = new HashSet<Term>();
            query.extractTerms(terms);

            StringBuffer text = new StringBuffer("");

            for (Term term : terms) {
                TermDocs docs = indexReader.termDocs(term);

                while (docs.next()) {
                    Integer id = docs.doc();
                    Document document = indexReader.document(id);

                    // TODO(review): stream/document are created but never used;
                    // the best-fragment extraction was left commented out in
                    // the original and has been removed as dead code.
                    TokenStream stream = analyzer.tokenStream("FIELDNAME", new StringReader(text.toString()));
                }
            }
        } finally {
            // Fix: the reader and directory were previously leaked.
            indexReader.close();
            indexDir.close();
        }
    } catch (Exception e) {
        System.out.println("Exception: getResults " + e.toString());
    }

    return null;
}

From source file:iit.cs570.assign1.web.managedbean.BM25Searcher.java

@Override
public String getBestFragment(Document doc) throws IOException, InvalidTokenOffsetsException {
    // Highlight the current query's terms within the document's text field.
    final Highlighter highlighter = new Highlighter(new QueryScorer(query));
    final String text = doc.get(TheCrawlersConstants.TEXT);
    return highlighter.getBestFragment(new StandardAnalyzer(Version.LUCENE_42), TheCrawlersConstants.TEXT,
            text);
}

From source file:iit.cs570.assign1.web.managedbean.SIMBM25Searcher.java

public String getBestFragment(Document doc) throws IOException, InvalidTokenOffsetsException {
    // Score fragments of the document's text field against the active query
    // and return the best-matching snippet.
    Scorer fragmentScorer = new QueryScorer(query);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
    return new Highlighter(fragmentScorer).getBestFragment(analyzer, TheCrawlersConstants.TEXT,
            doc.get(TheCrawlersConstants.TEXT));
}

From source file:net.chwise.documents.HighlightedFragmentsRetriever.java

License:Open Source License

/**
 * Returns up to {@code fragmentNumber} fragments of {@code fieldContents}
 * with the query terms highlighted. When nothing matches, falls back to the
 * leading {@code fragmentSize} characters of the contents.
 */
public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String fieldName,
        String fieldContents, int fragmentNumber, int fragmentSize)
        throws IOException, InvalidTokenOffsetsException {

    // Score fragments against the query restricted to the given field.
    QueryScorer queryScorer = new QueryScorer(query, fieldName);

    Highlighter highlighter = new Highlighter(queryScorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer, fragmentSize));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); // analyze the whole contents

    TokenStream tokenStream = TokenSources.getTokenStream(fieldName, fieldContents, analyzer);
    String[] best = highlighter.getBestFragments(tokenStream, fieldContents, fragmentNumber);
    if (best.length > 0) {
        return best;
    }

    // No highlighted fragment: return the starting piece of the contents.
    return new String[] { fieldContents.substring(0, Math.min(fragmentSize, fieldContents.length())) };
}

From source file:net.paoding.analysis.TestPaodingAnalyzer.java

License:Apache License

@Test
public void testHighlighting() throws Exception {

    Analyzer a = new PaodingAnalyzer();
    QueryParser parser = new QueryParser(Version.LUCENE_46, "f", a);

    Query q = parser.parse("domnick");
    String txt = "Domnick Hunter 0.01m , ?OIL-X Plus";

    Highlighter highlighter = new Highlighter(new QueryScorer(q));
    String resp = highlighter.getBestFragment(a.tokenStream("f", txt), txt);

    // Fix: guard against resp == null, which previously caused an NPE in
    // resp.contains(...) instead of a clean assertion failure.
    assertTrue("no highlighted fragment produced", resp != null);
    assertTrue(resp + " is not correctly highlighted", resp.contains("<B>Domnick</B>"));

}

From source file:org.opencms.search.documents.CmsTermHighlighterHtml.java

License:Open Source License

/**
 * @see org.opencms.search.documents.I_CmsTermHighlighter#getExcerpt(org.apache.lucene.document.Document, org.opencms.search.CmsSearchIndex, org.opencms.search.CmsSearchParameters, org.apache.lucene.search.Query, org.apache.lucene.analysis.Analyzer)
 */
public String getExcerpt(Document doc, CmsSearchIndex index, CmsSearchParameters params, Query query,
        Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {

    // Defensive: excerpt generation is best-effort; bail out on any null input.
    if ((doc == null) || (index == null) || (params == null) || (analyzer == null) || (query == null)) {
        return null;
    }
    Highlighter highlighter = null;
    Iterator<String> excerptFieldNames = index.getFieldConfiguration().getExcerptFieldNames().iterator();
    // Fix: local-only buffer, so StringBuilder replaces synchronized StringBuffer.
    StringBuilder excerptBuffer = new StringBuilder();
    while (excerptFieldNames.hasNext()) {
        String fieldName = excerptFieldNames.next();
        boolean createExcerpt = !params.isExcerptOnlySearchedFields() || params.getFields().contains(fieldName);
        if (createExcerpt && (doc.getFieldable(fieldName) != null)) {
            // only generate field excerpt if the field is available in the document
            String text = doc.getFieldable(fieldName).stringValue();
            // make sure all XML in the text is escaped, otherwise excerpt HTML output may be garbled
            text = CmsEncoder.escapeXml(text);

            TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));

            if (params.isExcerptOnlySearchedFields()) {
                // highlight the search query only in the matching fields:
                // a fresh per-field highlighter scoped to this field
                highlighter = new Highlighter(new QueryScorer(query, fieldName));
            } else {
                // highlight search query in all fields: one shared highlighter
                if (highlighter == null) {
                    highlighter = new Highlighter(new QueryScorer(query));
                }
            }
            String fragment = highlighter.getBestFragments(stream, text, EXCERPT_REQUIRED_FRAGMENTS,
                    EXCERPT_FRAGMENT_SEPARATOR);

            // normalize all whitespace control chars in the excerpt to spaces
            fragment = fragment.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ').replace('\f', ' ');

            if (excerptBuffer.length() > 0) {
                // this is not the first fragment
                excerptBuffer.append(EXCERPT_FRAGMENT_SEPARATOR);
            }
            excerptBuffer.append(fragment);
        }
    }

    String result = null;
    if (excerptBuffer.length() > 0) {
        result = excerptBuffer.toString();
    }

    // Trim the combined excerpt to the configured maximum length.
    int maxLength = OpenCms.getSearchManager().getMaxExcerptLength();
    if ((result != null) && (result.length() > maxLength)) {
        result = result.substring(0, maxLength);
    }

    return result;
}