Example usage for org.apache.lucene.search.highlight Highlighter Highlighter

List of usage examples for org.apache.lucene.search.highlight Highlighter Highlighter

Introduction

In this page you can find the example usage for org.apache.lucene.search.highlight Highlighter Highlighter.

Prototype

public Highlighter(Scorer fragmentScorer) 

Source Link

Usage

From source file:de.hsmannheim.ss15.alr.searchengine.DefaultLuceneController.java

/**
 * Runs {@code queryString} against the index and returns every matching stored
 * document, each augmented with a "highlight" field when a best fragment of the
 * original file content matches the query.
 *
 * @param queryString raw user query, parsed against the "contents" field
 * @return all matching documents (possibly empty)
 * @throws IOException    on index or file access errors
 * @throws ParseException if the query string cannot be parsed
 */
public List<StoredDocument> doSearch(String queryString) throws IOException, ParseException {
    final String field = "contents";

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(queryString);

        Highlighter highlighter = new Highlighter(new QueryScorer(query));

        // First pass: count the total hits so the second search can retrieve
        // every matching document instead of a fixed page size.
        TotalHitCountCollector collector = new TotalHitCountCollector();
        searcher.search(query, collector);
        TopDocs topDocs = searcher.search(query, Math.max(1, collector.getTotalHits()));

        List<StoredDocument> results = new ArrayList<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            StoredDocument doc = searcher.doc(scoreDoc.doc);
            try {
                File file = new File(doc.get("path"));
                List<String> lines = new ArrayList<>();
                // try-with-resources: the original leaked this reader.
                try (BufferedReader docReader = new BufferedReader(
                        new InputStreamReader(Files.newInputStream(file.toPath()), StandardCharsets.UTF_8))) {
                    while (docReader.ready()) {
                        lines.add(docReader.readLine());
                    }
                }
                // Drop up to three header lines preceding the body; guarded so
                // short files no longer raise IndexOutOfBoundsException.
                for (int i = 0; i < 3 && !lines.isEmpty(); i++) {
                    lines.remove(0);
                }

                StringBuilder content = new StringBuilder();
                for (String s : lines) {
                    content.append(s);
                }
                String highLight = highlighter.getBestFragment(analyzer, null, content.toString());
                if (highLight == null) {
                    LOGGER.warn("No Highlight found");
                } else {
                    doc.add(new TextField("highlight", highLight, Field.Store.YES));
                }
            } catch (InvalidTokenOffsetsException ex) {
                LOGGER.warn("No Highlight found");
            }

            results.add(doc);
        }
        return results;
    } finally {
        // Close the reader even when parsing or highlighting throws.
        reader.close();
    }
}

From source file:de.spartusch.nasfvi.server.NSearcher.java

License:Apache License

/**
 * Extracts a field's values from a document. This method is aware of
 * <i>collapsed</i> or <i>merged</i> fields and handles them properly. 
 * @param nquery NQuery used for searching
 * @param doc Document to extract the field's values from
 * @param field Name of the field to extract values for
 * @return Set of extracted values/*from w  w  w. ja  v  a2s  .co  m*/
 */
private Set<String> extractValues(final NQuery nquery, final Document doc, final String field) {
    Set<String> values = new HashSet<String>();

    if (NQuery.isFieldToCollapse(field)) {
        // process merged field
        String mfield = NQuery.getMergedField();
        QueryScorer scorer = new QueryScorer(nquery.getQuery(), mfield);
        Highlighter highlighter = new Highlighter(scorer);
        highlighter.setTextFragmenter(new NullFragmenter());

        try {
            Set<String> buffer = new HashSet<String>();

            for (Fieldable f : doc.getFieldables(mfield)) {
                String content = f.stringValue();
                String value = normalizeValue(NQuery.extractValue(field, content));

                // Test if the field was matched by the query
                TokenStream ts = TokenSources.getTokenStream(mfield, content, nquery.getAnalyzer());
                if (highlighter.getBestFragment(ts, content) != null) {
                    values.add(value);
                } else {
                    // Buffer the value - in case no field matches
                    buffer.add(value);
                }
            }

            if (values.isEmpty()) {
                // No field was matched by the query
                values.addAll(buffer);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (InvalidTokenOffsetsException e) {
            throw new RuntimeException(e);
        }
    } else {
        for (String v : doc.getValues(field)) {
            values.add(normalizeValue(v));
        }
    }

    return values;
}

From source file:docet.engine.SimpleDocetDocSearcher.java

License:Apache License

@Override
public List<DocetPage> searchForMatchingDocuments(final String searchText, final String lang,
        final int maxNumResults) throws DocetDocumentSearchException {
    // Searches the language-specific content field for searchText and returns
    // up to maxNumResults pages, each carrying highlighted excerpts and a
    // relevance score normalized against the best-scoring hit.
    final List<DocetPage> results = new ArrayList<>();
    // Prefer a configured fallback language (if any) over the requested one.
    final String fallbackLang = this.getFallbackLangForLang(lang);
    final String actualSearchLang;
    if (fallbackLang.isEmpty()) {
        actualSearchLang = lang;
    } else {
        actualSearchLang = fallbackLang;
    }
    try {
        final IndexSearcher searcher = new IndexSearcher(reader);
        final Analyzer analyzer = new AnalyzerBuilder().language(actualSearchLang).build();
        QueryParser queryParser = new QueryParser(LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang, analyzer);
        final Query query = queryParser.parse(constructLucenePhraseTermSearchQuery(searchText));
        final QueryScorer queryScorer = new QueryScorer(query, LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang);

        final Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        final Highlighter highlighter = new Highlighter(queryScorer);
        // Analyze the full document text when extracting fragments.
        highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
        highlighter.setTextFragmenter(fragmenter);

        final TopDocs res = searcher.search(query, maxNumResults);
        final float maxScore = res.getMaxScore();
        final List<ScoreDoc> scoreDocs = Arrays.asList(res.scoreDocs);
        // docs: document -> assembled excerpt text; scoresForDocs: doc id ->
        // its ScoreDoc, used later to compute the relevance percentage.
        Map<org.apache.lucene.document.Document, String> docs = new HashMap<>();
        Map<String, ScoreDoc> scoresForDocs = new HashMap<>();
        for (final ScoreDoc sd : scoreDocs) {
            final org.apache.lucene.document.Document doc = searcher.doc(sd.doc);
            final String contents = doc.get(LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang);
            final String docId = doc.get("id");
            final String[] fragments = highlighter.getBestFragments(analyzer,
                    LUCENE_QUERY_CONTENT_PREFIX + actualSearchLang, contents, MAX_NUM_FRAGMENTS);
            List<String> fragmentList = Arrays.asList(fragments);
            // Per fragment: split into lines, drop blank lines, and rejoin the
            // remainder with the excerpt separator.
            fragmentList = fragmentList.stream().map(s1 -> s1.trim().split("\n"))
                    .map(s1 -> Arrays.asList(s1).stream().filter(s -> !s.trim().isEmpty())
                            .reduce((sa, sb) -> sa + MACHING_EXCERPTS_SEPARATOR + sb)
                            .orElse(MACHING_EXCERPTS_SEPARATOR))
                    .collect(Collectors.toList());
            // Join all non-empty fragments with "..." and wrap in separators.
            docs.put(doc,
                    MACHING_EXCERPTS_SEPARATOR
                            + fragmentList.stream().filter(s -> !s.isEmpty())
                                    .reduce((s1, s2) -> s1 + "..." + s2).orElse("")
                            + MACHING_EXCERPTS_SEPARATOR);
            scoresForDocs.putIfAbsent(docId, sd);
        }
        docs.entrySet().stream().forEach(e -> {
            // Relevance: this document's score relative to the best hit, 0-100.
            final int relevance = Math.round((scoresForDocs.get(e.getKey().get("id")).score / maxScore) * 100);
            results.add(DocetPage.toDocetDocument(e.getKey(), e.getValue(), relevance));
        });
        return results;
    } catch (ParseException | IOException | InvalidTokenOffsetsException ex) {
        throw new DocetDocumentSearchException(
                "Error on searching query " + searchText + " for lang " + actualSearchLang, ex);
    }
}

From source file:drakkar.mast.retrieval.LuceneContext.java

/**
 * Para la sumarizacin/*from  www.j a  v a 2s  .com*/
 *
 * @return
 */
private String getHighlighter(Query q, Analyzer a, String text, String field) {

    String summary = null;

    this.hg = new Highlighter(new QueryTermScorer(q));
    this.hg.setTextFragmenter(new SimpleFragmenter(20));
    this.hg.setMaxDocCharsToAnalyze(600);

    try {
        try {
            this.tokens = TokenSources.getTokenStream(field, text, a);
            summary = this.hg.getBestFragments(this.tokens, text, 20, "...");
            // summary = this.hg.getBestFragments(this.tokens, text, 10).toString();
        } catch (IOException ex) {
            OutputMonitor.printStream("IO", ex);
        }
    } catch (InvalidTokenOffsetsException ex) {
        OutputMonitor.printStream("", ex);
    }

    if (summary == null) {
        summary = " ";
    }
    return summary;
}

From source file:engine.easy.search.EasySearchEngine.java

License:Apache License

/**
 * Demo of highlighting documents that match a hard-coded phrase query
 * ("KENNEDY ADMINISTRATION" in the CONTENT field).
 *
 * <p>NOTE(review): the highlighting step itself appears unfinished - the
 * token stream and documents fetched in the loop are never consumed, and the
 * method always returns {@code null}.
 *
 * @return always {@code null}
 */
public String highlightedText() {

    try {
        Analyzer analyzer = new EasySearchAnalyzer();

        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("CONTENT", "KENNEDY"));
        phraseQuery.add(new Term("CONTENT", "ADMINISTRATION"));

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        try {
            Query query = getQuery(phraseQuery.toString());
            QueryScorer scorer = new QueryScorer(query, AppConstants.CONTENT_FIELD);
            Highlighter highlighter = new Highlighter(scorer);

            Set<Term> terms = new HashSet<Term>();
            query.extractTerms(terms);

            StringBuffer text = new StringBuffer("");

            for (Term term : terms) {
                TermDocs docs = indexReader.termDocs(term);

                while (docs.next()) {
                    Integer id = docs.doc();
                    Document document = indexReader.document(id);

                    // TODO(review): stream/document are created but never used;
                    // the best-fragment extraction was left commented out in
                    // the original and has been removed as dead code.
                    TokenStream stream = analyzer.tokenStream("FIELDNAME", new StringReader(text.toString()));
                }
            }
        } finally {
            // Fix: the reader and directory were previously leaked.
            indexReader.close();
            indexDir.close();
        }
    } catch (Exception e) {
        System.out.println("Exception: getResults " + e.toString());
    }

    return null;
}

From source file:iit.cs570.assign1.web.managedbean.BM25Searcher.java

@Override
public String getBestFragment(Document doc) throws IOException, InvalidTokenOffsetsException {
    // Highlight the current query's terms within the document's text field.
    final Highlighter highlighter = new Highlighter(new QueryScorer(query));
    final String text = doc.get(TheCrawlersConstants.TEXT);
    return highlighter.getBestFragment(new StandardAnalyzer(Version.LUCENE_42), TheCrawlersConstants.TEXT,
            text);
}

From source file:iit.cs570.assign1.web.managedbean.SIMBM25Searcher.java

public String getBestFragment(Document doc) throws IOException, InvalidTokenOffsetsException {
    // Score fragments of the document's text field against the active query
    // and return the best-matching snippet.
    Scorer fragmentScorer = new QueryScorer(query);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
    return new Highlighter(fragmentScorer).getBestFragment(analyzer, TheCrawlersConstants.TEXT,
            doc.get(TheCrawlersConstants.TEXT));
}

From source file:net.chwise.documents.HighlightedFragmentsRetriever.java

License:Open Source License

/**
 * Returns up to {@code fragmentNumber} fragments of {@code fieldContents}
 * with the query terms highlighted. When nothing matches, falls back to the
 * leading {@code fragmentSize} characters of the contents.
 */
public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String fieldName,
        String fieldContents, int fragmentNumber, int fragmentSize)
        throws IOException, InvalidTokenOffsetsException {

    // Score fragments against the query restricted to the given field.
    QueryScorer queryScorer = new QueryScorer(query, fieldName);

    Highlighter highlighter = new Highlighter(queryScorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer, fragmentSize));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); // analyze the whole contents

    TokenStream tokenStream = TokenSources.getTokenStream(fieldName, fieldContents, analyzer);
    String[] best = highlighter.getBestFragments(tokenStream, fieldContents, fragmentNumber);
    if (best.length > 0) {
        return best;
    }

    // No highlighted fragment: return the starting piece of the contents.
    return new String[] { fieldContents.substring(0, Math.min(fragmentSize, fieldContents.length())) };
}

From source file:net.paoding.analysis.TestPaodingAnalyzer.java

License:Apache License

@Test
public void testHighlighting() throws Exception {

    Analyzer a = new PaodingAnalyzer();
    QueryParser parser = new QueryParser(Version.LUCENE_46, "f", a);

    Query q = parser.parse("domnick");
    String txt = "Domnick Hunter 0.01m , ?OIL-X Plus";

    Highlighter highlighter = new Highlighter(new QueryScorer(q));
    String resp = highlighter.getBestFragment(a.tokenStream("f", txt), txt);

    // Fix: guard against resp == null, which previously caused an NPE in
    // resp.contains(...) instead of a clean assertion failure.
    assertTrue("no highlighted fragment produced", resp != null);
    assertTrue(resp + " is not correctly highlighted", resp.contains("<B>Domnick</B>"));

}

From source file:org.opencms.search.documents.CmsTermHighlighterHtml.java

License:Open Source License

/**
 * @see org.opencms.search.documents.I_CmsTermHighlighter#getExcerpt(org.apache.lucene.document.Document, org.opencms.search.CmsSearchIndex, org.opencms.search.CmsSearchParameters, org.apache.lucene.search.Query, org.apache.lucene.analysis.Analyzer)
 */
public String getExcerpt(Document doc, CmsSearchIndex index, CmsSearchParameters params, Query query,
        Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {

    // Defensive: excerpt generation is best-effort; bail out on any null input.
    if ((doc == null) || (index == null) || (params == null) || (analyzer == null) || (query == null)) {
        return null;
    }
    Highlighter highlighter = null;
    Iterator<String> excerptFieldNames = index.getFieldConfiguration().getExcerptFieldNames().iterator();
    // Fix: local-only buffer, so StringBuilder replaces synchronized StringBuffer.
    StringBuilder excerptBuffer = new StringBuilder();
    while (excerptFieldNames.hasNext()) {
        String fieldName = excerptFieldNames.next();
        boolean createExcerpt = !params.isExcerptOnlySearchedFields() || params.getFields().contains(fieldName);
        if (createExcerpt && (doc.getFieldable(fieldName) != null)) {
            // only generate field excerpt if the field is available in the document
            String text = doc.getFieldable(fieldName).stringValue();
            // make sure all XML in the text is escaped, otherwise excerpt HTML output may be garbled
            text = CmsEncoder.escapeXml(text);

            TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));

            if (params.isExcerptOnlySearchedFields()) {
                // highlight the search query only in the matching fields:
                // a fresh per-field highlighter scoped to this field
                highlighter = new Highlighter(new QueryScorer(query, fieldName));
            } else {
                // highlight search query in all fields: one shared highlighter
                if (highlighter == null) {
                    highlighter = new Highlighter(new QueryScorer(query));
                }
            }
            String fragment = highlighter.getBestFragments(stream, text, EXCERPT_REQUIRED_FRAGMENTS,
                    EXCERPT_FRAGMENT_SEPARATOR);

            // normalize all whitespace control chars in the excerpt to spaces
            fragment = fragment.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ').replace('\f', ' ');

            if (excerptBuffer.length() > 0) {
                // this is not the first fragment
                excerptBuffer.append(EXCERPT_FRAGMENT_SEPARATOR);
            }
            excerptBuffer.append(fragment);
        }
    }

    String result = null;
    if (excerptBuffer.length() > 0) {
        result = excerptBuffer.toString();
    }

    // Trim the combined excerpt to the configured maximum length.
    int maxLength = OpenCms.getSearchManager().getMaxExcerptLength();
    if ((result != null) && (result.length() > maxLength)) {
        result = result.substring(0, maxLength);
    }

    return result;
}