Example usage for org.apache.lucene.search.highlight Highlighter getBestFragment

List of usage examples for org.apache.lucene.search.highlight Highlighter getBestFragment

Introduction

On this page you can find example usages of org.apache.lucene.search.highlight.Highlighter#getBestFragment.

Prototype

public final String getBestFragment(Analyzer analyzer, String fieldName, String text)
        throws IOException, InvalidTokenOffsetsException 

Source Link

Document

Highlights chosen terms in a text, extracting the most relevant section.

Usage

From source file:iit.cs570.assign1.web.managedbean.BM25Searcher.java

@Override
public String getBestFragment(Document doc) throws IOException, InvalidTokenOffsetsException {
    // Pick the best-scoring snippet from the stored TEXT field, scored
    // against the searcher's current query.
    Highlighter highlighter = new Highlighter(new QueryScorer(query));
    String storedText = doc.get(TheCrawlersConstants.TEXT);
    return highlighter.getBestFragment(new StandardAnalyzer(Version.LUCENE_42),
            TheCrawlersConstants.TEXT, storedText);
}

From source file:iit.cs570.assign1.web.managedbean.SIMBM25Searcher.java

public String getBestFragment(Document doc) throws IOException, InvalidTokenOffsetsException {
    // Extract the most relevant snippet of the document's TEXT field for
    // the current query.
    QueryScorer termScorer = new QueryScorer(query);
    Highlighter snippetExtractor = new Highlighter(termScorer);
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
    return snippetExtractor.getBestFragment(analyzer, TheCrawlersConstants.TEXT,
            doc.get(TheCrawlersConstants.TEXT));
}

From source file:natural.language.qa.LuceneSearch.java

License:Apache License

/**
 * Runs the given query string against the Lucene index configured in
 * "index.properties" and returns up to maxRes hits.  For each hit the
 * document body is re-read from disk and the best matching fragment
 * (about 50 chars, plain text — empty highlight tags) is extracted.
 *
 * @param queryString raw user query; blank input yields an empty list
 * @param maxRes maximum number of hits to collect
 * @return one LuceneSearchResult per hit (doc id, path, best fragment)
 * @throws Exception on I/O, parse or highlighting failure
 */
public List<LuceneSearchResult> search(String queryString, int maxRes) throws Exception {
    IndexSearcher searcher = null;
    List<LuceneSearchResult> results = new ArrayList<LuceneSearchResult>();
    try {
        Properties indexConf = new Properties();
        FileInputStream fis = new FileInputStream("index.properties");
        try {
            indexConf.load(fis);
        } finally {
            // Properties.load() does not close its stream; without this the
            // file handle was leaked on every search.
            fis.close();
        }

        String index = indexConf.getProperty("index");
        String field = "contents";

        Directory indexDir = FSDirectory.open(new File(index));

        searcher = new IndexSearcher(indexDir);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

        QueryParser parser = new QueryParser(Version.LUCENE_31, field, analyzer);

        queryString = queryString.trim();
        if (queryString.length() == 0) {
            return results;
        }

        Query query = parser.parse(queryString);
        System.out.println("Searching for: " + query.toString(field));

        // Highlighter setup: empty pre/post tags, so "highlighting" here is
        // pure fragment selection; fragments are capped at ~50 chars.
        Formatter f = new SimpleHTMLFormatter("", "");
        Encoder e = new DefaultEncoder();
        QueryScorer fs = new QueryScorer(query);
        Fragmenter fragmenter = new SimpleSpanFragmenter(fs, 50);
        Highlighter h = new Highlighter(f, e, fs);
        h.setTextFragmenter(fragmenter);
        h.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        // Collect docs
        TopDocs res = searcher.search(query, maxRes);
        int numTotalHits = res.totalHits;
        ScoreDoc[] scoreDocs = res.scoreDocs;

        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String path = doc.get("path");
            // The index stores only the path; re-read the document body so
            // the highlighter has the full text to pick a fragment from.
            String content = readDocument(path);
            String frag = h.getBestFragment(analyzer, field, content);
            results.add(new LuceneSearchResult(scoreDoc.doc, path, frag));
        }
        System.out.println(numTotalHits + " total matching documents");
    } finally {
        if (searcher != null) {
            searcher.close();
        }
    }
    return results;
}

From source file:net.sourceforge.docfetcher.model.search.HighlightService.java

License:Open Source License

@MutableCopy
@NotNull
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> ranges = new ArrayList<Range>();
    /*
     * The Formatter contract asks for formatted text, but we only need the
     * start/end offsets of the matching terms, so this implementation
     * records each scored token's offsets into 'ranges' and returns null.
     */
    Formatter offsetCollector = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            int tokenCount = tokenGroup.getNumTokens();
            for (int index = 0; index < tokenCount; index++) {
                if (tokenGroup.getScore(index) == 0)
                    continue;
                Token token = tokenGroup.getToken(index);
                int from = token.startOffset();
                int to = token.endOffset();
                ranges.add(new Range(from, to - from));
            }
            return null;
        }
    };
    String contentKey = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(offsetCollector, new QueryScorer(query, contentKey));
    highlighter.setTextFragmenter(new NullFragmenter());
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    try {
        /*
         * The return value is ignored; only the offsets matter.  Might
         * throw an OutOfMemoryError on very large inputs.
         */
        highlighter.getBestFragment(IndexRegistry.getAnalyzer(), contentKey, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
        Util.printErr(e);
    }
    return ranges;
}

From source file:net.sourceforge.docfetcher.view.PreviewPanel.java

License:Open Source License

/**
 * Sets the file to be displayed, using <tt>parser</tt> to extract the
 * text from the file on disk. This method does nothing if the given
 * file is null. The <tt>force</tt> parameter specifies whether the
 * preview should be updated even if neither the file nor the search terms
 * have changed in the meantime.
 */
private void setFile(final File file, final Parser parser, final Query query, boolean force) {
    // Snapshot the previous state so the "nothing changed" check below can
    // compare against it after the fields have been overwritten.
    File lastFile = this.file;
    Query lastQuery = this.query;
    this.file = file;
    this.parser = parser;
    this.query = query;

    // Check input
    if (file == null)
        return;
    if (parser == null) // Allowed to be null if file is null, too
        throw new IllegalArgumentException();
    if (!isActive)
        return; // Preview not visible; skip the (expensive) parse entirely.
    if (file.equals(lastFile) && !force)
        if (query != null && query.equals(lastQuery))
            return; // Same file, same query, not forced: already up to date.

    if (file.isDirectory())
        throw new IllegalStateException("File expected for preview, got directory instead."); //$NON-NLS-1$
    if (!file.exists()) {
        textViewer.setText(Msg.file_not_found.value());
        showViewer(textViewerContainer);
        return;
    }

    // Use the HTML browser
    if (file.getAbsolutePath().equals(Const.HELP_FILE) || Pref.Bool.PreviewHTML.getValue()) {
        final BrowserPanel browser = browserProvider.getBrowser(previewPanel, browserToolBar, parser);
        if (browser != null) {
            // Highlighting controls are disabled in browser mode: no term
            // offsets are computed for HTML previews.
            browser.addProgressListener(new ProgressAdapter() {
                public void completed(ProgressEvent event) {
                    showViewer(browser);
                    upBt.setEnabled(false);
                    downBt.setEnabled(false);
                    occurrenceCounter.setText("0"); //$NON-NLS-1$
                }
            });
            browser.setFile(file);
            return;
        }
        // Browser creation failed, go on to next code block
    }

    // Use text renderers
    showViewer(textViewerContainer);

    // Use monospace font for text files
    if (parser instanceof TextParser) {
        org.eclipse.swt.graphics.Font monoFont = Font.PREVIEW_MONO.getFont();
        if (!textViewer.getFont().equals(monoFont))
            textViewer.setFont(monoFont);
    } else {
        org.eclipse.swt.graphics.Font previewFont = Font.PREVIEW.getFont();
        if (!textViewer.getFont().equals(previewFont))
            textViewer.setFont(previewFont);
    }

    textViewer.setText(Msg.loading.value()); // display loading message

    new Thread() { // run in a thread because parsing the file takes some time
        public void run() {
            // Extract the raw text from the file
            String text;
            boolean fileParsed = true;
            try {
                text = parser.renderText(file);
            } catch (ParseException e) {
                text = Msg.cant_read_file.format(e.getMessage());
                fileParsed = false;
            } catch (OutOfMemoryError e) {
                /*
                 * We can get here if the user sets a high java heap space
                 * value during indexing and then sets a lower value for
                 * search only usage.
                 */
                text = Msg.out_of_jvm_memory.value();
                fileParsed = false;
            }

            // Identity (==) comparison is intentional: it detects that the
            // field was reassigned by a newer setFile(..) call.
            if (PreviewPanel.this.file != file)
                return; // Another preview request had been started while we were parsing

            /*
             * Create the message that will be displayed if the character limit
             * is reached. It is appended to the file contents later; if it
             * was appended here, some words in it might get highlighted.
             */
            int maxLength = Pref.Int.PreviewLimit.getValue();
            final String msg = "...\n\n\n[" //$NON-NLS-1$
                    + Msg.preview_limit_hint.format(new Object[] { maxLength, Pref.Int.PreviewLimit.name(),
                            Const.USER_PROPERTIES_FILENAME })
                    + "]"; //$NON-NLS-1$
            final boolean exceeded = text.length() > maxLength;
            if (text.length() > maxLength)
                text = text.substring(0, maxLength - msg.length());
            final String fText = text;

            /*
             * Create StyleRange ranges (i.e. start-end integer pairs) for
             * search term highlighting. Only tokenize preview text if we're
             * not displaying any info messages and if there are tokens to
             * highlight.
             */
            ranges = new int[0];
            if (fileParsed && query != null) {
                // Flattened (start, length) pairs; consumed below and by
                // setHighlighting(..).
                final List<Integer> rangesList = new ArrayList<Integer>();
                Analyzer analyzer = RootScope.analyzer;

                /*
                 * A formatter is supposed to return formatted text, but
                 * since we're only interested in the start and end offsets
                 * of the search terms, we return null and store the offsets
                 * in a list.
                 */
                Formatter nullFormatter = new Formatter() {
                    public String highlightTerm(String originalText, TokenGroup tokenGroup) {
                        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                            Token token = tokenGroup.getToken(i);
                            if (tokenGroup.getScore(i) == 0)
                                continue;
                            int start = token.startOffset();
                            int end = token.endOffset();
                            rangesList.add(start);
                            rangesList.add(end - start);
                        }
                        return null;
                    }
                };

                Highlighter highlighter = new Highlighter(nullFormatter,
                        new QueryScorer(query, Document.contents));
                highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
                highlighter.setTextFragmenter(new NullFragmenter());
                try {
                    /*
                     * This has a return value, but we ignore it since we
                     * only want the offsets.
                     */
                    highlighter.getBestFragment(analyzer, Document.contents, fText);
                } catch (Exception e) {
                    // We can do without the search term highlighting
                }

                // List to array (will be used by the method 'setHighlighting(..)')
                ranges = new int[rangesList.size()];
                for (int i = 0; i < ranges.length; i++)
                    ranges[i] = rangesList.get(i);
            }

            // Parsing and tokenizing done; display the results
            final boolean fFileParsed = fileParsed;
            // UI updates must happen on the SWT display thread, hence syncExec.
            Display.getDefault().syncExec(new Runnable() {
                public void run() {
                    // Enable or disable up and down buttons
                    upBt.setEnabled(ranges.length != 0);
                    downBt.setEnabled(ranges.length != 0);

                    textViewer.setText(fText);
                    setHighlighting(fFileParsed && Pref.Bool.HighlightSearchTerms.getValue());
                    // ranges holds (start, length) pairs, so hit count is length / 2.
                    occurrenceCounter.setText(Integer.toString(ranges.length / 2));
                    if (exceeded)
                        textViewer.append(msg); // character limit exceeded, append hint
                }
            });
        }
    }.start();
}

From source file:net.sourceforge.vaticanfetcher.model.search.HighlightService.java

License:Open Source License

@MutableCopy
@NotNull
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> hitRanges = new ArrayList<Range>();
    /*
     * Formatter that never formats: it only records each scored token's
     * (offset, length) into hitRanges and returns null, because the caller
     * needs the positions of the search terms rather than marked-up text.
     */
    Formatter rangeRecorder = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            for (int t = 0, n = tokenGroup.getNumTokens(); t < n; t++) {
                if (tokenGroup.getScore(t) == 0)
                    continue;
                Token tok = tokenGroup.getToken(t);
                int begin = tok.startOffset();
                hitRanges.add(new Range(begin, tok.endOffset() - begin));
            }
            return null;
        }
    };
    String fieldKey = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(rangeRecorder, new QueryScorer(query, fieldKey));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
        /*
         * Return value intentionally discarded; only the recorded offsets
         * matter.  Might throw an OutOfMemoryError on huge inputs.
         */
        highlighter.getBestFragment(IndexRegistry.analyzer, fieldKey, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
        Util.printErr(e);
    }
    return hitRanges;
}

From source file:top.sj.lucene.LuceneSearchUtil.java

License:Open Source License

/**
 * Searches a single index field with a single query condition and returns
 * the matching documents with their best fragments highlighted.
 * (NOTE(review): original Javadoc was mojibake-garbled; reconstructed from
 * the code — confirm against the original source.)
 *
 * @param primaryKeyByHibernateEntity
 *            name of the stored field holding the Hibernate entity's primary key
 * @param analysisTarget
 *            name of the indexed field to search and highlight
 * @param analysisCondition
 *            the query string to parse and search for
 * @return search results pairing each entity primary key with its highlighted fragment
 * @throws IOException
 * @throws ParseException
 * @throws InvalidTokenOffsetsException
 */
/**
 * Searches a single index field with a single query condition and returns
 * the hits with matched terms wrapped in &lt;b&gt;...&lt;/b&gt; tags.
 *
 * @param primaryKeyByHibernateEntity stored field holding the Hibernate entity's primary key
 * @param analysisTarget indexed field to search and highlight
 * @param analysisCondition query string to parse and search for
 * @return results pairing each entity primary key with its highlighted best fragment
 * @throws IOException on index access failure (including an unopenable index directory)
 * @throws ParseException if the condition cannot be parsed
 * @throws InvalidTokenOffsetsException if highlighting encounters invalid token offsets
 */
public static List<LuceneSearchDTO> searchOfSingleAreaAndSingleCondition(String primaryKeyByHibernateEntity,
        String analysisTarget, String analysisCondition)
        throws IOException, ParseException, InvalidTokenOffsetsException {

    String configPath = PropertiesTool.getPropertiesFileAsObject("lucene_config.properties")
            .getProperty("index_location");
    Directory dir;
    try {
        dir = FSDirectory.open(new File(configPath));
    } catch (Exception e) {
        // Propagate instead of swallowing: the old code printed the stack
        // trace and continued with dir == null, failing later with an
        // uninformative NullPointerException.
        throw new IOException("Cannot open Lucene index at " + configPath, e);
    }

    IndexSearcher searcher = new IndexSearcher(dir);
    try {
        QueryParser parser = new QueryParser(Version.LUCENE_30, analysisTarget,
                new StandardAnalyzer(Version.LUCENE_30));

        Query query = parser.parse(analysisCondition);
        TopDocs topDocs = searcher.search(query, MAX_SEARCH_RESULT);

        QueryScorer queryScorer = new QueryScorer(query);
        // Bug fix: the closing tag was "<b/>" (an empty element), which left
        // every highlight unterminated in the rendered HTML.
        Formatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(formatter, queryScorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer));

        List<LuceneSearchDTO> analysisResults = new ArrayList<LuceneSearchDTO>();
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            int docId = topDocs.scoreDocs[i].doc;
            Document doc = searcher.doc(docId);
            String attr = highlighter.getBestFragment(new StandardAnalyzer(Version.LUCENE_30), analysisTarget,
                    doc.get(analysisTarget));
            analysisResults.add(new LuceneSearchDTO(Integer.valueOf(doc.get(primaryKeyByHibernateEntity)), attr));
        }
        return analysisResults;
    } finally {
        // Release the underlying index reader; the previous version leaked it.
        searcher.close();
    }
}

From source file:top.sj.lucene.LuceneSearchUtil.java

License:Open Source License

/**
 * Searches a single index field with multiple query conditions combined
 * into one BooleanQuery (the first condition is mandatory, the rest are
 * optional score boosters) and returns the matching documents highlighted.
 * (NOTE(review): original Javadoc was mojibake-garbled; reconstructed from
 * the code — confirm against the original source.)
 *
 * @param primaryKeyByHibernateEntity
 *            name of the stored field holding the Hibernate entity's primary key
 * @param analysisTarget
 *            name of the indexed field to search and highlight
 * @param analysisConditions
 *            one or more query strings to parse; the first MUST match, the rest SHOULD
 * @return search results pairing each entity primary key with its highlighted fragment
 * @throws IOException
 * @throws ParseException
 * @throws InvalidTokenOffsetsException
 */
/**
 * Searches a single index field with multiple query conditions combined into
 * one BooleanQuery (first condition MUST match, the rest SHOULD match) and
 * returns the hits with matched terms wrapped in &lt;b&gt;...&lt;/b&gt; tags.
 *
 * @param primaryKeyByHibernateEntity stored field holding the Hibernate entity's primary key
 * @param analysisTarget indexed field to search and highlight
 * @param analysisConditions one or more query strings to parse
 * @return results pairing each entity primary key with its highlighted best fragment
 * @throws IOException on index access failure
 * @throws ParseException if a condition cannot be parsed
 * @throws InvalidTokenOffsetsException if highlighting encounters invalid token offsets
 */
public static List<LuceneSearchDTO> searchOfSingleAreaAndMultiCondition(String primaryKeyByHibernateEntity,
        String analysisTarget, String... analysisConditions)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    String configPath = PropertiesTool.getPropertiesFileAsObject("lucene_config.properties")
            .getProperty("index_location");
    Directory dir = FSDirectory.open(new File(configPath));

    IndexSearcher searcher = new IndexSearcher(dir);
    try {
        QueryParser parser = new QueryParser(Version.LUCENE_30, analysisTarget,
                new StandardAnalyzer(Version.LUCENE_30));

        BooleanQuery query = new BooleanQuery();
        for (int i = 0; i < analysisConditions.length; i++) {
            // The first condition is mandatory; the others only boost scoring.
            query.add(parser.parse(analysisConditions[i]), i == 0 ? Occur.MUST : Occur.SHOULD);
        }
        TopDocs topDocs = searcher.search(query, MAX_SEARCH_RESULT);

        QueryScorer queryScorer = new QueryScorer(query);
        // Bug fix: the closing tag was "<b/>" (an empty element), which left
        // every highlight unterminated in the rendered HTML.
        Formatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(formatter, queryScorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer));

        List<LuceneSearchDTO> analysisResults = new ArrayList<LuceneSearchDTO>();
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            int docId = topDocs.scoreDocs[i].doc;
            Document doc = searcher.doc(docId);
            String attr = highlighter.getBestFragment(new StandardAnalyzer(Version.LUCENE_30), analysisTarget,
                    doc.get(analysisTarget));
            analysisResults.add(new LuceneSearchDTO(Integer.valueOf(doc.get(primaryKeyByHibernateEntity)), attr));
        }
        return analysisResults;
    } finally {
        // Release the underlying index reader; the previous version leaked it.
        searcher.close();
    }
}

From source file:uk.ac.ebi.arrayexpress.utils.saxon.search.QueryHighlighter.java

License:Apache License

/**
 * Wraps query matches in the given text with HIT_OPEN_MARK/HIT_CLOSE_MARK.
 * Falls back to the unmodified text when nothing matched or an error occurred.
 */
public String highlightQuery(QueryInfo queryInfo, String fieldName, String text) {
    try {
        String effectiveField = "".equals(fieldName) ? this.env.defaultField : fieldName;
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter(HIT_OPEN_MARK, HIT_CLOSE_MARK),
                new QueryScorer(queryInfo.getQuery(), fieldName, this.env.defaultField));
        highlighter.setTextFragmenter(new NullFragmenter());

        String highlighted = highlighter.getBestFragment(this.env.indexAnalyzer, effectiveField, text);
        if (highlighted != null) {
            return highlighted;
        }
    } catch (Exception x) {
        logger.error("Caught an exception:", x);
    }
    // No highlightable match (or an error): return the original text.
    return text;
}

From source file:uk.ac.ebi.arrayexpress.utils.search.EFOExpandedHighlighter.java

License:Apache License

/**
 * Wraps query matches in the given text with the supplied open/close marks.
 * Falls back to the unmodified text when nothing matched or an error occurred.
 */
private String doHighlightQuery(Query query, String fieldName, String text, String openMark, String closeMark) {
    try {
        String targetField = "".equals(fieldName) ? this.env.defaultField : fieldName;
        SimpleHTMLFormatter markFormatter = new SimpleHTMLFormatter(openMark, closeMark);
        QueryScorer scorer = new QueryScorer(query, fieldName, this.env.defaultField);
        Highlighter highlighter = new Highlighter(markFormatter, scorer);
        highlighter.setTextFragmenter(new NullFragmenter());

        String fragment = highlighter.getBestFragment(this.env.indexAnalyzer, targetField, text);
        if (fragment != null) {
            return fragment;
        }
    } catch (Exception x) {
        logger.error("Caught an exception:", x);
    }
    // No highlightable match (or an error): return the original text.
    return text;
}