Example usage for org.apache.lucene.search.highlight Highlighter setTextFragmenter

List of usage examples for org.apache.lucene.search.highlight Highlighter setTextFragmenter

Introduction

On this page you can find example usage for org.apache.lucene.search.highlight Highlighter setTextFragmenter.

Prototype

public void setTextFragmenter(Fragmenter fragmenter) 

Source Link

Usage

From source file:lius.search.LiusHitList.java

License:Apache License

/**
 * Builds a {@link LiusHit} for the search result at the given position,
 * copying the configured display fields out of the Lucene document and,
 * when highlighting is enabled in the configuration, replacing each field
 * value with its highlighted fragment(s).
 *
 * @param index position of the hit in the underlying Lucene hits
 * @return the populated LiusHit
 * @throws IOException if the index or a token stream cannot be read
 */
private LiusHit buildLiusHit(int index) throws IOException {

    LiusHit liusHit = new LiusHit();
    liusHit.setScore(luceneHits.score(index));
    liusHit.setDocId(luceneHits.id(index));

    Document luceneDocument = luceneHits.doc(index);

    Map liusHitFieldsMap = new HashMap();
    List liusFieldsList = new ArrayList();
    Highlighter luceneHighlighter = null;
    Analyzer luceneAnalyzer = null;

    if (liusConfig.getHighlighter()) {
        // The reader is only needed to rewrite the query (expand wildcard/
        // prefix terms) for the scorer; close it immediately so we don't
        // leak a file handle per hit.
        IndexReader luceneIndexReader = IndexReader.open(indexDirectory);
        try {
            Query rewrittenLuceneQuery = luceneQuery.rewrite(luceneIndexReader);
            QueryScorer luceneScorer = new QueryScorer(rewrittenLuceneQuery);
            SimpleHTMLFormatter luceneFormatter = new SimpleHTMLFormatter("<span class=\"liusHit\">",
                    "</span>");
            luceneHighlighter = new Highlighter(luceneFormatter, luceneScorer);
        } finally {
            luceneIndexReader.close();
        }
        // The analyzer depends only on the configuration, so create it once
        // here instead of once per field value inside the loops below.
        luceneAnalyzer = AnalyzerFactory.getAnalyzer(liusConfig);
    }

    for (int j = 0; j < liusConfig.getDisplayFields().size(); j++) {
        LiusField configLiusField = (LiusField) liusConfig.getDisplayFields().get(j);
        LiusField hitLiusField = new LiusField();
        String fieldName = configLiusField.getName();

        hitLiusField.setName(fieldName);
        hitLiusField.setLabel(configLiusField.getLabel());

        if (luceneHighlighter != null) {
            // Fragment size comes from the field configuration when present;
            // otherwise highlight the whole value as a single fragment.
            Fragmenter luceneFragmenter;
            if (configLiusField.getFragmenter() != null) {
                luceneFragmenter = new SimpleFragmenter(Integer.parseInt(configLiusField.getFragmenter()));
            } else {
                luceneFragmenter = new SimpleFragmenter(Integer.MAX_VALUE);
            }
            luceneHighlighter.setTextFragmenter(luceneFragmenter);
        }
        String[] luceneDocumentValues = luceneDocument.getValues(fieldName);
        if (luceneDocumentValues != null) {
            if (luceneHighlighter != null) {
                for (int k = 0; k < luceneDocumentValues.length; k++) {
                    TokenStream luceneTokenStream = luceneAnalyzer.tokenStream(fieldName,
                            new StringReader(luceneDocumentValues[k]));
                    String fragment;
                    if (configLiusField.getFragmenter() != null) {
                        fragment = luceneHighlighter.getBestFragments(luceneTokenStream,
                                luceneDocumentValues[k], 5, "...");
                    } else {
                        fragment = luceneHighlighter.getBestFragment(luceneTokenStream,
                                luceneDocumentValues[k]);
                    }
                    // Keep the original value when nothing matched.
                    if (fragment != null) {
                        luceneDocumentValues[k] = fragment;
                    }
                }
            }

            hitLiusField.setValue(luceneDocumentValues[0]);
            hitLiusField.setValues(luceneDocumentValues);

            liusHitFieldsMap.put(fieldName, hitLiusField);
            liusFieldsList.add(hitLiusField);
        }

    }
    liusHit.setLiusFieldsMap(liusHitFieldsMap);
    liusHit.setLiusFields(liusFieldsList);
    return liusHit;
}

From source file:lucandra.LucandraTests.java

License:Apache License

/**
 * Verifies highlighting over a term-position-vector backed index: an exact
 * phrase query must match exactly one document, and the highlighter must
 * reproduce the expected highlighted text for it.
 */
public void testHighlight() throws Exception {

    // Exercises the TermPositionVector classes.
    IndexReader reader = new IndexReader(indexName, client);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "key", analyzer);

    // An exact phrase match should yield a single hit.
    Query phraseQuery = parser.parse("+key:\"foobar foobar\"");
    TopDocs hits = indexSearcher.search(phraseQuery, 10);
    assertEquals(1, hits.totalHits);

    QueryScorer queryScorer = new QueryScorer(phraseQuery, "key", text);
    Highlighter fragmentHighlighter = new Highlighter(new SimpleHTMLFormatter(), queryScorer);
    fragmentHighlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));

    TokenStream termVectorStream = TokenSources.getTokenStream(reader, hits.scoreDocs[0].doc, "key");

    String highlighted = fragmentHighlighter.getBestFragment(termVectorStream, text);

    assertNotNull(highlighted);
    assertEquals(highlighted, highlightedText);
}

From source file:natural.language.qa.LuceneSearch.java

License:Apache License

/**
 * Runs a Lucene search over the index named in {@code index.properties}
 * and returns up to {@code maxRes} results, each carrying the best
 * highlighted fragment of the matching document's content.
 *
 * @param queryString raw user query; blank input yields an empty list
 * @param maxRes maximum number of results to collect
 * @return the matching documents as {@link LuceneSearchResult}s
 * @throws Exception if the index cannot be opened, the query cannot be
 *         parsed, or a matched document cannot be read
 */
public List<LuceneSearchResult> search(String queryString, int maxRes) throws Exception {
    List<LuceneSearchResult> results = new ArrayList<LuceneSearchResult>();

    // Reject blank queries before opening any index resources.
    queryString = queryString.trim();
    if (queryString.length() == 0) {
        return results;
    }

    IndexSearcher searcher = null;
    try {
        Properties indexConf = new Properties();
        // Properties.load does not close the stream, so close it ourselves
        // to avoid leaking the file handle.
        FileInputStream fis = new FileInputStream("index.properties");
        try {
            indexConf.load(fis);
        } finally {
            fis.close();
        }

        String index = indexConf.getProperty("index");
        String field = "contents";

        Directory indexDir = FSDirectory.open(new File(index));

        searcher = new IndexSearcher(indexDir);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

        QueryParser parser = new QueryParser(Version.LUCENE_31, field, analyzer);

        Query query = parser.parse(queryString);
        System.out.println("Searching for: " + query.toString(field));

        // ================================================
        // Highlighter setup: empty pre/post tags (no markup around matches),
        // 50-character fragments centered on the query terms.
        Formatter f = new SimpleHTMLFormatter("", "");
        Encoder e = new DefaultEncoder();
        QueryScorer fs = new QueryScorer(query);
        Fragmenter fragmenter = new SimpleSpanFragmenter(fs, 50); // new SentenceFragmenter();
        Highlighter h = new Highlighter(f, e, fs);
        h.setTextFragmenter(fragmenter);
        h.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

        // ================================================

        // Collect docs
        TopDocs res = searcher.search(query, maxRes);
        int numTotalHits = res.totalHits;
        ScoreDoc[] scoreDocs = res.scoreDocs;

        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String path = doc.get("path");
            String content = readDocument(path);
            String bestFragment = h.getBestFragment(analyzer, field, content);
            LuceneSearchResult hit = new LuceneSearchResult(scoreDoc.doc, path, bestFragment);
            results.add(hit);
        }
        System.out.println(numTotalHits + " total matching documents");
    } finally {
        if (searcher != null) {
            searcher.close();
        }
    }
    return results;
}

From source file:net.chwise.documents.HighlightedFragmentsRetriever.java

License:Open Source License

/**
 * Produces up to {@code fragmentNumber} fragments of {@code fieldContents}
 * with the query terms highlighted. Falls back to the leading
 * {@code fragmentSize} characters of the content when the highlighter
 * finds nothing to highlight.
 */
public String[] getFragmentsWithHighlightedTerms(Analyzer analyzer, Query query, String fieldName,
        String fieldContents, int fragmentNumber, int fragmentSize)
        throws IOException, InvalidTokenOffsetsException {

    QueryScorer queryScorer = new QueryScorer(query, fieldName);

    Highlighter highlighter = new Highlighter(queryScorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer, fragmentSize));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

    TokenStream tokenStream = TokenSources.getTokenStream(fieldName, fieldContents, analyzer);
    String[] fragments = highlighter.getBestFragments(tokenStream, fieldContents, fragmentNumber);

    if (fragments.length == 0) {
        // No match: return the starting piece of the field contents instead.
        int cutoff = Math.min(fragmentSize, fieldContents.length());
        fragments = new String[] { fieldContents.substring(0, cutoff) };
    }

    return fragments;
}

From source file:net.sourceforge.docfetcher.model.search.HighlightService.java

License:Open Source License

@MutableCopy
@NotNull
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> ranges = new ArrayList<Range>();

    /*
     * The formatter contract asks for formatted text, but we only need the
     * start/end offsets of the matched terms, so each callback records a
     * Range and returns null.
     */
    Formatter offsetCollector = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            int numTokens = tokenGroup.getNumTokens();
            for (int i = 0; i < numTokens; i++) {
                Token token = tokenGroup.getToken(i);
                if (tokenGroup.getScore(i) == 0)
                    continue;
                int startOffset = token.startOffset();
                ranges.add(new Range(startOffset, token.endOffset() - startOffset));
            }
            return null;
        }
    };

    String key = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(offsetCollector, new QueryScorer(query, key));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
        /*
         * The return value is deliberately ignored; the formatter above has
         * already captured the offsets. Might throw an OutOfMemoryError.
         */
        highlighter.getBestFragment(IndexRegistry.getAnalyzer(), key, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
        Util.printErr(e);
    }
    return ranges;
}

From source file:net.sourceforge.docfetcher.view.PreviewPanel.java

License:Open Source License

/**
 * Sets the file to be displayed, using <tt>parser</tt> to extract the
 * text from the file on the disk. This method does nothing if the given
 * file is null. The <tt>force</tt> parameter specifies whether the
 * preview should be updated even if neither the file nor the search terms
 * have changed in the meantime.
 */
private void setFile(final File file, final Parser parser, final Query query, boolean force) {
    // Remember the previous state so we can detect "nothing changed" below.
    File lastFile = this.file;
    Query lastQuery = this.query;
    this.file = file;
    this.parser = parser;
    this.query = query;

    // Check input
    if (file == null)
        return;
    if (parser == null) // Allowed to be null if file is null, too
        throw new IllegalArgumentException();
    if (!isActive)
        return;
    // Skip the refresh when the same file is shown with the same query,
    // unless the caller explicitly forces an update.
    if (file.equals(lastFile) && !force)
        if (query != null && query.equals(lastQuery))
            return;

    if (file.isDirectory())
        throw new IllegalStateException("File expected for preview, got directory instead."); //$NON-NLS-1$
    if (!file.exists()) {
        textViewer.setText(Msg.file_not_found.value());
        showViewer(textViewerContainer);
        return;
    }

    // Use the HTML browser
    if (file.getAbsolutePath().equals(Const.HELP_FILE) || Pref.Bool.PreviewHTML.getValue()) {
        final BrowserPanel browser = browserProvider.getBrowser(previewPanel, browserToolBar, parser);
        if (browser != null) {
            // Defer switching the visible viewer until the browser has
            // finished loading the file.
            browser.addProgressListener(new ProgressAdapter() {
                public void completed(ProgressEvent event) {
                    showViewer(browser);
                    upBt.setEnabled(false);
                    downBt.setEnabled(false);
                    occurrenceCounter.setText("0"); //$NON-NLS-1$
                }
            });
            browser.setFile(file);
            return;
        }
        // Browser creation failed, go on to next code block
    }

    // Use text renderers
    showViewer(textViewerContainer);

    // Use monospace font for text files
    if (parser instanceof TextParser) {
        org.eclipse.swt.graphics.Font monoFont = Font.PREVIEW_MONO.getFont();
        if (!textViewer.getFont().equals(monoFont))
            textViewer.setFont(monoFont);
    } else {
        org.eclipse.swt.graphics.Font previewFont = Font.PREVIEW.getFont();
        if (!textViewer.getFont().equals(previewFont))
            textViewer.setFont(previewFont);
    }

    textViewer.setText(Msg.loading.value()); // display loading message

    new Thread() { // run in a thread because parsing the file takes some time
        public void run() {
            // Extract the raw text from the file
            String text;
            boolean fileParsed = true;
            try {
                text = parser.renderText(file);
            } catch (ParseException e) {
                text = Msg.cant_read_file.format(e.getMessage());
                fileParsed = false;
            } catch (OutOfMemoryError e) {
                /*
                 * We can get here if the user sets a high java heap space
                 * value during indexing and then sets a lower value for
                 * search only usage.
                 */
                text = Msg.out_of_jvm_memory.value();
                fileParsed = false;
            }

            if (PreviewPanel.this.file != file)
                return; // Another preview request had been started while we were parsing

            /*
             * Create the message that will be displayed if the character limit
             * is reached. It is appended to the file contents later; if it
             * was appended here, some words in it might get highlighted.
             */
            int maxLength = Pref.Int.PreviewLimit.getValue();
            final String msg = "...\n\n\n[" //$NON-NLS-1$
                    + Msg.preview_limit_hint.format(new Object[] { maxLength, Pref.Int.PreviewLimit.name(),
                            Const.USER_PROPERTIES_FILENAME })
                    + "]"; //$NON-NLS-1$
            final boolean exceeded = text.length() > maxLength;
            if (text.length() > maxLength)
                text = text.substring(0, maxLength - msg.length());
            final String fText = text;

            /*
             * Create StyleRange ranges (i.e. start-end integer pairs) for
             * search term highlighting. Only tokenize preview text if we're
             * not displaying any info messages and if there are tokens to
             * highlight.
             */
            ranges = new int[0];
            if (fileParsed && query != null) {
                final List<Integer> rangesList = new ArrayList<Integer>();
                Analyzer analyzer = RootScope.analyzer;

                /*
                 * A formatter is supposed to return formatted text, but
                 * since we're only interested in the start and end offsets
                 * of the search terms, we return null and store the offsets
                 * in a list.
                 */
                Formatter nullFormatter = new Formatter() {
                    public String highlightTerm(String originalText, TokenGroup tokenGroup) {
                        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
                            Token token = tokenGroup.getToken(i);
                            if (tokenGroup.getScore(i) == 0)
                                continue;
                            int start = token.startOffset();
                            int end = token.endOffset();
                            rangesList.add(start);
                            rangesList.add(end - start);
                        }
                        return null;
                    }
                };

                // NOTE(review): Document.contents presumably names the indexed
                // content field used by the query scorer — confirm against the
                // indexing code.
                Highlighter highlighter = new Highlighter(nullFormatter,
                        new QueryScorer(query, Document.contents));
                highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
                highlighter.setTextFragmenter(new NullFragmenter());
                try {
                    /*
                     * This has a return value, but we ignore it since we
                     * only want the offsets.
                     */
                    highlighter.getBestFragment(analyzer, Document.contents, fText);
                } catch (Exception e) {
                    // We can do without the search term highlighting
                }

                // List to array (will be used by the method 'setHighlighting(..)')
                ranges = new int[rangesList.size()];
                for (int i = 0; i < ranges.length; i++)
                    ranges[i] = rangesList.get(i);
            }

            // Parsing and tokenizing done; display the results
            final boolean fFileParsed = fileParsed;
            // UI updates must happen on the SWT display thread.
            Display.getDefault().syncExec(new Runnable() {
                public void run() {
                    // Enable or disable up and down buttons
                    upBt.setEnabled(ranges.length != 0);
                    downBt.setEnabled(ranges.length != 0);

                    textViewer.setText(fText);
                    setHighlighting(fFileParsed && Pref.Bool.HighlightSearchTerms.getValue());
                    occurrenceCounter.setText(Integer.toString(ranges.length / 2));
                    if (exceeded)
                        textViewer.append(msg); // character limit exceeded, append hint
                }
            });
        }
    }.start();
}

From source file:net.sourceforge.vaticanfetcher.model.search.HighlightService.java

License:Open Source License

@MutableCopy
@NotNull
private static List<Range> highlight(@NotNull Query query, @NotNull String text)
        throws CheckedOutOfMemoryError {
    final List<Range> collected = new ArrayList<Range>();

    // The formatter normally returns marked-up text; here it serves purely
    // as a callback that records the start offset and length of every
    // scoring token. The formatted output is never used, hence null.
    Formatter rangeRecorder = new Formatter() {
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            for (int i = 0, n = tokenGroup.getNumTokens(); i < n; i++) {
                Token token = tokenGroup.getToken(i);
                if (tokenGroup.getScore(i) != 0) {
                    int from = token.startOffset();
                    int to = token.endOffset();
                    collected.add(new Range(from, to - from));
                }
            }
            return null;
        }
    };

    String contentKey = Fields.CONTENT.key();
    Highlighter highlighter = new Highlighter(rangeRecorder, new QueryScorer(query, contentKey));
    highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    highlighter.setTextFragmenter(new NullFragmenter());
    try {
        // The fragment result is intentionally discarded; only the offsets
        // recorded above matter. Tokenizing huge texts can exhaust the heap.
        highlighter.getBestFragment(IndexRegistry.analyzer, contentKey, text);
    } catch (OutOfMemoryError e) {
        throw new CheckedOutOfMemoryError(e);
    } catch (Exception e) {
        Util.printErr(e);
    }
    return collected;
}

From source file:org.apache.jena.query.text.TextIndexLucene.java

License:Apache License

/**
 * Converts the given score docs into {@link TextHit}s, wrapping each hit's
 * matching field value in the highlight markup described by
 * {@code highlight}.
 */
private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field,
        String highlight) throws IOException, InvalidTokenOffsetsException {
    HighlightOpts opts = new HighlightOpts(highlight);

    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end);
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize));

    List<TextHit> results = new ArrayList<>();
    for (ScoreDoc sd : sDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        log.trace("highlightResults[{}]: {}", sd.doc, doc);
        String entity = doc.get(docDef.getEntityField());

        String lexical = doc.get(field);
        Node literal = null;
        if (lexical != null) {
            // Re-highlight the stored value and turn it into a literal,
            // preserving the document's language tag.
            String docLang = doc.get(docDef.getLangField());
            TokenStream tokenStream = analyzer.tokenStream(field, lexical);
            TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags,
                    opts.maxFrags);
            literal = NodeFactory.createLiteral(frags2string(frags, opts), docLang);
        }

        String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null;
        Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null;
        Node subject = TextQueryFuncs.stringToNode(entity);

        results.add(new TextHit(subject, sd.score, literal, graph));
    }
    return results;
}

From source file:org.apache.maven.index.DefaultIteratorResultSet.java

License:Apache License

/**
 * Extracts up to {@code maxNumFragments} highlighted, positively scored
 * fragments of {@code text} for the given query. Highlighting is
 * best-effort: a token stream with inconsistent offsets yields the
 * fragments collected so far (possibly none) rather than an exception.
 *
 * @param query query whose terms are highlighted
 * @param formatter decorates matched terms in the output
 * @param tokenStream token stream over {@code text}
 * @param text the raw text being fragmented
 * @param maxNumFragments requested fragment count (clamped to at least 1)
 * @return the fragment strings, possibly empty
 * @throws IOException if the token stream cannot be read
 */
protected final List<String> getBestFragments(Query query, Formatter formatter, TokenStream tokenStream,
        String text, int maxNumFragments) throws IOException {
    Highlighter highlighter = new Highlighter(formatter, new CleaningEncoder(), new QueryScorer(query));

    highlighter.setTextFragmenter(new OneLineFragmenter());

    maxNumFragments = Math.max(1, maxNumFragments); // sanity check

    ArrayList<String> fragTexts = new ArrayList<>(maxNumFragments);
    try {
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, maxNumFragments);

        // Keep only fragments that actually matched the query.
        for (int i = 0; i < frag.length; i++) {
            if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                fragTexts.add(frag[i].toString());
            }
        }
    } catch (InvalidTokenOffsetsException e) {
        // Deliberately swallowed: highlighting is cosmetic, so broken
        // token offsets simply produce an empty fragment list.
    }

    return fragTexts;
}

From source file:org.apache.nutch.searcher.Summarizer.java

License:Apache License

/**
 * Returns a summary of {@code content} with query terms wrapped in the
 * configured CSS markup, falling back to a plain prefix of the content
 * when the query is absent, unparsable, or yields no fragment.
 */
public static String getsummary(String queryString, String content, Analyzer analyzer) {
    if (queryString == null && content != null) {
        return content.length() > SUM_LENGTH ? content.substring(0, (SUM_LENGTH) - 1) : content;
    }
    if (content == null) {
        // Covers both "query without content" and "neither present".
        return "";
    }

    SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(cssfront, cssend);

    org.apache.lucene.search.Query summarizerQuery;
    QueryParser queryParse = new QueryParser("content", analyzer);
    try {
        summarizerQuery = queryParse.parse(queryString);
    } catch (ParseException ex) {
        // Unparsable query: fall back to a plain truncated summary.
        return content.length() > SUM_LENGTH ? content.substring(0, (SUM_LENGTH) - 1) : content;
    }

    Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(summarizerQuery));
    highlighter.setTextFragmenter(new SimpleFragmenter(SUM_LENGTH));
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));

    String str;
    try {
        str = highlighter.getBestFragment(tokenStream, content);
    } catch (IOException e) {
        str = null;
    }
    if (str == null) {
        str = content.length() > SUM_LENGTH ? content.substring(0, (SUM_LENGTH) - 1) : content;
    }
    return str;
}