org.apache.nutch.summary.lucene.LuceneSummarizer.java Source code

Introduction

Here is the source code for org.apache.nutch.summary.lucene.LuceneSummarizer.java. The class implements Nutch's Summarizer interface on top of Lucene's Highlighter: it builds a query-biased summary from the best-scoring fragments of the page text and falls back to the leading tokens when no query term occurs in the text.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.summary.lucene;

// JDK imports
import java.io.StringReader;
import java.util.ArrayList; // only referenced by the commented-out title-matching code below

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.WeightedTerm;

// Nutch imports
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.searcher.Summary.Ellipsis;
import org.apache.nutch.searcher.Summary.Fragment;
import org.apache.nutch.searcher.Summary.Highlight;

/** Implements hit summarization. */
public class LuceneSummarizer implements Summarizer {

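    // The same marker is used as the pre- and post-highlight tag, so the formatted
    // fragments can later be split back into plain and highlighted parts.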
    private final static String SEPARATOR = "###";
    private final static Formatter FORMATTER = new SimpleHTMLFormatter(SEPARATOR, SEPARATOR);

    /** Converts text to tokens. */
    private Analyzer analyzer = null;
    private Configuration conf = null;

    public LuceneSummarizer() {
    }

    private LuceneSummarizer(Configuration conf) {
        setConf(conf);
    }

    /* ----------------------------- *
     * <implementation:Configurable> *
     * ----------------------------- */

    public Configuration getConf() {
        return conf;
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
        this.analyzer = new NutchDocumentAnalyzer(conf);
    }

    /* ------------------------------ *
     * </implementation:Configurable> *
     * ------------------------------ */

    /* --------------------------- *
     * <implementation:Summarizer> *
     * --------------------------- */

    public Summary getSummary(String text, Query query) {

        String[] terms = query.getTerms();
        WeightedTerm[] weighted = new WeightedTerm[terms.length];
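        // Give every query term the same weight (1.0); the QueryScorer uses these
        // WeightedTerms to score candidate fragments.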
        for (int i = 0; i < terms.length; i++) {
            weighted[i] = new WeightedTerm(1.0f, terms[i]);
        }
        Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
        TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
        Summary summary = new Summary();
        try {
            // TODO : The max number of fragments (3) should be configurable
            String[] result = highlighter.getBestFragments(tokens, text, 3);
            for (int i = 0; i < result.length; i++) {
                String[] parts = result[i].split(SEPARATOR);
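                // The formatter wrapped every matched term in SEPARATOR, so the split
                // parts alternate between plain text and highlighted terms.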
                boolean highlight = false;
                for (int j = 0; j < parts.length; j++) {
                    if (highlight) {
                        summary.add(new Highlight(parts[j]));
                    } else {
                        summary.add(new Fragment(parts[j]));
                    }
                    highlight = !highlight;
                }
                summary.add(new Ellipsis());
            }

            /* TODO MC  BUG resolved 0000029 - if the query terms do not occur in the text, an empty summary is returned. Now it returns the first tokens instead. */
            if (result == null || result.length == 0) {
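                // The token stream above was consumed by the highlighter, so a fresh
                // one is needed to read the leading tokens of the text.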
                tokens = analyzer.tokenStream("content", new StringReader(text));

                Token firstToken = null, lastToken = null;
                Token token = null;
                int maxLen = 100; // the same as defined in SimpleFragmenter but it is private

                /*
                ArrayList<Token> titleTokens = new ArrayList<Token>();
                ArrayList<Token> textTokens = new ArrayList<Token>();
                boolean titleMatched = false;
                boolean hasMatched = false; // exit match after match title the first time

                // remove title from text. compares pairs of text
                while ((titleMatched || !hasMatched) && (token = tokens.next()) != null) {

                    if (token.type().equals("<WORD>")) {

                        if (titleTokens.size() == 0) {
                            titleTokens.add(token);
                        } else if (textTokens.size() < titleTokens.size()) {
                            textTokens.add(token);
                        }

                        if (textTokens.size() == titleTokens.size()) {
                            // compare
                            titleMatched = true;
                            for (int i = 0; i < textTokens.size() && titleMatched; i++) {
                                if (!textTokens.get(i).termText().equals(titleTokens.get(i).termText())) {
                                    titleMatched = false;
                                }
                            }
                            if (titleMatched) { // try to match a larger pattern
                                titleTokens.add(textTokens.get(0));
                                textTokens.remove(0);
                                hasMatched = true;
                            } else { // remove rest of title from text
                                if (hasMatched) {
                                    firstToken = textTokens.get(titleTokens.size() - 2);
                                } else { // add one more token to title
                                    titleTokens.add(textTokens.get(0));
                                    textTokens.remove(0);
                                }
                            }
                        }
                    }
                }

                if (textTokens.size() == 0) {
                    return summary;
                }

                for (int i = 0; i < textTokens.size()
                        && textTokens.get(i).endOffset() - firstToken.startOffset() < maxLen; i++) {
                    lastToken = textTokens.get(i);
                }
                */

                // read tokens until maxLen
                while ((token = tokens.next()) != null) {
                    if (token.type().equals("<WORD>")) {
                        if (firstToken == null) {
                            firstToken = token;
                        } else if (token.endOffset() - firstToken.startOffset() < maxLen) {
                            lastToken = token;
                        } else {
                            break;
                        }
                    }
                }
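                // If the text contained only a single word token, use it as both the
                // first and the last token of the fallback fragment.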
                if (lastToken == null) {
                    lastToken = firstToken;
                }

                summary.add(new Fragment(text.substring(firstToken.startOffset(), lastToken.endOffset())));
                summary.add(new Ellipsis());
            }
            /* TODO MC */

        } catch (Exception e) {
            // Nothing to do...
        }
        return summary;
    }

    /* ---------------------------- *
     * </implementation:Summarizer> *
     * ---------------------------- */

}
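
Usage

Below is a minimal usage sketch; it is not part of the original file. It assumes the Nutch 1.x searcher API, in particular org.apache.nutch.util.NutchConfiguration.create() and Query.parse(String, Configuration); the demo class name, the sample text, and the query terms are made up for illustration.

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.summary.lucene.LuceneSummarizer;
import org.apache.nutch.util.NutchConfiguration;

/** Hypothetical demo of LuceneSummarizer; assumes the Nutch 1.x searcher API. */
public class LuceneSummarizerDemo {

    public static void main(String[] args) throws Exception {
        // Load the default Nutch configuration (assumed helper from org.apache.nutch.util).
        Configuration conf = NutchConfiguration.create();

        // The summarizer is Configurable: setConf() also creates the NutchDocumentAnalyzer.
        LuceneSummarizer summarizer = new LuceneSummarizer();
        summarizer.setConf(conf);

        // Placeholder document text and query terms.
        String text = "Apache Nutch is a highly extensible and scalable open source "
                + "web crawler software project built on Apache Hadoop.";
        Query query = Query.parse("nutch crawler", conf);

        // The returned Summary is a list of Fragment, Highlight and Ellipsis parts;
        // its toString() renders them as plain text.
        Summary summary = summarizer.getSummary(text, query);
        System.out.println(summary);
    }
}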