net.sf.jtmt.summarizers.LuceneSummarizer.java Source code

Introduction

Here is the source code for net.sf.jtmt.summarizers.LuceneSummarizer.java. The class implements a simple extractive summarizer on top of Lucene: it indexes each sentence of the input text as its own document in an in-memory index, boosts sentences that open a paragraph, builds a query from the text's most frequent terms, and returns the top-scoring sentences joined by ellipses.

Source

/*
 * Copyright 2012 Nabeel Mukhtar 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 * 
 */
package net.sf.jtmt.summarizers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import net.sf.jtmt.clustering.ByValueComparator;
import net.sf.jtmt.tokenizers.ParagraphTokenizer;
import net.sf.jtmt.tokenizers.SentenceTokenizer;

import org.apache.commons.collections15.comparators.ReverseComparator;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Summarizes a body of text by indexing each sentence as a document in an
 * in-memory Lucene index, building a query from the text's most frequent
 * terms, and returning the top-scoring sentences in their original order.
 */
public class LuceneSummarizer {

    /** The analyzer. */
    private Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

    /** The number of sentences to return in the summary. */
    private int numSentences = 2;

    /** The top term cutoff: the minimum ratio of a term's document frequency
     *  to the most frequent term's frequency for the term to enter the query. */
    private float topTermCutoff;

    // These two values implement a simple linear deboost. If a different
    // algorithm is desired, they will likely no longer be required.
    /** The per-position deboost applied to later sentences in a paragraph. */
    private float sentenceDeboost;

    /** The floor below which a sentence's boost is never reduced. */
    private float sentenceDeboostBase = 0.5F;

    /** The paragraph tokenizer. */
    private ParagraphTokenizer paragraphTokenizer;

    /** The sentence tokenizer. */
    private SentenceTokenizer sentenceTokenizer;

    /**
     * Sets the analyzer.
     *
     * @param analyzer the new analyzer
     */
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Sets the number of sentences to return in the summary.
     *
     * @param numSentences the new number of sentences
     */
    public void setNumSentences(int numSentences) {
        this.numSentences = numSentences;
    }

    /**
     * Sets the top term cutoff: terms whose document frequency falls below
     * this fraction of the most frequent term's frequency are excluded from
     * the query.
     *
     * @param topTermCutoff the new top term cutoff, between 0.0F and 1.0F
     */
    public void setTopTermCutoff(float topTermCutoff) {
        if (topTermCutoff < 0.0F || topTermCutoff > 1.0F) {
            throw new IllegalArgumentException("topTermCutoff must be between 0.0F and 1.0F");
        }
        this.topTermCutoff = topTermCutoff;
    }

    /**
     * Sets the sentence deboost, the per-position penalty applied to later
     * sentences within a paragraph.
     *
     * @param sentenceDeboost the new sentence deboost, between 0.0F and 1.0F
     */
    public void setSentenceDeboost(float sentenceDeboost) {
        if (sentenceDeboost < 0.0F || sentenceDeboost > 1.0F) {
            throw new IllegalArgumentException("sentenceDeboost must be between 0.0F and 1.0F");
        }
        this.sentenceDeboost = sentenceDeboost;
    }

    /**
     * Sets the sentence deboost base, the floor below which a sentence's
     * boost is never reduced.
     *
     * @param sentenceDeboostBase the new sentence deboost base, between 0.0F and 1.0F
     */
    public void setSentenceDeboostBase(float sentenceDeboostBase) {
        if (sentenceDeboostBase < 0.0F || sentenceDeboostBase > 1.0F) {
            throw new IllegalArgumentException("sentenceDeboostBase must be between 0.0F and 1.0F");
        }
        this.sentenceDeboostBase = sentenceDeboostBase;
    }

    /**
     * Initializes the paragraph and sentence tokenizers. Must be called
     * before summarize().
     *
     * @throws Exception if the tokenizers cannot be instantiated
     */
    public void init() throws Exception {
        this.paragraphTokenizer = new ParagraphTokenizer();
        this.sentenceTokenizer = new SentenceTokenizer();
    }

    /**
     * Summarizes the text into the configured number of sentences.
     *
     * @param text the text to summarize
     * @return the summary, with the selected sentences joined by " ... "
     * @throws Exception if indexing or searching fails
     */
    public String summarize(String text) throws Exception {
        RAMDirectory ramdir = new RAMDirectory();
        buildIndex(ramdir, text);
        Query topTermQuery = computeTopTermQuery(ramdir);
        String[] sentences = searchIndex(ramdir, topTermQuery);
        return StringUtils.join(sentences, " ... ");
    }

    /**
     * Builds the sentence index: each sentence of the text becomes one
     * document in the given directory, boosted by its paragraph and sentence
     * position.
     *
     * @param ramdir the directory to write the index into
     * @param text the text to index
     * @throws Exception the exception
     */
    private void buildIndex(Directory ramdir, String text) throws Exception {
        if (paragraphTokenizer == null || sentenceTokenizer == null) {
            throw new IllegalStateException("Please call init() to instantiate tokenizers");
        }
        IndexWriter writer = new IndexWriter(ramdir, analyzer, MaxFieldLength.UNLIMITED);
        paragraphTokenizer.setText(text);
        String paragraph = null;
        int pno = 0;
        while ((paragraph = paragraphTokenizer.nextParagraph()) != null) {
            sentenceTokenizer.setText(paragraph);
            String sentence = null;
            int sno = 0;
            while ((sentence = sentenceTokenizer.nextSentence()) != null) {
                Document doc = new Document();
                doc.add(new Field("text", sentence, Store.YES, Index.ANALYZED));
                doc.setBoost(computeDeboost(pno, sno));
                writer.addDocument(doc);
                sno++;
            }
            pno++;
        }
        writer.commit();
        writer.close();
    }
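
    // Note on the index layout: each sentence is stored as its own
    // single-field document, so a search hit corresponds to one sentence.
    // The per-document boost from computeDeboost() below favors the first
    // paragraph and each paragraph's opening sentence.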

    /**
     * Computes the boost for a sentence from its position. The first
     * paragraph and the opening sentence of every paragraph keep the full
     * boost of 1.0; later sentences are linearly deboosted, but never below
     * sentenceDeboostBase.
     *
     * @param paragraphNumber the zero-based paragraph number
     * @param sentenceNumber the zero-based sentence number within the paragraph
     * @return the boost to set on the sentence's document
     */
    private float computeDeboost(int paragraphNumber, int sentenceNumber) {
        if (paragraphNumber > 0 && sentenceNumber > 0) {
            float deboost = 1.0F - (sentenceNumber * sentenceDeboost);
            return Math.max(deboost, sentenceDeboostBase);
        }
        return 1.0F;
    }
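
    // Worked example for computeDeboost() above (illustrative values, not
    // from the original source): with sentenceDeboost = 0.2F and the default
    // sentenceDeboostBase = 0.5F, sentences in any paragraph after the first
    // are boosted as follows:
    //   sentence 0: 1.0 (the opening sentence always keeps full boost)
    //   sentence 1: 1.0 - 1 * 0.2 = 0.8
    //   sentence 2: 1.0 - 2 * 0.2 = 0.6
    //   sentence 3: 1.0 - 3 * 0.2 = 0.4, floored to the base of 0.5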

    /**
     * Computes a query from the top terms in the index, i.e. the terms whose
     * document frequency is at least topTermCutoff times that of the most
     * frequent term.
     *
     * @param ramdir the directory containing the sentence index
     * @return a BooleanQuery ORing the top terms in the "text" field
     * @throws Exception the exception
     */
    private Query computeTopTermQuery(Directory ramdir) throws Exception {
        final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
        List<String> termlist = new ArrayList<String>();
        IndexReader reader = IndexReader.open(ramdir, true);
        TermEnum terms = reader.terms();
        while (terms.next()) {
            Term term = terms.term();
            String termText = term.text();
            int frequency = reader.docFreq(term);
            frequencyMap.put(termText, frequency);
            termlist.add(termText);
        }
        reader.close();
        // sort the term list by frequency, descending
        Collections.sort(termlist,
                new ReverseComparator<String>(new ByValueComparator<String, Integer>(frequencyMap)));
        // retrieve the top terms based on topTermCutoff
        List<String> topTerms = new ArrayList<String>();
        float topFreq = -1.0F;
        for (String term : termlist) {
            if (topFreq < 0.0F) {
                // first term, capture the value
                topFreq = (float) frequencyMap.get(term);
                topTerms.add(term);
            } else {
                // not the first term, compute the ratio and discard if below
                // topTermCutoff score
                float ratio = frequencyMap.get(term) / topFreq;
                if (ratio >= topTermCutoff) {
                    topTerms.add(term);
                } else {
                    break;
                }
            }
        }
        StringBuilder termBuf = new StringBuilder();
        BooleanQuery q = new BooleanQuery();
        for (String topTerm : topTerms) {
            termBuf.append(topTerm).append("(").append(frequencyMap.get(topTerm)).append(");");
            q.add(new TermQuery(new Term("text", topTerm)), Occur.SHOULD);
        }
        System.out.println(">>> top terms: " + termBuf.toString());
        System.out.println(">>> query: " + q.toString());
        return q;
    }
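
    // Worked example for computeTopTermQuery() above (hypothetical term
    // frequencies): if the index holds the terms {lucene: 4, index: 3,
    // boost: 1} and topTermCutoff = 0.5F, then topFreq = 4; "index" is kept
    // (3/4 = 0.75 >= 0.5), "boost" is dropped (1/4 = 0.25 < 0.5), and the
    // resulting query is "text:lucene text:index".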

    /**
     * Searches the sentence index with the top-term query and returns the
     * best sentences in their original order of appearance.
     *
     * @param ramdir the directory containing the sentence index
     * @param query the top-term query
     * @return the selected sentences, ordered by position in the text
     * @throws Exception the exception
     */
    private String[] searchIndex(Directory ramdir, Query query) throws Exception {
        SortedMap<Integer, String> sentenceMap = new TreeMap<Integer, String>();
        IndexSearcher searcher = new IndexSearcher(ramdir, true);
        TopDocs topDocs = searcher.search(query, numSentences);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            int docId = scoreDoc.doc;
            Document doc = searcher.doc(docId);
            // key by docId so the TreeMap restores original sentence order
            sentenceMap.put(docId, StringUtils.chomp(doc.get("text")));
        }
        searcher.close();
        return sentenceMap.values().toArray(new String[0]);
    }
}
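
Usage

Below is a minimal sketch of how the summarizer might be driven. The parameter values and the input text are illustrative assumptions, not taken from the original source, and it assumes the JTMT tokenizer classes used above are on the classpath.

public class LuceneSummarizerExample {

    public static void main(String[] args) throws Exception {
        // illustrative multi-paragraph input; any text with paragraph breaks will do
        String text = "Lucene is a Java search library. It builds an inverted index of terms.\n\n"
                + "This summarizer reuses that index to rank sentences. "
                + "Sentences containing the most frequent terms score highest.";
        LuceneSummarizer summarizer = new LuceneSummarizer();
        summarizer.setNumSentences(2);       // number of sentences in the summary
        summarizer.setTopTermCutoff(0.5F);   // keep terms at >= half the top frequency
        summarizer.setSentenceDeboost(0.2F); // linear penalty for later sentences
        summarizer.init();                   // instantiate the tokenizers
        System.out.println(summarizer.summarize(text));
    }
}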