Java tutorial: extractive text summarization with Lucene
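This tutorial walks through LuceneSummarizer, a small extractive summarizer. The idea, all visible in the listing below: split the input text into paragraphs and sentences, index each sentence as its own document in an in-memory Lucene index (boosting sentences that appear early), build a disjunction query from the most frequent terms in the index, and return the top-scoring sentences, joined by " ... ", as the summary.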
/*
 * Copyright 2012 Nabeel Mukhtar
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.sf.jtmt.summarizers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import net.sf.jtmt.clustering.ByValueComparator;
import net.sf.jtmt.tokenizers.ParagraphTokenizer;
import net.sf.jtmt.tokenizers.SentenceTokenizer;

import org.apache.commons.collections15.comparators.ReverseComparator;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Summarizes a block of text by indexing each sentence as a Lucene document
 * in an in-memory index, then returning the sentences that score highest
 * against a query built from the most frequent terms in the text.
 */
public class LuceneSummarizer {

  /** The analyzer used to tokenize sentence text for indexing. */
  private Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

  /** The number of sentences to return in the summary. */
  private int numSentences = 2;

  /** Terms whose frequency ratio to the top term falls below this cutoff are discarded. */
  private float topTermCutoff;

  // These two values implement a simple linear deboost. If a different
  // algorithm is desired, these variables are likely no longer required.
  /** The per-sentence deboost applied within paragraphs after the first. */
  private float sentenceDeboost;

  /** The floor below which a sentence's boost is never reduced. */
  private float sentenceDeboostBase = 0.5F;

  /** The paragraph tokenizer. */
  private ParagraphTokenizer paragraphTokenizer;

  /** The sentence tokenizer. */
  private SentenceTokenizer sentenceTokenizer;

  /**
   * Sets the analyzer.
   *
   * @param analyzer the new analyzer
   */
  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  /**
   * Sets the number of sentences to return in the summary.
   *
   * @param numSentences the new number of sentences
   */
  public void setNumSentences(int numSentences) {
    this.numSentences = numSentences;
  }

  /**
   * Sets the top term cutoff.
   *
   * @param topTermCutoff the new top term cutoff, between 0.0F and 1.0F
   */
  public void setTopTermCutoff(float topTermCutoff) {
    if (topTermCutoff < 0.0F || topTermCutoff > 1.0F) {
      throw new IllegalArgumentException(
        "Invalid value: 0.0F <= topTermCutoff <= 1.0F");
    }
    this.topTermCutoff = topTermCutoff;
  }
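  // Example (illustrative value, not from the original): with
  // topTermCutoff = 0.5F, computeTopTermQuery() below keeps a term only
  // while docFreq(term) / docFreq(topTerm) >= 0.5, i.e. terms that occur
  // in at least half as many sentences as the most frequent term.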
  /**
   * Sets the sentence deboost.
   *
   * @param sentenceDeboost the new sentence deboost, between 0.0F and 1.0F
   */
  public void setSentenceDeboost(float sentenceDeboost) {
    if (sentenceDeboost < 0.0F || sentenceDeboost > 1.0F) {
      throw new IllegalArgumentException(
        "Invalid value: 0.0F <= sentenceDeboost <= 1.0F");
    }
    this.sentenceDeboost = sentenceDeboost;
  }

  /**
   * Sets the sentence deboost base.
   *
   * @param sentenceDeboostBase the new sentence deboost base, between 0.0F and 1.0F
   */
  public void setSentenceDeboostBase(float sentenceDeboostBase) {
    if (sentenceDeboostBase < 0.0F || sentenceDeboostBase > 1.0F) {
      throw new IllegalArgumentException(
        "Invalid value: 0.0F <= sentenceDeboostBase <= 1.0F");
    }
    this.sentenceDeboostBase = sentenceDeboostBase;
  }

  /**
   * Initializes the paragraph and sentence tokenizers. Must be called
   * before summarize().
   *
   * @throws Exception the exception
   */
  public void init() throws Exception {
    this.paragraphTokenizer = new ParagraphTokenizer();
    this.sentenceTokenizer = new SentenceTokenizer();
  }

  /**
   * Summarizes the given text: builds an in-memory index of its sentences,
   * computes a query from the top terms, and joins the best-scoring
   * sentences with " ... ".
   *
   * @param text the text to summarize
   * @return the summary string
   * @throws Exception the exception
   */
  public String summarize(String text) throws Exception {
    RAMDirectory ramdir = new RAMDirectory();
    buildIndex(ramdir, text);
    Query topTermQuery = computeTopTermQuery(ramdir);
    String[] sentences = searchIndex(ramdir, topTermQuery);
    return StringUtils.join(sentences, " ... ");
  }

  /**
   * Builds the index, adding one document per sentence and boosting each
   * document according to its paragraph and sentence position.
   *
   * @param ramdir the directory to index into
   * @param text the text to index
   * @throws Exception the exception
   */
  private void buildIndex(Directory ramdir, String text) throws Exception {
    if (paragraphTokenizer == null || sentenceTokenizer == null) {
      throw new IllegalArgumentException(
        "Please call init() to instantiate tokenizers");
    }
    IndexWriter writer = new IndexWriter(ramdir, analyzer,
      MaxFieldLength.UNLIMITED);
    paragraphTokenizer.setText(text);
    String paragraph = null;
    int pno = 0;
    while ((paragraph = paragraphTokenizer.nextParagraph()) != null) {
      sentenceTokenizer.setText(paragraph);
      String sentence = null;
      int sno = 0;
      while ((sentence = sentenceTokenizer.nextSentence()) != null) {
        Document doc = new Document();
        doc.add(new Field("text", sentence, Store.YES, Index.ANALYZED));
        doc.setBoost(computeDeboost(pno, sno));
        writer.addDocument(doc);
        sno++;
      }
      pno++;
    }
    writer.commit();
    writer.close();
  }

  /**
   * Computes the deboost: sentences after the first, in paragraphs after
   * the first, are deboosted linearly by position, but never below
   * sentenceDeboostBase.
   *
   * @param paragraphNumber the zero-based paragraph number
   * @param sentenceNumber the zero-based sentence number
   * @return the boost to apply to the sentence's document
   */
  private float computeDeboost(int paragraphNumber, int sentenceNumber) {
    if (paragraphNumber > 0) {
      if (sentenceNumber > 0) {
        float deboost = 1.0F - (sentenceNumber * sentenceDeboost);
        deboost = (deboost < sentenceDeboostBase) ?
          sentenceDeboostBase : deboost;
        return deboost;
      }
    }
    return 1.0F;
  }
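  // Worked example (illustrative values, not from the original): with
  // sentenceDeboost = 0.2F and the default sentenceDeboostBase = 0.5F,
  // sentence index 3 of any paragraph after the first gets boost
  // 1.0 - 3 * 0.2 = 0.4, which is clamped up to the base, 0.5. All
  // sentences in the first paragraph, and the first sentence of every
  // paragraph, keep the full boost of 1.0.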
  /**
   * Computes a disjunction query over the top terms in the index, where
   * "top" is defined by document frequency and the topTermCutoff ratio.
   *
   * @param ramdir the directory to read term frequencies from
   * @return the query
   * @throws Exception the exception
   */
  private Query computeTopTermQuery(Directory ramdir) throws Exception {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();
    IndexReader reader = IndexReader.open(ramdir, true);
    TermEnum terms = reader.terms();
    while (terms.next()) {
      Term term = terms.term();
      String termText = term.text();
      int frequency = reader.docFreq(term);
      frequencyMap.put(termText, frequency);
      termlist.add(termText);
    }
    reader.close();
    // sort the term list by frequency, descending
    Collections.sort(termlist, new ReverseComparator<String>(
      new ByValueComparator<String, Integer>(frequencyMap)));
    // retrieve the top terms based on topTermCutoff
    List<String> topTerms = new ArrayList<String>();
    float topFreq = -1.0F;
    for (String term : termlist) {
      if (topFreq < 0.0F) {
        // first term: capture its frequency as the reference value
        topFreq = (float) frequencyMap.get(term);
        topTerms.add(term);
      } else {
        // subsequent terms: keep while the frequency ratio stays at or
        // above topTermCutoff; the list is sorted, so we can stop early
        float ratio = (float) frequencyMap.get(term) / topFreq;
        if (ratio >= topTermCutoff) {
          topTerms.add(term);
        } else {
          break;
        }
      }
    }
    StringBuilder termBuf = new StringBuilder();
    BooleanQuery q = new BooleanQuery();
    for (String topTerm : topTerms) {
      termBuf.append(topTerm)
        .append("(")
        .append(frequencyMap.get(topTerm))
        .append(");");
      q.add(new TermQuery(new Term("text", topTerm)), Occur.SHOULD);
    }
    System.out.println(">>> top terms: " + termBuf.toString());
    System.out.println(">>> query: " + q.toString());
    return q;
  }

  /**
   * Searches the index with the top-term query and returns the top
   * numSentences sentences, restored to their original order.
   *
   * @param ramdir the directory to search
   * @param query the top-term query
   * @return the matching sentences
   * @throws Exception the exception
   */
  private String[] searchIndex(Directory ramdir, Query query)
      throws Exception {
    // a TreeMap keyed by document id keeps the selected sentences in the
    // order they appeared in the original text
    SortedMap<Integer, String> sentenceMap = new TreeMap<Integer, String>();
    IndexSearcher searcher = new IndexSearcher(ramdir, true);
    TopDocs topDocs = searcher.search(query, numSentences);
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
      int docId = scoreDoc.doc;
      Document doc = searcher.doc(docId);
      sentenceMap.put(scoreDoc.doc, StringUtils.chomp(doc.get("text")));
    }
    searcher.close();
    return sentenceMap.values().toArray(new String[0]);
  }
}
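To round out the tutorial, here is a minimal sketch of a driver for the class. It is not part of the original listing: the class name LuceneSummarizerExample and the parameter values are illustrative, and it assumes the JTMT tokenizers and the Lucene-era jars referenced by the imports above are on the classpath.

import net.sf.jtmt.summarizers.LuceneSummarizer;

// Hypothetical driver, not part of the original class.
public class LuceneSummarizerExample {
  public static void main(String[] args) throws Exception {
    LuceneSummarizer summarizer = new LuceneSummarizer();
    summarizer.setNumSentences(2);       // return a two-sentence summary
    summarizer.setTopTermCutoff(0.5F);   // keep terms at >= 50% of top frequency
    summarizer.setSentenceDeboost(0.2F); // linearly deboost later sentences
    summarizer.init();                   // instantiate the tokenizers
    String text = "...";                 // the document to summarize
    System.out.println(summarizer.summarize(text));
  }
}

Note that summarize() builds a fresh RAMDirectory per call, so one configured instance can summarize many documents in sequence; only init() needs to run once.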