Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.summary.lucene;

// JDK imports
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.WeightedTerm;

// Nutch imports
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.searcher.Summary.Ellipsis;
import org.apache.nutch.searcher.Summary.Fragment;
import org.apache.nutch.searcher.Summary.Highlight;

/**
 * Implements hit summarization.
 *
 * <p>The text is tokenized with a {@link NutchDocumentAnalyzer} and the best
 * fragments are selected by the Lucene {@link Highlighter}. Matched terms are
 * wrapped in {@link #SEPARATOR} markers by the formatter and then split back
 * into alternating {@link Fragment} / {@link Highlight} parts.</p>
 *
 * <p>{@link #setConf(Configuration)} must be called before
 * {@link #getSummary(String, Query)}; until then no analyzer is available.</p>
 */
public class LuceneSummarizer implements Summarizer {

  /** Marker the formatter puts before and after every highlighted term. */
  private final static String SEPARATOR = "###";

  private final static Formatter FORMATTER =
    new SimpleHTMLFormatter(SEPARATOR, SEPARATOR);

  /** Name of the field passed to the analyzer when tokenizing the text. */
  private final static String CONTENT_FIELD = "content";

  /** Token type that is taken into account by the fallback summary. */
  private final static String WORD_TOKEN_TYPE = "<WORD>";

  /** Maximum number of fragments requested from the highlighter. */
  // TODO : The max number of fragments (3) should be configurable
  private final static int MAX_FRAGMENTS = 3;

  /**
   * Maximum length, in characters, of the fallback summary
   * (the same as defined in SimpleFragmenter but it is private).
   */
  private final static int FALLBACK_MAX_LENGTH = 100;

  /** Converts text to tokens. */
  private Analyzer analyzer = null;
  private Configuration conf = null;

  public LuceneSummarizer() { }

  private LuceneSummarizer(Configuration conf) {
    setConf(conf);
  }


  /* ----------------------------- *
   * <implementation:Configurable> *
   * ----------------------------- */

  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration and creates the analyzer used to tokenize text.
   *
   * @param conf the configuration handed to the {@link NutchDocumentAnalyzer}
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.analyzer = new NutchDocumentAnalyzer(conf);
  }

  /* ------------------------------ *
   * </implementation:Configurable> *
   * ------------------------------ */


  /* --------------------------- *
   * <implementation:Summarizer> *
   * --------------------------- */

  /**
   * Builds a summary of <code>text</code> for the given query.
   *
   * <p>Every query term is given the same weight (1.0). When the highlighter
   * returns no fragment, the summary falls back to the leading words of the
   * text (BUG 0000029: previously an empty summary was returned).</p>
   *
   * <p>Summarization is best effort: any exception raised while highlighting
   * or tokenizing is swallowed and whatever has been collected so far is
   * returned.</p>
   *
   * @param text the text to summarize
   * @param query the query whose terms are highlighted
   * @return the summary, possibly empty, never <code>null</code>
   */
  public Summary getSummary(String text, Query query) {

    String[] terms = query.getTerms();
    WeightedTerm[] weighted = new WeightedTerm[terms.length];
    for (int i = 0; i < terms.length; i++) {
      weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter =
      new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens =
      analyzer.tokenStream(CONTENT_FIELD, new StringReader(text));
    Summary summary = new Summary();
    try {
      String[] result = highlighter.getBestFragments(tokens, text, MAX_FRAGMENTS);
      // The null/empty test has to come before the array is walked,
      // otherwise the fallback can never be reached for a null result.
      if (result == null || result.length == 0) {
        addLeadingWords(summary, text);
      } else {
        for (int i = 0; i < result.length; i++) {
          addHighlightedFragment(summary, result[i]);
        }
      }
    } catch (Exception ignored) {
      // Deliberately ignored: a missing or partial summary must not make
      // the caller fail, so what was collected so far is returned.
    }
    return summary;
  }

  /**
   * Adds one highlighter fragment to the summary, followed by an ellipsis.
   * The parts between separators alternate between plain text (first) and
   * highlighted text.
   */
  private void addHighlightedFragment(Summary summary, String fragment) {
    String[] parts = fragment.split(SEPARATOR);
    boolean highlight = false;
    for (int j = 0; j < parts.length; j++) {
      if (highlight) {
        summary.add(new Highlight(parts[j]));
      } else {
        summary.add(new Fragment(parts[j]));
      }
      highlight = !highlight;
    }
    summary.add(new Ellipsis());
  }

  /**
   * Adds the leading words of the text to the summary, followed by an
   * ellipsis. Tokens are read until the span from the first word reaches
   * {@link #FALLBACK_MAX_LENGTH} characters. Nothing is added when the text
   * contains no word token.
   */
  private void addLeadingWords(Summary summary, String text)
    throws IOException {

    // The stream given to the highlighter has been consumed: tokenize again.
    TokenStream tokens =
      analyzer.tokenStream(CONTENT_FIELD, new StringReader(text));
    try {
      Token firstToken = null;
      Token lastToken = null;
      Token token = null;
      // read tokens until the maximum length
      while ((token = tokens.next()) != null) {
        if (!WORD_TOKEN_TYPE.equals(token.type())) {
          continue;
        }
        if (firstToken == null) {
          firstToken = token;
        } else if (token.endOffset() - firstToken.startOffset()
                   < FALLBACK_MAX_LENGTH) {
          lastToken = token;
        } else {
          break;
        }
      }
      if (firstToken == null) {
        // No word at all (e.g. empty text): leave the summary untouched.
        return;
      }
      if (lastToken == null) {
        lastToken = firstToken;
      }
      summary.add(new Fragment(
        text.substring(firstToken.startOffset(), lastToken.endOffset())));
      summary.add(new Ellipsis());
    } finally {
      tokens.close();
    }
  }

  /* ---------------------------- *
   * </implementation:Summarizer> *
   * ---------------------------- */

}