Java tutorial
/*
 * Copyright 2012 Nabeel Mukhtar
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package net.sf.jtmt.summarizers;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import net.sf.jtmt.tokenizers.lucene.NumericTokenFilter;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Lucene {@link Analyzer} used by the summarizer.
 *
 * <p>The token stream it produces chains, in order: {@link StandardTokenizer},
 * {@link StandardFilter}, {@link NumericTokenFilter}, {@link LowerCaseFilter},
 * a {@link StopFilter} backed by the stopword list loaded at construction
 * time, and finally {@link PorterStemFilter}.
 */
public class SummaryAnalyzer extends Analyzer {

  /** Classpath location of the stopword list. */
  private static final String STOPWORDS_RESOURCE = "/resources/jtmt/stopwords.txt";

  /** Character encoding used to decode the stopword list. */
  private static final String STOPWORDS_ENCODING = "UTF-8";

  /** Stopword set handed to every {@link StopFilter} this analyzer creates. */
  private final Set<Object> stopset;

  /**
   * Instantiates a new summary analyzer, loading the stopword list from the
   * classpath resource {@code /resources/jtmt/stopwords.txt}.
   *
   * @throws IOException if the stopword resource cannot be found or read
   */
  public SummaryAnalyzer() throws IOException {
    String[] stopwords = filterComments(readStopwordTokens());
    // Second argument is Lucene's ignoreCase flag.
    this.stopset = StopFilter.makeStopSet(stopwords, true);
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream tokens = new StandardTokenizer(reader);
    tokens = new StandardFilter(tokens);
    tokens = new NumericTokenFilter(tokens);
    tokens = new LowerCaseFilter(tokens);
    // enable_position_increment_default == false, for backward compat
    tokens = new StopFilter(false, tokens, stopset);
    return new PorterStemFilter(tokens);
  }

  /**
   * Reads the stopword resource and splits it into whitespace-separated tokens.
   *
   * <p>The resource is opened as a stream rather than as a {@link File}, so it
   * is also found when the classes are packaged in a JAR or when the install
   * path contains characters that are escaped in URLs. Lines whose first
   * non-blank character is {@code #} are skipped as a whole, so the words of a
   * comment are not mistaken for stopwords.
   *
   * @return the tokens of all non-comment lines, in file order
   * @throws IOException if the resource is missing or cannot be read
   */
  private String[] readStopwordTokens() throws IOException {
    InputStream stream = getClass().getResourceAsStream(STOPWORDS_RESOURCE);
    if (stream == null) {
      throw new FileNotFoundException(
          "Stopword resource not found on classpath: " + STOPWORDS_RESOURCE);
    }
    List<String> tokens = new ArrayList<String>();
    try {
      BufferedReader lines =
          new BufferedReader(new InputStreamReader(stream, STOPWORDS_ENCODING));
      String line;
      while ((line = lines.readLine()) != null) {
        if (line.trim().startsWith("#")) {
          continue; // whole-line comment
        }
        for (String token : StringUtils.split(line)) {
          tokens.add(token);
        }
      }
    } finally {
      stream.close();
    }
    return tokens.toArray(new String[tokens.size()]);
  }

  /**
   * Filter comments: drops every entry that starts with {@code #}.
   *
   * @param input the candidate stopwords
   * @return the entries of {@code input} that do not start with {@code #},
   *         in their original order
   */
  private String[] filterComments(String[] input) {
    List<String> stopwords = new ArrayList<String>();
    for (String stopword : input) {
      if (!stopword.startsWith("#")) {
        stopwords.add(stopword);
      }
    }
    return stopwords.toArray(new String[stopwords.size()]);
  }
}