net.sf.jtmt.summarizers.SummaryAnalyzer.java Source code

Introduction

Here is the source code for net.sf.jtmt.summarizers.SummaryAnalyzer.java
Source

/*
 * Copyright 2012 Nabeel Mukhtar 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 * 
 */
package net.sf.jtmt.summarizers;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import net.sf.jtmt.tokenizers.lucene.NumericTokenFilter;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Lucene {@link Analyzer} that builds the token pipeline used when
 * summarizing text: standard tokenization, numeric-token filtering,
 * lower-casing, stop word removal and Porter stemming.
 *
 * <p>The stop word list is read once, at construction time, from the
 * classpath resource {@value #STOPWORDS_RESOURCE}.</p>
 */
public class SummaryAnalyzer extends Analyzer {

    /** Classpath location of the stop word list. */
    private static final String STOPWORDS_RESOURCE = "/resources/jtmt/stopwords.txt";

    /** Character encoding of the stop word list. */
    private static final String STOPWORDS_ENCODING = "UTF-8";

    /** Marker that starts a comment in the stop word list. */
    private static final String COMMENT_PREFIX = "#";

    /** The stop word set handed to every {@link StopFilter} this analyzer creates. */
    private final Set<Object> stopset;

    /**
     * Instantiates a new summary analyzer, loading the stop word list from
     * the classpath.
     *
     * @throws IOException if the stop word resource is missing or cannot be read.
     */
    public SummaryAnalyzer() throws IOException {
        String[] stopwords = loadStopwords();
        // second argument is makeStopSet's ignoreCase flag
        this.stopset = StopFilter.makeStopSet(stopwords, true);
    }

    /**
     * Builds the analysis chain for the given reader. From innermost to
     * outermost: {@link StandardTokenizer}, {@link StandardFilter},
     * {@link NumericTokenFilter} (project filter; presumably drops numeric
     * tokens -- confirm against its source), {@link LowerCaseFilter},
     * {@link StopFilter} using this analyzer's stop set, and
     * {@link PorterStemFilter}.
     *
     * @param fieldName the field being analyzed (not used by this implementation)
     * @param reader the source of the text to tokenize
     * @return the filtered, stemmed token stream
     * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new PorterStemFilter(new StopFilter(false, // enable_position_increment_default == false, for backward compat
                new LowerCaseFilter(new NumericTokenFilter(new StandardFilter(new StandardTokenizer(reader)))),
                stopset));
    }

    /**
     * Reads the stop word list from the classpath.
     *
     * <p>The resource is opened as a stream rather than as a
     * {@code java.io.File}, so it can also be read when it is packaged
     * inside a JAR or when the classpath location contains characters that
     * are escaped in URLs (for example spaces).</p>
     *
     * @return all stop words found in the resource, comments removed
     * @throws IOException if the resource is missing or cannot be read
     */
    private String[] loadStopwords() throws IOException {
        InputStream in = getClass().getResourceAsStream(STOPWORDS_RESOURCE);
        if (in == null) {
            throw new IOException("Stop word resource not found on classpath: " + STOPWORDS_RESOURCE);
        }
        try {
            BufferedReader lines = new BufferedReader(new InputStreamReader(in, STOPWORDS_ENCODING));
            List<String> stopwords = new ArrayList<String>();
            String line;
            while ((line = lines.readLine()) != null) {
                // Comments are resolved per line so that every word of a
                // multi-word comment is discarded, not only its first token.
                for (String stopword : filterComments(StringUtils.split(line))) {
                    stopwords.add(stopword);
                }
            }
            return stopwords.toArray(new String[0]);
        } finally {
            // closing the underlying stream is enough; the readers hold no other resource
            in.close();
        }
    }

    /**
     * Removes the comment part from the whitespace-separated tokens of a
     * single line: the first token starting with {@code #} and every token
     * after it are dropped.
     *
     * @param input the tokens of one line, in order
     * @return the tokens that precede the comment, or all tokens if the
     *         line has no comment
     */
    private String[] filterComments(String[] input) {
        List<String> stopwords = new ArrayList<String>();
        for (String token : input) {
            if (token.startsWith(COMMENT_PREFIX)) {
                break; // rest of the line is comment text
            }
            stopwords.add(token);
        }
        return stopwords.toArray(new String[0]);
    }
}