org.paxle.se.index.lucene.impl.StopwordsManager.java Source code

Java tutorial

Introduction

Here is the source code for org.paxle.se.index.lucene.impl.StopwordsManager.java

Source

/**
 * This file is part of the Paxle project.
 * Visit http://www.paxle.net for more information.
 * Copyright 2007-2010 the original author or authors.
 *
 * Licensed under the terms of the Common Public License 1.0 ("CPL 1.0").
 * Any use, reproduction or distribution of this program constitutes the recipient's acceptance of this agreement.
 * The full license text is available under http://www.opensource.org/licenses/cpl1.0.txt
 * or in the file LICENSE.txt in the root directory of the Paxle distribution.
 *
 * Unless required by applicable law or agreed to in writing, this software is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package org.paxle.se.index.lucene.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Service;
import org.apache.lucene.analysis.StopAnalyzer;
import org.osgi.service.component.ComponentContext;

@Component(immediate = true, metatype = false)
@Service(IStopwordsManager.class)
public class StopwordsManager implements IStopwordsManager {
    public static final String STOPWORDS_FILE_EXT = ".stopwords";

    private final HashMap<String, URL> urlMap = new HashMap<String, URL>();
    private final HashMap<String, PaxleAnalyzer> analyzerMap = new HashMap<String, PaxleAnalyzer>();

    /**
     * For logging
     */
    private final Log logger = LogFactory.getLog(StopwordsManager.class);

    /**
     * The default analyzer
     */
    PaxleAnalyzer defaultAnalyzer = null;

    @Activate
    protected void activate(ComponentContext context) {
        @SuppressWarnings("unchecked")
        final Enumeration<URL> stopwords = context.getBundleContext().getBundle()
                .findEntries("/stopwords/snowball/", "*" + StopwordsManager.STOPWORDS_FILE_EXT, true);
        final Collection<URL> stopWordsURLs = (stopwords != null) ? Collections.list(stopwords) : null;
        this.initStopWords(stopWordsURLs);
    }

    void initStopWords(Collection<URL> stopWordsURLs) {
        this.urlMap.clear();
        this.analyzerMap.clear();

        if (stopWordsURLs != null) {
            for (URL stopWordsFile : stopWordsURLs) {
                String fileName = stopWordsFile.getFile();
                fileName = fileName.substring(fileName.lastIndexOf('/') + 1);
                String lang = fileName.substring(0, fileName.length() - STOPWORDS_FILE_EXT.length());

                this.urlMap.put(lang, stopWordsFile);
            }
        }
    }

    /**
     * Returns a PaxleAnalyzer for the given language
     * @param language
     */
    public PaxleAnalyzer getAnalyzer(final String language) {
        this.logger.debug("providing analyzer for language '" + language + "'");
        if (language == null)
            return getDefaultAnalyzer();

        PaxleAnalyzer pa = analyzerMap.get(language.toLowerCase());
        if (pa == null) {
            final URL swURL = urlMap.get(language.toLowerCase());
            if (swURL == null) {
                logger.warn("no stopwords declaration file found for language '" + language
                        + "', falling back to lucene's default");
                return getDefaultAnalyzer();
            }

            Reader reader = null;
            try {
                reader = new InputStreamReader(swURL.openStream(), Charset.forName("UTF-8"));

                // TODO adapt to multiple formats, i.e. pass wordlist-loader to this instance on creation
                final Set<String> stopwords = this.getSnowballSet(reader);
                pa = new PaxleAnalyzer(stopwords);

                analyzerMap.put(language, pa);
            } catch (Throwable e) {
                logger.error(String.format(
                        "Unexpected %s reading in stopwords declaration '%s', falling back to lucene's default.",
                        e.getClass().getName(), swURL), e);
            } finally {
                if (reader != null)
                    try {
                        reader.close();
                    } catch (Exception e) {
                        /* ignore this */}
            }
        }
        return (pa != null) ? pa : getDefaultAnalyzer();
    }

    /**
     * Returns a very simple analyzer for English stopwords.
     * The stopwords are builtin!
     */
    @SuppressWarnings("unchecked")
    public PaxleAnalyzer getDefaultAnalyzer() {
        if (defaultAnalyzer == null)
            defaultAnalyzer = new PaxleAnalyzer(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        return defaultAnalyzer;
    }

    private Set<String> getSnowballSet(final Reader reader) throws IOException {
        final BufferedReader br = (reader instanceof BufferedReader) ? (BufferedReader) reader
                : new BufferedReader(reader);
        final Set<String> r = new HashSet<String>();
        try {
            String line;
            while ((line = br.readLine()) != null) {
                final int comment = line.indexOf('|');
                if (comment > -1)
                    line = line.substring(0, comment);
                line = line.trim();
                if (line.length() > 0)
                    r.add(line);
            }
        } finally {
            br.close();
        }
        return r;
    }
}