gate.bulstem.BulStemPR.java Source code

Introduction

Here is the source code for gate.bulstem.BulStemPR.java
Source

/*
 * BulStemPR.java
 * 
 * Copyright (c) 2013 The University of Sheffield.
 * 
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 2, June 1991.
 * 
 * A copy of this licence is included in the distribution in the file
 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
 * 
 * Ivelina Nikolova, 05/12/2013
 */
package gate.bulstem;

import gate.Annotation;
import gate.AnnotationSet;
import gate.ProcessingResource;
import gate.Resource;
import gate.Utils;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.net.URL;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/**
 * GATE processing resource wrapping the BulStem stemming algorithm for
 * Bulgarian by Preslav Nakov.
 * <p>
 * {@link #init()} loads the suffix rewriting rules from the configured rules
 * file; {@link #execute()} reads the {@code string} feature of every input
 * annotation, stems it and stores the lower-cased result in a {@code stem}
 * feature on the same annotation.
 * 
 * @author Alexander Alexandrov, e-mail: sencko@mail.bg, provided the JAVA
 *         implementation of the algorithm
 * @author Ivelina Nikolova, e-mail:iva@lml.bas.bg, wrapped the stemmer for GATE
 */
@CreoleResource(name = "BulStem", helpURL = "http://lml.bas.bg/~nakov/bulstem/", comment = "This plugin is an implementation of the BulStem stemmer algorithm for Bulgarian developed by Preslav Nakov.")
public class BulStemPR extends AbstractLanguageAnalyser implements ProcessingResource, Serializable {

    private static final long serialVersionUID = 257778017962925274L;

    protected Logger logger = Logger.getLogger(this.getClass());

    /** Location of the stemming rules file (the "pathToRules" parameter). */
    private URL rulesURL;

    /** Name of the annotation set the input annotations are read from. */
    private String annotationSetName;

    /** Type of the annotations to stem. */
    private String annotationType;

    /** Maps a word suffix to its replacement; populated by {@link #init()}. */
    private Map<String, String> stemmingRules;

    // should we make this an init param?
    // at the moment this always excludes 8556 entries from the default rules file
    private static final int STEM_BOUNDARY = 1;

    private Boolean failOnMissingInputAnnotations = true;

    // NOTE(review): the Cyrillic characters of the two patterns below had been
    // reduced to '?' in the reviewed copy of this file (encoding loss), which
    // made it impossible to load any rule or stem any word. They have been
    // reconstructed as Unicode escapes so the source no longer depends on the
    // compiler's file encoding - please confirm the vowel set and the letter
    // range against the upstream BulStem sources.

    /**
     * Matches the start of a word up to and including its first lower-case
     * Bulgarian vowel.
     */
    private static final Pattern vocals = Pattern.compile(
            "[^\u0430\u044a\u043e\u0443\u0435\u0438\u044f\u044e]*[\u0430\u044a\u043e\u0443\u0435\u0438\u044f\u044e]");

    /**
     * Matches one line of the rules file: {@code suffix ==> replacement count}.
     */
    public static final Pattern p = Pattern.compile("([\u0430-\u044f]+)\\s==>\\s([\u0430-\u044f]+)\\s([0-9]+)");

    /**
     * Loads the stemming rules from the configured rules file.
     * <p>
     * Only lines matching {@link #p} whose numeric third field is greater than
     * {@code STEM_BOUNDARY} are kept; every other line is ignored.
     * 
     * @return this resource
     * @throws ResourceInstantiationException
     *             if the rules URL is not set or the rules cannot be read
     */
    @Override
    public Resource init() throws ResourceInstantiationException {

        // check required parameters are set
        if (rulesURL == null) {
            throw new ResourceInstantiationException("pathToRules param must be set");
        }

        // load into a local map so that a failed (re-)initialisation never
        // leaves a half-populated rule set behind
        Map<String, String> rules = new HashMap<String, String>();

        BufferedReader br = null;
        try {
            // decode explicitly as UTF-8 rather than with the platform default
            // charset, which would corrupt the Cyrillic rules on many systems
            br = new BufferedReader(new InputStreamReader(rulesURL.openStream(), "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = p.matcher(line);
                if (m.matches() && Integer.parseInt(m.group(3)) > STEM_BOUNDARY) {
                    rules.put(m.group(1), m.group(2));
                }
            }
        } catch (Exception e) {
            throw new ResourceInstantiationException(e);
        } finally {
            // closeQuietly tolerates a null reader
            IOUtils.closeQuietly(br);
        }

        stemmingRules = rules;
        return this;
    }

    /**
     * Stems every input annotation of the current document.
     * <p>
     * For each annotation of the configured type that carries a {@code string}
     * feature, the lower-cased value is stemmed and stored in the {@code stem}
     * feature. Annotations without a {@code string} feature are left untouched.
     * 
     * @throws ExecutionException
     *             if there is no document, or if there are no input annotations
     *             and failing on missing input is enabled
     */
    @Override
    public void execute() throws ExecutionException {

        if (document == null) {
            throw new ExecutionException("No document to process!");
        }

        // get all the tokens from the specified annotation set
        AnnotationSet allTokens = document.getAnnotations(annotationSetName).get(annotationType);

        if (allTokens == null || allTokens.size() == 0) {
            // an unset (null) flag is treated like the default, i.e. fail
            if (failOnMissingInputAnnotations == null || failOnMissingInputAnnotations.booleanValue()) {
                throw new ExecutionException("No tokens to process in document " + document.getName() + "\n"
                        + "Please run a tokeniser first!");
            }
            Utils.logOnce(logger, Level.INFO,
                    "BulStem: no token annotations in input document - see debug log for details.");
            logger.debug("No input annotations in document " + document.getName());
            return;
        }

        // sort out the status reporting stuff
        long startTime = System.currentTimeMillis();
        fireStatusChanged("Running BulStem over " + document.getName());
        fireProgressChanged(0);
        int total = allTokens.size();
        int tokenCount = 0;

        for (Annotation token : allTokens) {
            // get the string feature; tokens without one cannot be stemmed
            Object tokenString = token.getFeatures().get("string");

            if (tokenString != null) {
                // lower-case BEFORE stemming: the vowel pattern and the rules
                // only contain lower-case letters, so capitalised or upper-case
                // tokens would otherwise be stemmed differently (or not at all)
                String stem = stem(tokenString.toString().toLowerCase(Locale.ROOT));

                // store the new feature
                token.getFeatures().put("stem", stem);
            }

            // report our progress (long arithmetic avoids int overflow)
            fireProgressChanged((int) ((long) tokenCount * 100 / total));
            tokenCount++;
        }

        // we've finished so report this
        fireProcessFinished();
        fireStatusChanged(document.getName() + " stemmed in "
                + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
                + " seconds!");
    }

    /**
     * Applies the loaded rules to a single word.
     * <p>
     * Suffixes are tried from longest to shortest, starting one character after
     * the first vowel of the word; the first suffix found in the rule map is
     * replaced by its mapped value.
     * 
     * @param word
     *            the word to stem
     * @return the stemmed word, or {@code word} itself if it contains no vowel
     *         or no rule applies
     */
    private String stem(String word) {
        Matcher m = vocals.matcher(word);
        if (!m.lookingAt()) {
            return word;
        }

        for (int i = m.end() + 1; i < word.length(); i++) {
            String replacement = stemmingRules.get(word.substring(i));
            if (replacement != null) {
                // get the new stem by cutting up the word and adding the right
                // suffix from the rules
                return word.substring(0, i) + replacement;
            }
        }
        return word;
    }

    // PR parameters

    /**
     * Sets the location of the stemming rules file read by {@link #init()}.
     * 
     * @param rulesURL
     *            URL of the rules file
     */
    @CreoleParameter(comment = "Stemming Rules File", defaultValue = "resources/stem_rules_context_2_UTF-8.txt")
    public void setPathToRules(URL rulesURL) {
        this.rulesURL = rulesURL;
    }

    /**
     * @return the URL of the stemming rules file
     */
    public URL getPathToRules() {
        return rulesURL;
    }

    /**
     * Sets the name of the annotation set the input annotations are read from.
     * 
     * @param annotationSetName
     *            the annotation set name
     */
    @Optional
    @RunTime
    @CreoleParameter(comment = "The annotation set to use as input")
    public void setAnnotationSetName(String annotationSetName) {
        this.annotationSetName = annotationSetName;
    }

    /**
     * @return the name of the input annotation set
     */
    public String getAnnotationSetName() {
        return annotationSetName;
    }

    /**
     * Sets the type of the annotations to stem.
     * 
     * @param annotationType
     *            the annotation type
     */
    @RunTime
    @CreoleParameter(comment = "The name of the base 'Token' annotation type", defaultValue = "Token")
    public void setAnnotationType(String annotationType) {
        this.annotationType = annotationType;
    }

    /**
     * @return the type of the annotations to stem
     */
    public String getAnnotationType() {
        return annotationType;
    }

    /**
     * Controls whether {@link #execute()} throws when the document contains no
     * input annotations.
     * 
     * @param fail
     *            {@code true} to throw; {@code false} to only log the condition
     */
    @RunTime
    @Optional
    @CreoleParameter(comment = "Throw an exception when there are none of the required input annotations", defaultValue = "true")
    public void setFailOnMissingInputAnnotations(Boolean fail) {
        failOnMissingInputAnnotations = fail;
    }

    /**
     * @return whether missing input annotations cause an exception
     */
    public Boolean getFailOnMissingInputAnnotations() {
        return failOnMissingInputAnnotations;
    }
}