de.tudarmstadt.ukp.dkpro.core.jazzy.SpellChecker.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.jazzy.SpellChecker.java
Source

/*******************************************************************************
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.jazzy;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.select;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;

import com.swabunga.spell.engine.SpellDictionary;
import com.swabunga.spell.engine.SpellDictionaryHashMap;
import com.swabunga.spell.engine.Word;

import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly;
import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * Annotator that uses the Jazzy spell-checking library to decide whether a token is spelled
 * correctly.
 * <p>
 * For every {@link Token} that the dictionary rejects and for which Jazzy offers at least one
 * suggestion, a {@link SpellingAnomaly} covering the token is added to the CAS indexes, with one
 * {@link SuggestedAction} per suggestion. Tokens that are rejected but yield no suggestions are
 * not annotated.
 */
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = {
        "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly",
        "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction" })
public class SpellChecker extends JCasAnnotator_ImplBase {
    /**
     * Location from which the model is read. The model file is a simple word-list with one word
     * per line.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true)
    private String dictPath;

    /**
     * The character encoding used by the model.
     */
    public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String dictEncoding;

    /**
     * Determines the maximum edit distance (as an int value) that a suggestion for a spelling
     * error may have. E.g. if set to 1, suggestions are limited to words within edit distance 1
     * of the original word. The value is handed to Jazzy as the suggestion threshold.
     */
    public static final String PARAM_SCORE_THRESHOLD = "ScoreThreshold";
    @ConfigurationParameter(name = PARAM_SCORE_THRESHOLD, mandatory = true, defaultValue = "1")
    private int scoreThreshold;

    /** Single-character tokens consisting of one of these characters are never spell-checked. */
    private static final String SKIPPED_PUNCTUATION = ".?!";

    private SpellDictionary dict;

    /**
     * Resolves the configured model location and loads the word list into a Jazzy dictionary
     * using the configured encoding.
     *
     * @param context
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the dictionary cannot be located or read.
     */
    @Override
    public void initialize(final UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        InputStream is = null;
        try {
            URL url = ResourceUtils.resolveLocation(dictPath, this, context);
            this.getLogger().debug("Loading dictionary from " + url);
            is = url.openStream();
            dict = new SpellDictionaryHashMap(new InputStreamReader(is, dictEncoding));
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        } finally {
            // Closing the underlying stream is sufficient; the reader holds no other resource.
            closeQuietly(is);
        }
    }

    /**
     * Checks every token in the CAS against the dictionary and annotates those that are
     * misspelled and have at least one suggestion.
     *
     * @param jcas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method; not thrown by this implementation itself.
     */
    @Override
    public void process(final JCas jcas) throws AnalysisEngineProcessException {
        for (Token t : select(jcas, Token.class)) {
            String tokenText = t.getCoveredText();
            if (isSkippedPunctuation(tokenText) || dict.isCorrect(tokenText)) {
                continue;
            }

            // only try to correct single character tokens if they are letters
            if (tokenText.length() == 1 && !Character.isLetter(tokenText.charAt(0))) {
                continue;
            }

            SuggestionCostTuples tuples = collectSuggestions(tokenText);
            if (tuples.size() == 0) {
                continue;
            }

            // Create the anomaly only now that it is certain to be indexed, so that no
            // unused feature structures are allocated in the CAS for skipped tokens.
            SpellingAnomaly anomaly = new SpellingAnomaly(jcas, t.getBegin(), t.getEnd());
            anomaly.setSuggestions(toSuggestedActions(jcas, tuples));
            anomaly.addToIndexes();
        }
    }

    /**
     * Tells whether the token is exactly one of the characters '.', '?' or '!'.
     */
    private static boolean isSkippedPunctuation(String tokenText) {
        return tokenText.length() == 1
                && SKIPPED_PUNCTUATION.indexOf(tokenText.charAt(0)) >= 0;
    }

    /**
     * Asks the dictionary for suggestions for the given text and collects every suggestion that
     * has a non-null word together with its cost.
     */
    private SuggestionCostTuples collectSuggestions(String tokenText) {
        // Jazzy's API returns a raw List; its elements are Word instances.
        @SuppressWarnings("unchecked")
        List<Word> suggestions = dict.getSuggestions(tokenText, scoreThreshold);

        SuggestionCostTuples tuples = new SuggestionCostTuples();
        for (Word suggestion : suggestions) {
            String suggestionString = suggestion.getWord();
            if (suggestionString != null) {
                tuples.addTuple(suggestionString, suggestion.getCost());
            }
        }
        return tuples;
    }

    /**
     * Converts the collected suggestions into an array of {@link SuggestedAction}s, preserving
     * their order.
     */
    private static FSArray toSuggestedActions(JCas jcas, SuggestionCostTuples tuples) {
        FSArray actions = new FSArray(jcas, tuples.size());
        int maxCost = tuples.getMaxCost();
        int i = 0;
        for (SuggestionCostTuple tuple : tuples) {
            SuggestedAction action = new SuggestedAction(jcas);
            action.setReplacement(tuple.getSuggestion());
            // NOTE(review): the certainty is the cost divided by the maximum cost, so the
            // costliest suggestion receives the highest value - confirm this is intended.
            action.setCertainty(tuple.getNormalizedCost(maxCost));
            actions.set(i, action);
            i++;
        }
        return actions;
    }

    /**
     * Ordered collection of suggestions that also tracks the largest cost added so far.
     */
    static class SuggestionCostTuples implements Iterable<SuggestionCostTuple> {
        private final List<SuggestionCostTuple> tuples;
        private int maxCost;

        public SuggestionCostTuples() {
            tuples = new ArrayList<SuggestionCostTuple>();
            maxCost = 0;
        }

        /**
         * Appends a suggestion and updates the maximum cost if necessary.
         *
         * @param suggestion
         *            the suggested replacement.
         * @param cost
         *            the cost associated with the suggestion.
         */
        public void addTuple(String suggestion, int cost) {
            tuples.add(new SuggestionCostTuple(suggestion, cost));

            if (cost > maxCost) {
                maxCost = cost;
            }
        }

        /**
         * @return the largest cost added so far, or 0 if no tuple has a positive cost.
         */
        public int getMaxCost() {
            return maxCost;
        }

        /**
         * @return the number of suggestions added.
         */
        public int size() {
            return tuples.size();
        }

        @Override
        public Iterator<SuggestionCostTuple> iterator() {
            return tuples.iterator();
        }
    }

    /**
     * Immutable pair of a suggested replacement and its cost.
     */
    static class SuggestionCostTuple {
        private final String suggestion;
        private final Integer cost;

        public SuggestionCostTuple(String suggestion, Integer cost) {
            this.suggestion = suggestion;
            this.cost = cost;
        }

        public String getSuggestion() {
            return suggestion;
        }

        public Integer getCost() {
            return cost;
        }

        /**
         * Scales the cost relative to the given maximum.
         *
         * @param maxCost
         *            the maximum cost to normalize against.
         * @return the cost divided by {@code maxCost}, or 0 if {@code maxCost} is not positive.
         */
        public float getNormalizedCost(int maxCost) {
            if (maxCost > 0) {
                return cost.floatValue() / maxCost;
            }
            return 0f;
        }
    }
}