com.davidbracewell.text.morphology.EnglishLemmatizer.java Source code

Introduction

Here is the source code for com.davidbracewell.text.morphology.EnglishLemmatizer.java
Source

/*
 * (c) 2005 David B. Bracewell
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.davidbracewell.text.morphology;

import com.davidbracewell.collection.CompactHashSet;
import com.davidbracewell.collection.trie.PatriciaTrie;
import com.davidbracewell.config.Config;
import com.davidbracewell.string.StringUtils;
import com.davidbracewell.text.Fragment;
import com.davidbracewell.text.tag.POS;
import com.davidbracewell.tuple.Pair;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.*;
import com.google.common.io.LineProcessor;

import javax.annotation.Nullable;
import java.io.IOException;
import java.io.Serializable;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;

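/**
 * English {@link Lemmatizer} that derives base forms from suffix-detachment rules,
 * per-part-of-speech exception lists and a lemma dictionary. The exception lists and the
 * dictionary are read from resources named in the configuration when the instance is built.
 * Obtain the shared instance through {@link #getInstance()}.
 *
 * @author David B. Bracewell
 */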
public class EnglishLemmatizer implements Lemmatizer, Serializable {
    private static final long serialVersionUID = -6093027604295026727L;
    private static EnglishLemmatizer INSTANCE = null;
    private final Multimap<POS, DetachmentRule> rules = ArrayListMultimap.create();
    private final Multimap<Pair<POS, String>, String> exceptions = HashMultimap.create();
    private final PatriciaTrie<Set<POS>> lemmas;

    /**
     * Instantiates a new English lemmatizer.
     */
    protected EnglishLemmatizer() {
        rules.put(POS.NOUN, new DetachmentRule("s", ""));
        rules.put(POS.NOUN, new DetachmentRule("ses", "s"));
        rules.put(POS.NOUN, new DetachmentRule("xes", "x"));
        rules.put(POS.NOUN, new DetachmentRule("zes", "z"));
        rules.put(POS.NOUN, new DetachmentRule("ies", "y"));
        rules.put(POS.NOUN, new DetachmentRule("shes", "sh"));
        rules.put(POS.NOUN, new DetachmentRule("ches", "ch"));
        rules.put(POS.NOUN, new DetachmentRule("men", "man"));
        loadException(POS.NOUN);

        rules.put(POS.VERB, new DetachmentRule("s", ""));
        rules.put(POS.VERB, new DetachmentRule("ies", "y"));
        rules.put(POS.VERB, new DetachmentRule("es", "s"));
        rules.put(POS.VERB, new DetachmentRule("es", ""));
        rules.put(POS.VERB, new DetachmentRule("ed", "e"));
        rules.put(POS.VERB, new DetachmentRule("ed", ""));
        rules.put(POS.VERB, new DetachmentRule("ing", "e"));
        rules.put(POS.VERB, new DetachmentRule("ing", ""));
        loadException(POS.VERB);

        rules.put(POS.ADJECTIVE, new DetachmentRule("er", ""));
        rules.put(POS.ADJECTIVE, new DetachmentRule("est", ""));
        rules.put(POS.ADJECTIVE, new DetachmentRule("er", "e"));
        rules.put(POS.ADJECTIVE, new DetachmentRule("est", "e"));
        loadException(POS.ADJECTIVE);

        loadException(POS.ADVERB);

        try {
            this.lemmas = Config.get(EnglishLemmatizer.class, "dictionary").asResource()
                    .read(new LineProcessor<PatriciaTrie<Set<POS>>>() {
                        private PatriciaTrie<Set<POS>> lemmas = new PatriciaTrie<>();

                        @Override
                        public boolean processLine(String line) throws IOException {
                            if (!Strings.isNullOrEmpty(line) && !line.trim().startsWith("#")) {
                                String[] parts = line.trim().split("\t+");
                                String lemma = parts[0].toLowerCase();
                                POS pos = POS.fromString(parts[1].toUpperCase());
                                if (!lemmas.containsKey(lemma)) {
                                    lemmas.put(lemma, new CompactHashSet<POS>());
                                }
                                lemmas.get(lemma).add(pos);
                            }
                            return true;
                        }

                        @Override
                        public PatriciaTrie<Set<POS>> getResult() {
                            return lemmas;
                        }
                    });
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

    /**
     * Gets instance.
     *
     * @return the instance
     */
    public static EnglishLemmatizer getInstance() {
        if (INSTANCE == null) {
            synchronized (EnglishLemmatizer.class) {
                if (INSTANCE != null) {
                    return INSTANCE;
                }
                INSTANCE = new EnglishLemmatizer();
            }
        }
        return INSTANCE;
    }

    private void loadException(POS tag) {
        try {
            for (String line : Config.get(EnglishLemmatizer.class, "exceptions").asResource()
                    .getChild(tag.asString().toLowerCase() + ".exc").readLines()) {
                if (!Strings.isNullOrEmpty(line)) {
                    String[] parts = line.split("\\s+");
                    Pair<POS, String> key = Pair.of(tag, parts[0].replaceAll("_", " "));
                    for (int i = 1; i < parts.length; i++) {
                        exceptions.put(key, parts[i]);
                    }
                }
            }
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

    @Override
    public String lemmatize(String string) {
        return Iterables.getFirst(getBaseForms(string), string);
    }

    @Override
    public String lemmatize(String string, POS partOfSpeech) {
        return Iterables.getFirst(getBaseForms(string, partOfSpeech), string);
    }

    @Override
    public String lemmatize(Fragment fragment) {
        Preconditions.checkNotNull(fragment);

        if (fragment.isEmpty()) {
            return StringUtils.EMPTY;
        }

        if (fragment.tokenLength() == 1) {
            return lemmatize(fragment.content(), POS.forText(fragment).getUniversalTag());
        }

        String lemma = "";
        for (int i = 0; i < fragment.tokenLength(); i++) {
            lemma += " " + lemmatize(fragment.tokenAt(i).content(), POS.forText(fragment.tokenAt(i)));
        }
        return lemma.trim();
    }

    @Override
    public Iterable<String> getBaseForms(String string) {
        return getBaseForms(string, POS.ANY);
    }

    @Override
    public Iterable<String> getBaseForms(String string, POS partOfSpeech) {
        Preconditions.checkNotNull(string);
        if (partOfSpeech == null || partOfSpeech == POS.ANY) {
            return doLemmatization(string, POS.NOUN, POS.VERB, POS.ADJECTIVE, POS.ADVERB);
        }
        return doLemmatization(string, partOfSpeech);
    }

    @Override
    public Iterable<String> getBaseForms(Fragment fragment) {
        Preconditions.checkNotNull(fragment);

        if (fragment.isEmpty()) {
            return Collections.emptySet();
        }

        return getBaseForms(fragment.content(), POS.forText(fragment));
    }

    @Override
    public Set<String> getPrefixBaseForms(String string, POS partOfSpeech) {
        Set<String> lemmaSet = Sets.newHashSet();
        for (String lemma : doLemmatization(string, partOfSpeech)) {
            lemmaSet.add(lemma);
            lemmaSet.addAll(lemmas.prefixMap(string + "_").keySet());
        }
        return lemmaSet;
    }

    @Override
    public boolean isLemma(String word) {
        return lemmas.containsKey(word.toLowerCase());
    }

    private Iterable<String> doLemmatization(String string, POS... tags) {
        Set<String> tokenLemmas = new LinkedHashSet<>();
        for (POS tag : tags) {
            fill(string, tag, tokenLemmas);
        }

        if (tokenLemmas.isEmpty()) {
            return Collections.singleton(string);
        }
        return tokenLemmas;
    }

    private void fill(String string, POS partOfSpeech, Set<String> set) {
        if (partOfSpeech.isVerb()) {
            if (string.equalsIgnoreCase("'s") || string.equalsIgnoreCase("'re")) {
                set.add("be");
                return;
            } else if (string.equals("'ll")) {
                set.add("will");
                return;
            } else if (string.equals("'ve")) {
                set.add("will");
                return;
            }
        } else if (partOfSpeech.isAdverb()) {
            if (string.equalsIgnoreCase("n't")) {
                set.add("not");
                return;
            }
        } else if (string.equalsIgnoreCase("'d")) {
            set.add("would");
            return;
        }

        Pair<POS, String> key = Pair.of(partOfSpeech, string.toLowerCase());
        if (exceptions.containsKey(key)) {
            set.addAll(exceptions.get(key));
        }
        for (DetachmentRule rule : rules.get(partOfSpeech.getUniversalTag())) {
            String output = rule.apply(string);
            if (output != null && lemmas.containsKey(output.toLowerCase())
                    && lemmas.get(output.toLowerCase()).contains(partOfSpeech)) {
                set.add(output);
            }
        }
    }

    private static class DetachmentRule implements Serializable, Function<String, String> {
        private static final long serialVersionUID = 2748362312310767937L;
        /**
         * The Ending.
         */
        public final String ending;
        /**
         * The Replacement.
         */
        public final String replacement;

        private DetachmentRule(String ending, String replacement) {
            this.ending = ending;
            this.replacement = replacement;
        }

        /**
         * Unapply string.
         *
         * @param input the input
         * @return the string
         */
        @Nullable
        public String unapply(@Nullable String input) {
            if (input == null) {
                return null;
            }
            if (input.endsWith(replacement)) {
                int end = input.length() - replacement.length();
                if (end == 0) {
                    return ending;
                }
                return input.substring(0, end) + ending;
            }
            return input;
        }

        @Nullable
        @Override
        public String apply(@Nullable String input) {
            if (input == null) {
                return null;
            }
            if (input.endsWith(ending)) {
                int end = input.length() - ending.length();
                if (end == 0) {
                    return replacement;
                }
                return input.substring(0, end) + replacement;
            }
            return input;
        }
    }

}//END OF EnglishLemmatizer