Java tutorial
/* * (c) 2005 David B. Bracewell * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.davidbracewell.text.morphology; import com.davidbracewell.collection.CompactHashSet; import com.davidbracewell.collection.trie.PatriciaTrie; import com.davidbracewell.config.Config; import com.davidbracewell.string.StringUtils; import com.davidbracewell.text.Fragment; import com.davidbracewell.text.tag.POS; import com.davidbracewell.tuple.Pair; import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.base.Throwables; import com.google.common.collect.*; import com.google.common.io.LineProcessor; import javax.annotation.Nullable; import java.io.IOException; import java.io.Serializable; import java.util.Collections; import java.util.LinkedHashSet; import java.util.Set; /** * The type English lemmatizer. * * @author David B. Bracewell */ public class EnglishLemmatizer implements Lemmatizer, Serializable { private static final long serialVersionUID = -6093027604295026727L; private static EnglishLemmatizer INSTANCE = null; private final Multimap<POS, DetachmentRule> rules = ArrayListMultimap.create(); private final Multimap<Pair<POS, String>, String> exceptions = HashMultimap.create(); private final PatriciaTrie<Set<POS>> lemmas; /** * Instantiates a new English lemmatizer. */ protected EnglishLemmatizer() { rules.put(POS.NOUN, new DetachmentRule("s", "")); rules.put(POS.NOUN, new DetachmentRule("ses", "s")); rules.put(POS.NOUN, new DetachmentRule("xes", "x")); rules.put(POS.NOUN, new DetachmentRule("zes", "z")); rules.put(POS.NOUN, new DetachmentRule("ies", "y")); rules.put(POS.NOUN, new DetachmentRule("shes", "sh")); rules.put(POS.NOUN, new DetachmentRule("ches", "ch")); rules.put(POS.NOUN, new DetachmentRule("men", "man")); loadException(POS.NOUN); rules.put(POS.VERB, new DetachmentRule("s", "")); rules.put(POS.VERB, new DetachmentRule("ies", "y")); rules.put(POS.VERB, new DetachmentRule("es", "s")); rules.put(POS.VERB, new DetachmentRule("es", "")); rules.put(POS.VERB, new DetachmentRule("ed", "e")); rules.put(POS.VERB, new DetachmentRule("ed", "")); rules.put(POS.VERB, new DetachmentRule("ing", "e")); rules.put(POS.VERB, new DetachmentRule("ing", "")); loadException(POS.VERB); rules.put(POS.ADJECTIVE, new DetachmentRule("er", "")); rules.put(POS.ADJECTIVE, new DetachmentRule("est", "")); rules.put(POS.ADJECTIVE, new DetachmentRule("er", "e")); rules.put(POS.ADJECTIVE, new DetachmentRule("est", "e")); loadException(POS.ADJECTIVE); loadException(POS.ADVERB); try { this.lemmas = Config.get(EnglishLemmatizer.class, "dictionary").asResource() .read(new LineProcessor<PatriciaTrie<Set<POS>>>() { private PatriciaTrie<Set<POS>> lemmas = new PatriciaTrie<>(); @Override public boolean processLine(String line) throws IOException { if (!Strings.isNullOrEmpty(line) && !line.trim().startsWith("#")) { String[] parts = line.trim().split("\t+"); String lemma = parts[0].toLowerCase(); POS pos = POS.fromString(parts[1].toUpperCase()); if (!lemmas.containsKey(lemma)) { lemmas.put(lemma, new CompactHashSet<POS>()); } lemmas.get(lemma).add(pos); } return true; } @Override public PatriciaTrie<Set<POS>> getResult() { return lemmas; } }); } catch (IOException e) { throw Throwables.propagate(e); } } /** * Gets instance. * * @return the instance */ public static EnglishLemmatizer getInstance() { if (INSTANCE == null) { synchronized (EnglishLemmatizer.class) { if (INSTANCE != null) { return INSTANCE; } INSTANCE = new EnglishLemmatizer(); } } return INSTANCE; } private void loadException(POS tag) { try { for (String line : Config.get(EnglishLemmatizer.class, "exceptions").asResource() .getChild(tag.asString().toLowerCase() + ".exc").readLines()) { if (!Strings.isNullOrEmpty(line)) { String[] parts = line.split("\\s+"); Pair<POS, String> key = Pair.of(tag, parts[0].replaceAll("_", " ")); for (int i = 1; i < parts.length; i++) { exceptions.put(key, parts[i]); } } } } catch (IOException e) { throw Throwables.propagate(e); } } @Override public String lemmatize(String string) { return Iterables.getFirst(getBaseForms(string), string); } @Override public String lemmatize(String string, POS partOfSpeech) { return Iterables.getFirst(getBaseForms(string, partOfSpeech), string); } @Override public String lemmatize(Fragment fragment) { Preconditions.checkNotNull(fragment); if (fragment.isEmpty()) { return StringUtils.EMPTY; } if (fragment.tokenLength() == 1) { return lemmatize(fragment.content(), POS.forText(fragment).getUniversalTag()); } String lemma = ""; for (int i = 0; i < fragment.tokenLength(); i++) { lemma += " " + lemmatize(fragment.tokenAt(i).content(), POS.forText(fragment.tokenAt(i))); } return lemma.trim(); } @Override public Iterable<String> getBaseForms(String string) { return getBaseForms(string, POS.ANY); } @Override public Iterable<String> getBaseForms(String string, POS partOfSpeech) { Preconditions.checkNotNull(string); if (partOfSpeech == null || partOfSpeech == POS.ANY) { return doLemmatization(string, POS.NOUN, POS.VERB, POS.ADJECTIVE, POS.ADVERB); } return doLemmatization(string, partOfSpeech); } @Override public Iterable<String> getBaseForms(Fragment fragment) { Preconditions.checkNotNull(fragment); if (fragment.isEmpty()) { return Collections.emptySet(); } return getBaseForms(fragment.content(), POS.forText(fragment)); } @Override public Set<String> getPrefixBaseForms(String string, POS partOfSpeech) { Set<String> lemmaSet = Sets.newHashSet(); for (String lemma : doLemmatization(string, partOfSpeech)) { lemmaSet.add(lemma); lemmaSet.addAll(lemmas.prefixMap(string + "_").keySet()); } return lemmaSet; } @Override public boolean isLemma(String word) { return lemmas.containsKey(word.toLowerCase()); } private Iterable<String> doLemmatization(String string, POS... tags) { Set<String> tokenLemmas = new LinkedHashSet<>(); for (POS tag : tags) { fill(string, tag, tokenLemmas); } if (tokenLemmas.isEmpty()) { return Collections.singleton(string); } return tokenLemmas; } private void fill(String string, POS partOfSpeech, Set<String> set) { if (partOfSpeech.isVerb()) { if (string.equalsIgnoreCase("'s") || string.equalsIgnoreCase("'re")) { set.add("be"); return; } else if (string.equals("'ll")) { set.add("will"); return; } else if (string.equals("'ve")) { set.add("will"); return; } } else if (partOfSpeech.isAdverb()) { if (string.equalsIgnoreCase("n't")) { set.add("not"); return; } } else if (string.equalsIgnoreCase("'d")) { set.add("would"); return; } Pair<POS, String> key = Pair.of(partOfSpeech, string.toLowerCase()); if (exceptions.containsKey(key)) { set.addAll(exceptions.get(key)); } for (DetachmentRule rule : rules.get(partOfSpeech.getUniversalTag())) { String output = rule.apply(string); if (output != null && lemmas.containsKey(output.toLowerCase()) && lemmas.get(output.toLowerCase()).contains(partOfSpeech)) { set.add(output); } } } private static class DetachmentRule implements Serializable, Function<String, String> { private static final long serialVersionUID = 2748362312310767937L; /** * The Ending. */ public final String ending; /** * The Replacement. */ public final String replacement; private DetachmentRule(String ending, String replacement) { this.ending = ending; this.replacement = replacement; } /** * Unapply string. * * @param input the input * @return the string */ @Nullable public String unapply(@Nullable String input) { if (input == null) { return null; } if (input.endsWith(replacement)) { int end = input.length() - replacement.length(); if (end == 0) { return ending; } return input.substring(0, end) + ending; } return input; } @Nullable @Override public String apply(@Nullable String input) { if (input == null) { return null; } if (input.endsWith(ending)) { int end = input.length() - ending.length(); if (end == 0) { return replacement; } return input.substring(0, end) + replacement; } return input; } } }//END OF EnglishLemmatizer