// Java tutorial
///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2013 Assaf Urieli
//
//This file is part of Jochre.
//
//Jochre is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Jochre is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Jochre. If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.jochre.lexicon;

import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.joliciel.jochre.JochreSession;

/**
 * For each word in the lexicon, adds variants with an initial uppercase and all upper-case.
 * <p>
 * The variants are built with {@link #toUpperCaseNoAccents(String)}, so the
 * upper-cased part of each variant has its combining diacritical marks removed.
 * The variants are only consulted by {@link #getFrequency(String)}; they are
 * not returned by {@link #getWords()}.
 *
 * @author Assaf Urieli
 *
 */
public class DefaultLexiconWrapper implements Lexicon {
    @SuppressWarnings("unused")
    private static final Log LOG = LogFactory.getLog(DefaultLexiconWrapper.class);

    /** Matches one or more characters of the Unicode "Combining Diacritical Marks" block. */
    private static final Pattern DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /** The wrapped lexicon; always consulted first. */
    Lexicon baseLexicon;

    /** Initial-uppercase and all-uppercase variants of every non-empty word of the base lexicon. */
    Set<String> upperCaseLexicon = new HashSet<String>();

    /**
     * Wraps a lexicon and pre-computes the upper-case variants of all of its words.
     *
     * @param baseLexicon the lexicon to wrap; its {@code getWords()} iterator is
     *        fully consumed by this constructor
     */
    public DefaultLexiconWrapper(Lexicon baseLexicon) {
        this.baseLexicon = baseLexicon;

        Iterator<String> words = baseLexicon.getWords();
        while (words.hasNext()) {
            String word = words.next();
            if (word == null || word.length() == 0) {
                continue;
            }

            // Cut after the first code point rather than the first char, so that a
            // surrogate pair at the start of the word is never split in half.
            int firstLetterEnd = word.offsetByCodePoints(0, 1);
            String firstLetter = word.substring(0, firstLetterEnd);

            // For a one-letter word the remainder is empty, so no special case is needed.
            upperCaseLexicon.add(this.toUpperCaseNoAccents(firstLetter) + word.substring(firstLetterEnd));
            upperCaseLexicon.add(this.toUpperCaseNoAccents(word));
        }
    }

    /**
     * Returns the frequency of a word.
     *
     * @param word the word to look up
     * @return the base lexicon's frequency when it is positive; otherwise 1 if
     *         the word is one of the pre-computed upper-case variants; otherwise 0
     */
    @Override
    public int getFrequency(String word) {
        int frequency = baseLexicon.getFrequency(word);
        if (frequency > 0) {
            return frequency;
        }
        return upperCaseLexicon.contains(word) ? 1 : 0;
    }

    /**
     * Upper-cases a string after stripping its accents.
     *
     * @param string the string to convert
     * @return the string decomposed to NFD, with all characters of the Combining
     *         Diacritical Marks block removed, then upper-cased using the locale
     *         returned by {@code JochreSession.getLocale()}
     */
    String toUpperCaseNoAccents(String string) {
        // decompose accents, so that each accent becomes a separate combining mark
        String decomposed = Normalizer.normalize(string, Form.NFD);
        // remove the diacritics
        String removed = DIACRITICS.matcher(decomposed).replaceAll("");
        return removed.toUpperCase(JochreSession.getLocale());
    }

    /**
     * Maps an accented lower-case Latin letter to its unaccented ASCII base letter.
     * <p>
     * A letter is folded when its canonical decomposition (NFD) consists of an
     * ASCII letter {@code a-z} followed by at least one mark. A few lower-case
     * letters that have no canonical decomposition (d/h/l/o with stroke, dotless i)
     * are mapped explicitly. Every other character is returned unchanged.
     * <p>
     * NOTE(review): the accented character literals of the original lookup table
     * were lost to a character-encoding error, so the exact set of letters it
     * handled is unknown. This version derives the mapping from Unicode
     * decomposition instead; confirm against the upstream table.
     *
     * @param letter the letter to convert
     * @return the unaccented lower-case base letter, or {@code letter} itself
     *         when no mapping applies
     */
    char getLetterWithoutAccents(char letter) {
        // Plain ASCII never carries an accent.
        if (letter < 0x80) {
            return letter;
        }

        // Letters with a stroke (and dotless i) are atomic in Unicode: NFD leaves them as-is.
        switch (letter) {
        case '\u0111': // LATIN SMALL LETTER D WITH STROKE
            return 'd';
        case '\u0127': // LATIN SMALL LETTER H WITH STROKE
            return 'h';
        case '\u0131': // LATIN SMALL LETTER DOTLESS I
            return 'i';
        case '\u0142': // LATIN SMALL LETTER L WITH STROKE
            return 'l';
        case '\u00F8': // LATIN SMALL LETTER O WITH STROKE
            return 'o';
        default:
            break;
        }

        // Half of a surrogate pair is not a letter on its own.
        if (Character.isHighSurrogate(letter) || Character.isLowSurrogate(letter)) {
            return letter;
        }

        String decomposed = Normalizer.normalize(String.valueOf(letter), Form.NFD);
        char base = decomposed.charAt(0);
        if (decomposed.length() > 1 && base >= 'a' && base <= 'z') {
            return base;
        }
        return letter;
    }

    /**
     * Returns the words of the wrapped lexicon.
     *
     * @return the base lexicon's word iterator; the upper-case variants held by
     *         this wrapper are not included
     */
    @Override
    public Iterator<String> getWords() {
        return baseLexicon.getWords();
    }
}