Java tutorial
/* * Copyright (C) 2014 Jrg Prante * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program; if not, see http://www.gnu.org/licenses * or write to the Free Software Foundation, Inc., 51 Franklin Street, * Fifth Floor, Boston, MA 02110-1301 USA. * * The interactive user interfaces in modified source and object code * versions of this program must display Appropriate Legal Notices, * as required under Section 5 of the GNU Affero General Public License. * */ package org.xbib.elasticsearch.index.analysis.langdetect; import com.fasterxml.jackson.databind.ObjectMapper; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.component.AbstractLifecycleComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; import java.util.regex.Pattern; public class LangdetectService extends AbstractLifecycleComponent<LangdetectService> { private final static Pattern word = Pattern.compile("[\\P{IsWord}]", Pattern.UNICODE_CHARACTER_CLASS); private final static String[] DEFAULT_LANGUAGES = new String[] { "af", "ar", "bg", "bn", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw" }; private Map<String, double[]> wordLangProbMap = new HashMap<String, double[]>(); private List<String> langlist = new LinkedList<String>(); private Map<String, String> langmap = new HashMap<String, String>(); private double alpha; private double alpha_width; private int n_trial; private double[] priorMap; private int iteration_limit; private double prob_threshold; private double conv_threshold; private int base_freq; private Pattern filterPattern; @Inject public LangdetectService(Settings settings) { super(settings); } @Override protected void doStart() throws ElasticsearchException { load(settings); this.priorMap = null; this.n_trial = settings.getAsInt("number_of_trials", 7); this.alpha = settings.getAsDouble("alpha", 0.5); this.alpha_width = settings.getAsDouble("alpha_width", 0.05); this.iteration_limit = settings.getAsInt("iteration_limit", 10000); this.prob_threshold = settings.getAsDouble("prob_threshold", 0.1); this.conv_threshold = settings.getAsDouble("conv_threshold", 0.99999); this.base_freq = settings.getAsInt("base_freq", 10000); this.filterPattern = settings.get("pattern") != null ? Pattern.compile(settings.get("pattern"), Pattern.UNICODE_CHARACTER_CLASS) : null; } @Override protected void doStop() throws ElasticsearchException { } @Override protected void doClose() throws ElasticsearchException { } public Settings getSettings() { return settings; } private void load(Settings settings) { try { String[] keys = settings.getAsArray("languages"); if (keys == null || keys.length == 0) { keys = DEFAULT_LANGUAGES; } int index = 0; int size = keys.length; for (String key : keys) { loadProfileFromResource(key, index++, size); } logger.debug("language detection service installed for {}", langlist); } catch (Exception e) { logger.error(e.getMessage(), e); throw new ElasticsearchException(e.getMessage()); } try { // map by settings Settings map = ImmutableSettings.EMPTY; if (settings.getByPrefix("map.") != null) { map = ImmutableSettings.settingsBuilder().put(settings.getByPrefix("map.")).build(); } if (map.getAsMap().isEmpty()) { // is in "map" a resource name? String s = settings.get("map") != null ? settings.get("map") : "/langdetect/language.json"; InputStream in = getClass().getResourceAsStream(s); if (in != null) { map = ImmutableSettings.settingsBuilder().loadFromStream(s, in).build(); } } this.langmap = map.getAsMap(); } catch (Exception e) { logger.error(e.getMessage(), e); throw new ElasticsearchException(e.getMessage()); } } public void loadProfileFromResource(String resource, int index, int langsize) throws IOException { InputStream in = getClass().getResourceAsStream("/langdetect/" + resource); if (in == null) { throw new IOException("profile '" + resource + "' not found"); } ObjectMapper mapper = new ObjectMapper(); LangProfile profile = mapper.readValue(in, LangProfile.class); addProfile(profile, index, langsize); } public void addProfile(LangProfile profile, int index, int langsize) throws IOException { String lang = profile.getName(); if (langlist.contains(lang)) { throw new IOException("duplicate of the same language profile: " + lang); } langlist.add(lang); for (String word : profile.getFreq().keySet()) { if (!wordLangProbMap.containsKey(word)) { wordLangProbMap.put(word, new double[langsize]); } int length = word.length(); if (length >= 1 && length <= 3) { double prob = profile.getFreq().get(word).doubleValue() / profile.getNWords()[length - 1]; wordLangProbMap.get(word)[index] = prob; } } } /** * Set prior information about language probabilities. * * @param priorMap the priorMap to set * @throws LanguageDetectionException */ public void setPriorMap(HashMap<String, Double> priorMap) throws LanguageDetectionException { this.priorMap = new double[langlist.size()]; double sump = 0; for (int i = 0; i < this.priorMap.length; ++i) { String lang = langlist.get(i); if (priorMap.containsKey(lang)) { double p = priorMap.get(lang); if (p < 0) { throw new LanguageDetectionException("Prior probability must be non-negative"); } this.priorMap[i] = p; sump += p; } } if (sump <= 0) { throw new LanguageDetectionException("More one of prior probability must be non-zero"); } for (int i = 0; i < this.priorMap.length; ++i) { this.priorMap[i] /= sump; } } public List<Language> detectAll(String text) throws LanguageDetectionException { List<Language> languages = new ArrayList<Language>(); if (filterPattern != null && !filterPattern.matcher(text).matches()) { return languages; } List<String> list = new ArrayList<String>(); languages = sortProbability(languages, detectBlock(list, text)); return languages.subList(0, Math.min(languages.size(), settings.getAsInt("max", languages.size()))); } private double[] detectBlock(List<String> list, String text) throws LanguageDetectionException { // clean all non-work characters from text text = text.replaceAll(word.pattern(), " "); extractNGrams(list, text); if (list.isEmpty()) { throw new LanguageDetectionException("no features in text"); } double[] langprob = new double[langlist.size()]; Random rand = new Random(); Long seed = 0L; rand.setSeed(seed); for (int t = 0; t < n_trial; ++t) { double[] prob = initProbability(); double a = this.alpha + rand.nextGaussian() * alpha_width; for (int i = 0;; ++i) { int r = rand.nextInt(list.size()); updateLangProb(prob, list.get(r), a); if (i % 5 == 0) { if (normalizeProb(prob) > conv_threshold || i >= iteration_limit) { break; } } } for (int j = 0; j < langprob.length; ++j) { langprob[j] += prob[j] / n_trial; } } return langprob; } private double[] initProbability() { double[] prob = new double[langlist.size()]; if (priorMap != null) { System.arraycopy(priorMap, 0, prob, 0, prob.length); } else { for (int i = 0; i < prob.length; ++i) { prob[i] = 1.0 / langlist.size(); } } return prob; } private void extractNGrams(List<String> list, String text) { NGram ngram = new NGram(); for (int i = 0; i < text.length(); ++i) { ngram.addChar(text.charAt(i)); for (int n = 1; n <= NGram.N_GRAM; ++n) { String w = ngram.get(n); if (w != null && wordLangProbMap.containsKey(w)) { list.add(w); } } } } private boolean updateLangProb(double[] prob, String word, double alpha) { if (word == null || !wordLangProbMap.containsKey(word)) { return false; } double[] langProbMap = wordLangProbMap.get(word); double weight = alpha / base_freq; for (int i = 0; i < prob.length; ++i) { prob[i] *= weight + langProbMap[i]; } return true; } private double normalizeProb(double[] prob) { double maxp = 0, sump = 0; for (double aProb : prob) { sump += aProb; } for (int i = 0; i < prob.length; ++i) { double p = prob[i] / sump; if (maxp < p) { maxp = p; } prob[i] = p; } return maxp; } private List<Language> sortProbability(List<Language> list, double[] prob) { for (int j = 0; j < prob.length; ++j) { double p = prob[j]; if (p > prob_threshold) { for (int i = 0; i <= list.size(); ++i) { if (i == list.size() || list.get(i).getProbability() < p) { String code = langlist.get(j); if (langmap != null && langmap.containsKey(code)) { code = langmap.get(code); } list.add(i, new Language(code, p)); break; } } } } return list; } }