org.xbib.elasticsearch.index.analysis.langdetect.LangdetectService.java Source code

Introduction

Here is the source code for org.xbib.elasticsearch.index.analysis.langdetect.LangdetectService.java
Source

/*
 * Copyright (C) 2014 Jörg Prante
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses
 * or write to the Free Software Foundation, Inc., 51 Franklin Street,
 * Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * The interactive user interfaces in modified source and object code
 * versions of this program must display Appropriate Legal Notices,
 * as required under Section 5 of the GNU Affero General Public License.
 *
 */
package org.xbib.elasticsearch.index.analysis.langdetect;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Pattern;

public class LangdetectService extends AbstractLifecycleComponent<LangdetectService> {

    private final static Pattern word = Pattern.compile("[\\P{IsWord}]", Pattern.UNICODE_CHARACTER_CLASS);

    private final static String[] DEFAULT_LANGUAGES = new String[] { "af", "ar", "bg", "bn", "cs", "da", "de", "el",
            "en", "es", "et", "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id", "it", "ja", "kn", "ko", "lt",
            "lv", "mk", "ml", "mr", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv",
            "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw" };

    private Map<String, double[]> wordLangProbMap = new HashMap<String, double[]>();

    private List<String> langlist = new LinkedList<String>();

    private Map<String, String> langmap = new HashMap<String, String>();

    private double alpha;

    private double alpha_width;

    private int n_trial;

    private double[] priorMap;

    private int iteration_limit;

    private double prob_threshold;

    private double conv_threshold;

    private int base_freq;

    private Pattern filterPattern;

    @Inject
    public LangdetectService(Settings settings) {
        super(settings);
    }

    @Override
    protected void doStart() throws ElasticsearchException {
        load(settings);
        this.priorMap = null;
        this.n_trial = settings.getAsInt("number_of_trials", 7);
        this.alpha = settings.getAsDouble("alpha", 0.5);
        this.alpha_width = settings.getAsDouble("alpha_width", 0.05);
        this.iteration_limit = settings.getAsInt("iteration_limit", 10000);
        this.prob_threshold = settings.getAsDouble("prob_threshold", 0.1);
        this.conv_threshold = settings.getAsDouble("conv_threshold", 0.99999);
        this.base_freq = settings.getAsInt("base_freq", 10000);
        this.filterPattern = settings.get("pattern") != null
                ? Pattern.compile(settings.get("pattern"), Pattern.UNICODE_CHARACTER_CLASS)
                : null;
    }

    @Override
    protected void doStop() throws ElasticsearchException {
    }

    @Override
    protected void doClose() throws ElasticsearchException {
    }

    public Settings getSettings() {
        return settings;
    }

    private void load(Settings settings) {
        try {
            String[] keys = settings.getAsArray("languages");
            if (keys == null || keys.length == 0) {
                keys = DEFAULT_LANGUAGES;
            }
            int index = 0;
            int size = keys.length;
            for (String key : keys) {
                loadProfileFromResource(key, index++, size);
            }
            logger.debug("language detection service installed for {}", langlist);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            throw new ElasticsearchException(e.getMessage());
        }
        try {
            // map by settings
            Settings map = ImmutableSettings.EMPTY;
            if (settings.getByPrefix("map.") != null) {
                map = ImmutableSettings.settingsBuilder().put(settings.getByPrefix("map.")).build();
            }
            if (map.getAsMap().isEmpty()) {
                // is in "map" a resource name?
                String s = settings.get("map") != null ? settings.get("map") : "/langdetect/language.json";
                InputStream in = getClass().getResourceAsStream(s);
                if (in != null) {
                    map = ImmutableSettings.settingsBuilder().loadFromStream(s, in).build();
                }
            }
            this.langmap = map.getAsMap();
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            throw new ElasticsearchException(e.getMessage());
        }
    }

    public void loadProfileFromResource(String resource, int index, int langsize) throws IOException {
        InputStream in = getClass().getResourceAsStream("/langdetect/" + resource);
        if (in == null) {
            throw new IOException("profile '" + resource + "' not found");
        }
        ObjectMapper mapper = new ObjectMapper();
        LangProfile profile = mapper.readValue(in, LangProfile.class);
        addProfile(profile, index, langsize);
    }

    public void addProfile(LangProfile profile, int index, int langsize) throws IOException {
        String lang = profile.getName();
        if (langlist.contains(lang)) {
            throw new IOException("duplicate of the same language profile: " + lang);
        }
        langlist.add(lang);
        for (String word : profile.getFreq().keySet()) {
            if (!wordLangProbMap.containsKey(word)) {
                wordLangProbMap.put(word, new double[langsize]);
            }
            int length = word.length();
            if (length >= 1 && length <= 3) {
                double prob = profile.getFreq().get(word).doubleValue() / profile.getNWords()[length - 1];
                wordLangProbMap.get(word)[index] = prob;
            }
        }
    }

    /**
     * Set prior information about language probabilities.
     *
     * @param priorMap the priorMap to set
     * @throws LanguageDetectionException
     */
    public void setPriorMap(HashMap<String, Double> priorMap) throws LanguageDetectionException {
        this.priorMap = new double[langlist.size()];
        double sump = 0;
        for (int i = 0; i < this.priorMap.length; ++i) {
            String lang = langlist.get(i);
            if (priorMap.containsKey(lang)) {
                double p = priorMap.get(lang);
                if (p < 0) {
                    throw new LanguageDetectionException("Prior probability must be non-negative");
                }
                this.priorMap[i] = p;
                sump += p;
            }
        }
        if (sump <= 0) {
            throw new LanguageDetectionException("More one of prior probability must be non-zero");
        }
        for (int i = 0; i < this.priorMap.length; ++i) {
            this.priorMap[i] /= sump;
        }
    }

    public List<Language> detectAll(String text) throws LanguageDetectionException {
        List<Language> languages = new ArrayList<Language>();
        if (filterPattern != null && !filterPattern.matcher(text).matches()) {
            return languages;
        }
        List<String> list = new ArrayList<String>();
        languages = sortProbability(languages, detectBlock(list, text));
        return languages.subList(0, Math.min(languages.size(), settings.getAsInt("max", languages.size())));
    }

    private double[] detectBlock(List<String> list, String text) throws LanguageDetectionException {
        // clean all non-work characters from text
        text = text.replaceAll(word.pattern(), " ");
        extractNGrams(list, text);
        if (list.isEmpty()) {
            throw new LanguageDetectionException("no features in text");
        }
        double[] langprob = new double[langlist.size()];
        Random rand = new Random();
        Long seed = 0L;
        rand.setSeed(seed);
        for (int t = 0; t < n_trial; ++t) {
            double[] prob = initProbability();
            double a = this.alpha + rand.nextGaussian() * alpha_width;
            for (int i = 0;; ++i) {
                int r = rand.nextInt(list.size());
                updateLangProb(prob, list.get(r), a);
                if (i % 5 == 0) {
                    if (normalizeProb(prob) > conv_threshold || i >= iteration_limit) {
                        break;
                    }
                }
            }
            for (int j = 0; j < langprob.length; ++j) {
                langprob[j] += prob[j] / n_trial;
            }
        }
        return langprob;
    }

    private double[] initProbability() {
        double[] prob = new double[langlist.size()];
        if (priorMap != null) {
            System.arraycopy(priorMap, 0, prob, 0, prob.length);
        } else {
            for (int i = 0; i < prob.length; ++i) {
                prob[i] = 1.0 / langlist.size();
            }
        }
        return prob;
    }

    private void extractNGrams(List<String> list, String text) {
        NGram ngram = new NGram();
        for (int i = 0; i < text.length(); ++i) {
            ngram.addChar(text.charAt(i));
            for (int n = 1; n <= NGram.N_GRAM; ++n) {
                String w = ngram.get(n);
                if (w != null && wordLangProbMap.containsKey(w)) {
                    list.add(w);
                }
            }
        }
    }

    private boolean updateLangProb(double[] prob, String word, double alpha) {
        if (word == null || !wordLangProbMap.containsKey(word)) {
            return false;
        }
        double[] langProbMap = wordLangProbMap.get(word);
        double weight = alpha / base_freq;
        for (int i = 0; i < prob.length; ++i) {
            prob[i] *= weight + langProbMap[i];
        }
        return true;
    }

    private double normalizeProb(double[] prob) {
        double maxp = 0, sump = 0;
        for (double aProb : prob) {
            sump += aProb;
        }
        for (int i = 0; i < prob.length; ++i) {
            double p = prob[i] / sump;
            if (maxp < p) {
                maxp = p;
            }
            prob[i] = p;
        }
        return maxp;
    }

    private List<Language> sortProbability(List<Language> list, double[] prob) {
        for (int j = 0; j < prob.length; ++j) {
            double p = prob[j];
            if (p > prob_threshold) {
                for (int i = 0; i <= list.size(); ++i) {
                    if (i == list.size() || list.get(i).getProbability() < p) {
                        String code = langlist.get(j);
                        if (langmap != null && langmap.containsKey(code)) {
                            code = langmap.get(code);
                        }
                        list.add(i, new Language(code, p));
                        break;
                    }
                }
            }
        }
        return list;
    }

}