org.zoneproject.extractor.plugin.langdetect.LangDetect.java Source code

Java tutorial

Introduction

Here is the source code for org.zoneproject.extractor.plugin.langdetect.LangDetect.java

Source

package org.zoneproject.extractor.plugin.langdetect;

/*
 * #%L
 * ZONE-plugin-LangDetect
 * %%
 * Copyright (C) 2012 ZONE-project
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */
import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
import java.io.IOException;
import java.io.InputStream;
import java.net.JarURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.IOUtils;

/**
 *
 * @author Desclaux Christophe <christophe@zouig.org>
 */
public class LangDetect {
    public static void init(String profileDirectory) throws LangDetectException, IOException {
        Enumeration<URL> en = Detector.class.getClassLoader().getResources(profileDirectory);
        List<String> profiles = new ArrayList<String>();
        if (en.hasMoreElements()) {
            URL url = en.nextElement();
            JarURLConnection urlcon = (JarURLConnection) url.openConnection();
            JarFile jar = urlcon.getJarFile();
            Enumeration<JarEntry> entries = jar.entries();
            while (entries.hasMoreElements()) {
                String entry = entries.nextElement().getName();
                if (entry.startsWith(profileDirectory)) {
                    InputStream in = Detector.class.getClassLoader().getResourceAsStream(entry);
                    profiles.add(IOUtils.toString(in));
                }
            }
        }
        DetectorFactory.loadProfile(profiles);

    }

    public static ArrayList<Language> detectLangs(String text) throws LangDetectException {
        Detector detector = DetectorFactory.create();
        detector.append(text);
        return detector.getProbabilities();
    }

    public static String detectLang(String text) {
        if (text.length() < 15)
            return "en";

        try {
            double better = 0;
            String betterLang = "en";
            for (Language cur : LangDetect.detectLangs(text)) {
                if (better < cur.prob) {
                    better = cur.prob;
                    betterLang = cur.lang;
                }
            }
            return betterLang;
        } catch (LangDetectException ex) {
            return "en";
        }
    }

    public static void main(String[] args) throws LangDetectException, IOException {
        LangDetect.init("profiles/");
        String text = "Hello my name is descl, i come from ingland. Ce sera un vnement musical.";
        System.out.println(LangDetect.detectLang(text));
    }
}