com.cybozu.labs.langdetect.CommandLineInterface.java Source code

Java tutorial

Introduction

Here is the source code for com.cybozu.labs.langdetect.CommandLineInterface.java

Source

/*
 * Copyright 2011 Nakatani Shuyo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cybozu.labs.langdetect;

import be.frma.langguess.LangProfileWriter;
import com.cybozu.labs.langdetect.util.LangProfile;
import com.google.common.base.Optional;
import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.io.*;
import java.util.*;

/**
 * LangDetect Command Line Interface.
 *
 * <p>This is a command line interface of Language Detection Library "LangDetect".</p>
 *
 * <p>One of the commands {@code --genprofile}, {@code --detectlang} or {@code --batchtest} selects
 * the action. The options {@code -d} (profile directory), {@code -a} (alpha) and {@code -s} (seed)
 * each consume the following argument as their value; every other argument starting with
 * {@code -} is treated as a flag, and all remaining arguments are positional.</p>
 *
 * <p>Renamed: this class was previously known as "Command".</p>
 *
 * <p>TODO after my recent changes switching to the new Detector this code is untested. -Fabian</p>
 *
 * @author Nakatani Shuyo
 * @author Francois ROLAND
 * @author Fabian Kessler
 */
public class CommandLineInterface {

    /** smoothing default parameter (ELE) */
    private static final double DEFAULT_ALPHA = 0.5;

    /** Label under which batch test entries without a detected language are counted. */
    private static final String UNKNOWN_LANGUAGE = "unknown";

    /** Maps an option switch that takes a value (e.g. "-d") to the key its value is stored under. */
    private final Map<String, String> optWithValue = new HashMap<>();
    /** Option values by key; pre-filled with the defaults registered through {@link #addOpt}. */
    private final Map<String, String> values = new HashMap<>();
    /** Flags (arguments starting with "-") that do not take a value, e.g. "--detectlang". */
    private final Set<String> optWithoutValue = new HashSet<>();
    /** Positional arguments in the order they were given. */
    private final List<String> argList = new ArrayList<>();

    /**
     * Command Line Interface
     *
     * <p>Registers the value options, parses the arguments and runs the first command found among
     * {@code --genprofile}, {@code --detectlang} and {@code --batchtest}. If none is given, a usage
     * message is printed to standard error.</p>
     *
     * @param args command line arguments
     * @throws IOException if reading the profiles or the input files fails
     */
    public static void main(String[] args) throws IOException {
        CommandLineInterface cli = new CommandLineInterface();
        cli.addOpt("-d", "directory", "./");
        cli.addOpt("-a", "alpha", "" + DEFAULT_ALPHA);
        cli.addOpt("-s", "seed", null);
        cli.parse(args);

        if (cli.hasParam("--genprofile")) {
            cli.generateProfile();
        } else if (cli.hasParam("--detectlang")) {
            cli.detectLang();
        } else if (cli.hasParam("--batchtest")) {
            cli.batchTest();
        } else {
            // Previously a missing command ended silently; tell the user what is expected.
            printUsage();
        }
    }

    /**
     * Prints the supported commands to standard error.
     */
    private static void printUsage() {
        System.err.println("usage: --genprofile [directory] [language name]");
        System.err.println("       --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]");
        System.err.println("       --batchtest -d [profile directory] -a [alpha] -s [seed] [test data(s)]");
    }

    /**
     * Command line easy parser
     *
     * <p>Sorts every argument into one of three buckets: value of a registered option, flag, or
     * positional argument.</p>
     *
     * @param args command line arguments
     * @throws IllegalArgumentException if a registered value option is the last argument and
     *                                  therefore has no value
     */
    private void parse(String[] args) {
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (optWithValue.containsKey(arg)) {
                if (i + 1 >= args.length) {
                    throw new IllegalArgumentException("Missing value for command line option: " + arg);
                }
                // The next argument is the option's value; consume it here.
                values.put(optWithValue.get(arg), args[++i]);
            } else if (arg.startsWith("-")) {
                optWithoutValue.add(arg);
            } else {
                argList.add(arg);
            }
        }
    }

    /**
     * Registers an option that takes a value.
     *
     * @param opt   the switch as typed on the command line, e.g. "-d"
     * @param key   the key under which the value is stored
     * @param value the default value, may be {@code null}
     */
    private void addOpt(String opt, String key, String value) {
        optWithValue.put(opt, key);
        values.put(key, value);
    }

    /**
     * Returns the non-empty value stored for the given key.
     *
     * @param key the option key, e.g. "directory"
     * @throws RuntimeException if no value, or an empty one, is stored for the key
     */
    @NotNull
    private String requireParamString(@NotNull String key) {
        String s = values.get(key);
        if (s == null || s.isEmpty()) {
            throw new RuntimeException("Missing command line param: " + key);
        }
        return s;
    }

    /**
     * Returns the double, or the default if absent. Throws if the double is specified but invalid.
     *
     * @param key          the option key, e.g. "alpha"
     * @param defaultValue returned when no value, or an empty one, is stored for the key
     * @throws RuntimeException if the stored value cannot be parsed as a double
     */
    private double getParamDouble(String key, double defaultValue) {
        String value = values.get(key);
        if (value == null || value.isEmpty()) {
            return defaultValue;
        }
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid double value: >>>" + value + "<<<", e);
        }
    }

    /**
     * Returns the long, or {@code null} if absent. Throws if the long is specified but invalid.
     *
     * @param key the option key, e.g. "seed"
     * @throws RuntimeException if the stored value cannot be parsed as a long
     */
    @Nullable
    private Long getParamLongOrNull(String key) {
        String value = values.get(key);
        if (value == null || value.isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(value);
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid long value: >>>" + value + "<<<", e);
        }
    }

    /**
     * Tells whether the given flag (an option without value) was present on the command line.
     */
    private boolean hasParam(String opt) {
        return optWithoutValue.contains(opt);
    }

    /**
     * File search (easy glob)
     *
     * <p>Only the direct children of the directory are examined. If several names match, which one
     * is returned depends on the order in which the file system lists them.</p>
     *
     * @param directory directory path
     * @param pattern   regular expression that the whole file name must match
     * @return a matching file, or {@code null} if none matches
     * @throws IllegalArgumentException if {@code directory} is not a directory
     * @throws IllegalStateException    if the directory content cannot be listed
     */
    @Nullable
    private File searchFile(File directory, String pattern) {
        if (!directory.isDirectory()) {
            throw new IllegalArgumentException("Not a directory: " + directory);
        }
        File[] files = directory.listFiles();
        if (files == null) {
            // listFiles() also returns null when an I/O error occurs, so an assert is not enough.
            throw new IllegalStateException("Could not list files in: " + directory);
        }
        for (File file : files) {
            if (file.getName().matches(pattern)) {
                return file;
            }
        }
        return null;
    }

    /**
     * Generate Language Profile from a text file.
     *
     * <p>Looks in the given directory for a file named
     * {@code [language name]wiki-*-abstract.xml*} and writes the resulting profile to a file named
     * after the language in the current working directory. Problems are reported on standard
     * error instead of being thrown.</p>
     *
     * <pre>
     * usage: --genprofile [directory] [language name]
     * </pre>
     *
     */
    public void generateProfile() {
        if (argList.size() < 2) {
            System.err.println("usage: --genprofile [directory] [language name]");
            return;
        }
        File directory = new File(argList.get(0));
        String lang = argList.get(1);
        File file = searchFile(directory, lang + "wiki-.*-abstract\\.xml.*");
        if (file == null) {
            System.err.println("Not Found text file : lang = " + lang);
            return;
        }

        try {
            // Build the profile before opening the output so that a failing load does not leave an
            // empty (or truncated) profile file behind.
            LangProfile profile = GenProfile.load(lang, file);
            profile.omitLessFreq();
            try (FileOutputStream outputStream = new FileOutputStream(new File(lang))) {
                new LangProfileWriter().write(profile, outputStream);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Language detection test for each file (--detectlang option)
     *
     * <p>Each positional argument is read as a UTF-8 text file; the file name and the detected
     * language probabilities are printed to standard output.</p>
     *
     * <pre>
     * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
     * </pre>
     *
     * @throws IOException if the profiles or one of the files cannot be read
     */
    public void detectLang() throws IOException {
        LanguageDetector languageDetector = makeDetector();
        TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();

        for (String filename : argList) {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(filename), "utf-8"))) {
                TextObject textObject = textObjectFactory.create().append(reader);
                List<DetectedLanguage> probabilities = languageDetector.getProbabilities(textObject);
                System.out.println(filename + ":" + probabilities);
            }
        }
    }

    /**
     * Batch Test of Language Detection (--batchtest option)
     *
     * <pre>
     * usage: --batchtest -d [profile directory] -a [alpha] -s [seed] [test data(s)]
     * </pre>
     *
     * The format of test data(s):
     * <pre>
     *   [correct language name]\t[text body for test]\n
     * </pre>
     *
     * <p>Lines without a tab, or with an empty language name, are skipped. With the
     * {@code --debug} flag every tested line is echoed together with its detection result.
     * A summary is printed after each input file; it is cumulative, that is, it also covers the
     * files processed before.</p>
     *
     * <p>The detected language is compared to the expected one by name, so the test data is
     * expected to use the names produced by {@code LdLocale.toString()}.</p>
     *
     * @throws IOException if the profiles or one of the files cannot be read
     */
    public void batchTest() throws IOException {
        LanguageDetector languageDetector = makeDetector();
        TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
        boolean debug = hasParam("--debug");

        // expected language name -> detected language name of every line tested for it
        Map<String, List<String>> result = new HashMap<>();
        for (String filename : argList) {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(filename), "utf-8"))) {
                String line;
                // readLine() returning null is the reliable end-of-stream signal; ready() only
                // tells whether a read would block.
                while ((line = reader.readLine()) != null) {
                    int idx = line.indexOf('\t');
                    if (idx <= 0) {
                        continue;
                    }
                    String correctLang = line.substring(0, idx);
                    String text = line.substring(idx + 1);

                    TextObject textObject = textObjectFactory.forText(text);
                    Optional<LdLocale> lang = languageDetector.detect(textObject);
                    // Unwrap the Optional: its own toString() yields "Optional.of(..)", which
                    // could never equal the expected language name.
                    String detectedLang = lang.isPresent() ? lang.get().toString() : UNKNOWN_LANGUAGE;

                    List<String> detections = result.get(correctLang);
                    if (detections == null) {
                        detections = new ArrayList<>();
                        result.put(correctLang, detections);
                    }
                    detections.add(detectedLang);

                    if (debug) {
                        System.out.println(correctLang + "," + detectedLang + ","
                                + (text.length() > 100 ? text.substring(0, 100) : text));
                    }
                }
            }

            printBatchSummary(result);
        }
    }

    /**
     * Prints, per expected language in alphabetical order, how many entries were detected
     * correctly and the distribution of detected languages, followed by the overall total.
     *
     * @param result expected language name mapped to the detected names of its test entries
     */
    private static void printBatchSummary(Map<String, List<String>> result) {
        List<String> langList = new ArrayList<>(result.keySet());
        Collections.sort(langList);

        int totalCount = 0;
        int totalCorrect = 0;
        for (String lang : langList) {
            List<String> detections = result.get(lang);
            Map<String, Integer> resultCount = new HashMap<>();
            for (String detectedLang : detections) {
                Integer previous = resultCount.get(detectedLang);
                resultCount.put(detectedLang, previous == null ? 1 : previous + 1);
            }
            int count = detections.size();
            Integer hits = resultCount.get(lang);
            int correct = hits == null ? 0 : hits;
            double rate = correct / (double) count;
            System.out.println(String.format("%s (%d/%d=%.2f): %s", lang, correct, count, rate, resultCount));
            totalCorrect += correct;
            totalCount += count;
        }
        System.out.println(String.format("total: %d/%d = %.3f", totalCorrect, totalCount,
                totalCorrect / (double) totalCount));
    }

    /**
     * Using all language profiles from the given directory.
     *
     * <p>The directory, alpha and seed are taken from the {@code -d}, {@code -a} and {@code -s}
     * options; the seed is optional.</p>
     *
     * @throws IOException if the language profiles cannot be read
     */
    private LanguageDetector makeDetector() throws IOException {
        double alpha = getParamDouble("alpha", DEFAULT_ALPHA);
        String profileDirectory = requireParamString("directory") + "/";
        Optional<Long> seed = Optional.fromNullable(getParamLongOrNull("seed"));

        List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAll(new File(profileDirectory));

        return LanguageDetectorBuilder.create(NgramExtractors.standard()).alpha(alpha).seed(seed)
                .shortTextAlgorithm(50).withProfiles(languageProfiles).build();
    }

}