de.tudarmstadt.lt.lm.app.GenerateNgrams.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.lt.lm.app.GenerateNgrams.java

Source

/*
 *   Copyright 2014
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.lm.app;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.lm.AbstractLanguageModel;
import de.tudarmstadt.lt.lm.service.AbstractStringProvider;
import de.tudarmstadt.lt.lm.service.LtSegProvider;

/**
 * Command line tool that extracts ngrams from all '.txt' files of a directory and writes them,
 * one ngram per line, into a single gzip-compressed UTF-8 file inside that directory.
 *
 * @author Steffen Remus
 */
public class GenerateNgrams {

    private static final Logger LOG = LoggerFactory.getLogger(GenerateNgrams.class);

    /**
     * Prints an optional message followed by the option help to stdout and terminates the JVM
     * with exit code 1. This method does not return normally.
     *
     * @param opts    the options to describe, or {@code null} to skip the help text
     * @param message a message printed before the help text, or {@code null} for none
     */
    static void print_usage(Options opts, String message) {
        if (message != null) {
            System.out.println(message);
        }
        if (opts != null) {
            new HelpFormatter().printHelp(new PrintWriter(System.out, true), 80,
                    "... " + GenerateNgrams.class.getName() + " <options>", "Options", opts, 2, 2, "======");
        }
        System.exit(1);
    }

    /**
     * Parses the command line, instantiates the requested string provider and generates the ngram
     * file for the given directory. Any invalid argument results in the usage message and exit code 1.
     *
     * @param args the command line arguments
     */
    @SuppressWarnings("static-access")
    public static void main(String[] args) {
        Options opts = new Options();
        opts.addOption(OptionBuilder.withLongOpt("help").withDescription("Display help message.").create("?"));
        opts.addOption(OptionBuilder.withLongOpt("ptype").withArgName("class").hasArg().withDescription(
                "specify the instance of the language model provider that you want to use: {LtSegProvider, BreakIteratorStringProvider, UimaStringProvider, PreTokenizedStringProvider} (default: LtSegProvider)")
                .create("p"));
        opts.addOption(OptionBuilder.withLongOpt("cardinality").withArgName("ngram-order").hasArg().withDescription(
                "Specify the cardinality of the ngrams (min. 1). Specify a range using 'from-to'. (Examples: 5 = extract 5grams; 1-5 = extract 1grams, 2grams, ..., 5grams; default: 1-5).")
                .create("n"));
        opts.addOption(OptionBuilder.withLongOpt("dir").withArgName("directory").isRequired().hasArg()
                .withDescription(
                        "specify the directory that contains '.txt' files that are used as source for generating ngrams.")
                .create("d"));
        opts.addOption(OptionBuilder.withLongOpt("overwrite").withDescription("Overwrite existing ngram file.")
                .create("w"));

        // print_usage terminates the JVM; the explicit returns below only make the
        // control flow obvious to readers and to the compiler's assignment analysis.
        CommandLine cli;
        try {
            cli = new GnuParser().parse(opts, args);
        } catch (Exception e) {
            print_usage(opts, e.getMessage());
            return;
        }
        if (cli.hasOption("?")) {
            print_usage(opts, null);
            return;
        }

        String provider_name = cli.getOptionValue("ptype", LtSegProvider.class.getSimpleName());
        AbstractStringProvider prvdr;
        try {
            prvdr = StartLM.getStringProviderInstance(provider_name);
        } catch (Exception e) {
            print_usage(opts,
                    String.format("Could not instantiate LmProvider '%s': %s", provider_name, e.getMessage()));
            return;
        }

        String cardinality_spec = cli.getOptionValue("cardinality", "1-5");
        int[] cardinality_range;
        try {
            cardinality_range = parseCardinalityRange(cardinality_spec);
        } catch (IllegalArgumentException e) {
            // NumberFormatException is an IllegalArgumentException, so malformed numbers end up here too.
            print_usage(opts, String.format("Invalid cardinality '%s': %s", cardinality_spec, e.getMessage()));
            return;
        }

        final File src_dir = new File(cli.getOptionValue("dir"));
        // 'overwrite' is a flag without an argument, so its presence (not its value) is what matters.
        boolean overwrite = cli.hasOption("overwrite");

        generateNgrams(src_dir, prvdr, cardinality_range[0], cardinality_range[1], overwrite);
    }

    /**
     * Parses a cardinality specification of the form {@code "n"}, {@code "from-to"} or {@code "-to"}.
     * A single number yields the range {@code [n, n]}, a leading dash yields {@code [1, to]} and a
     * lower bound below 1 is raised to 1.
     *
     * @param spec the cardinality specification
     * @return a two element array {@code {from, to}} with {@code 1 <= from <= to}
     * @throws IllegalArgumentException if a bound is not a number, the upper bound is smaller
     *                                  than 1, or the lower bound exceeds the upper bound
     */
    private static int[] parseCardinalityRange(String spec) {
        int dash_index = spec.indexOf('-');
        // Without a dash indexOf returns -1, so the whole string is parsed as the upper bound.
        int to = Integer.parseInt(spec.substring(dash_index + 1).trim());
        int from = to;
        if (dash_index == 0) {
            from = 1;
        } else if (dash_index > 0) {
            from = Math.max(1, Integer.parseInt(spec.substring(0, dash_index).trim()));
        }
        if (to < 1) {
            throw new IllegalArgumentException("cardinality must be at least 1, but was " + to);
        }
        if (from > to) {
            throw new IllegalArgumentException("lower bound " + from + " exceeds upper bound " + to);
        }
        return new int[] { from, to };
    }

    /**
     * Generates ngrams of the cardinalities {@code from_cardinality} to {@code to_cardinality}
     * (both inclusive) from every '.txt' file directly inside {@code src_dir}. Files are read as
     * UTF-8 line by line; each line is split into sentences and tokens by {@code prvdr}. The ngrams
     * are written to {@code <src_dir>/<name of src_dir>.ngrams.txt.gz}, one ngram per line with
     * tokens separated by a single space.
     *
     * @param src_dir          the directory containing the '.txt' source files
     * @param prvdr            the provider used for sentence splitting and tokenization
     * @param from_cardinality the smallest ngram cardinality to generate
     * @param to_cardinality   the largest ngram cardinality to generate
     * @param overwrite        whether an already existing ngram file is replaced; if {@code false}
     *                         and the file exists, it is returned untouched
     * @return the ngram file, or {@code null} if the source directory could not be listed or the
     *         output file could not be opened for writing
     */
    public static File generateNgrams(File src_dir, AbstractStringProvider prvdr, int from_cardinality,
            int to_cardinality, boolean overwrite) {
        final File ngram_file = new File(src_dir, String.format("%s.%s", src_dir.getName(), "ngrams.txt.gz"));
        if (ngram_file.exists()) {
            LOG.info("Output file already exists: '{}'.", ngram_file.getAbsolutePath());
            if (!overwrite) {
                return ngram_file;
            }
            LOG.info("Overwriting file: '{}'.", ngram_file.getAbsolutePath());
            if (!ngram_file.delete()) {
                LOG.warn("Could not delete existing file: '{}'.", ngram_file.getAbsolutePath());
            }
        }

        File[] src_files = src_dir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File f) {
                return f.isFile() && f.getName().endsWith(".txt") && (!f.equals(ngram_file));
            }
        });
        // listFiles returns null (not an empty array) if src_dir is not a directory or cannot be read.
        if (src_files == null) {
            LOG.error("Could not list files in directory: '{}'.", src_dir.getAbsolutePath());
            return null;
        }

        String[] basenames = new String[src_files.length];
        for (int i = 0; i < basenames.length; i++) {
            basenames[i] = src_files[i].getName();
        }

        LOG.info("Reading txt files from dir: '{}'; Files: {}.", src_dir.getAbsolutePath(),
                StringUtils.abbreviate(Arrays.toString(basenames), 200));
        LOG.info("Writing ngrams to file: '{}'.", ngram_file.getAbsolutePath());

        PrintWriter pw;
        try {
            pw = openNgramWriter(ngram_file);
        } catch (IOException e) {
            LOG.error("Could not open writer for file: '{}'.", ngram_file.getAbsolutePath(), e);
            return null;
        }

        long num_ngrams = 0L;
        try {
            for (int i = 0; i < src_files.length; i++) {
                File src_file = src_files[i];
                LOG.info("Processing file {} / {} ('{}')", i + 1, src_files.length, src_file.getAbsolutePath());
                long num_ngrams_f = writeNgrams(src_file, prvdr, from_cardinality, to_cardinality, pw);
                LOG.debug("Generated {} ngrams from file {}.", num_ngrams_f, src_file);
                num_ngrams += num_ngrams_f;
            }
        } finally {
            // Close even if an unchecked exception escapes, so the gzip trailer is written.
            pw.close();
        }
        // PrintWriter never throws on write failures; it only records them.
        if (pw.checkError()) {
            LOG.error("Errors occurred while writing to file: '{}'.", ngram_file.getAbsolutePath());
        }

        LOG.info("Generated {} ngrams.", num_ngrams);

        return ngram_file;
    }

    /**
     * Opens a gzip-compressing UTF-8 writer for the given file and releases the underlying file
     * stream again if the compression stream cannot be set up.
     */
    private static PrintWriter openNgramWriter(File ngram_file) throws IOException {
        FileOutputStream file_out = new FileOutputStream(ngram_file);
        try {
            return new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(file_out), "UTF-8"));
        } catch (IOException e) {
            try {
                file_out.close();
            } catch (IOException ignored) {
                // the failure that prevented opening the writer is the one worth reporting
            }
            throw e;
        }
    }

    /**
     * Writes the ngrams of a single source file to {@code pw} and returns how many were written.
     * Failures are logged and never propagated: a sentence that cannot be processed is skipped,
     * and a file that cannot be read contributes only the ngrams written before the failure.
     */
    private static long writeNgrams(File src_file, AbstractStringProvider prvdr, int from_cardinality,
            int to_cardinality, PrintWriter pw) {
        long num_ngrams_f = 0L;
        try {
            LineIterator liter = new LineIterator(
                    new BufferedReader(new InputStreamReader(new FileInputStream(src_file), "UTF-8")));
            try {
                int lc = 0;
                while (liter.hasNext()) {
                    if (++lc % 1000 == 0) {
                        LOG.debug("Processing line {} ({})", lc, src_file);
                    }
                    String line = liter.next();
                    for (String sentence : prvdr.splitSentences(line)) {
                        for (int n = from_cardinality; n <= to_cardinality; n++) {
                            List<String>[] ngrams;
                            try {
                                // NOTE(review): the sentence is tokenized again for every cardinality; this
                                // could be hoisted out of the loop if tokenizeSentence is side-effect free - confirm.
                                List<String> tokens = prvdr.tokenizeSentence(sentence);
                                if (tokens.isEmpty()) {
                                    continue;
                                }
                                ngrams = AbstractLanguageModel.getNgramSequence(tokens, n);
                            } catch (Exception e) {
                                LOG.warn(
                                        "Could not get ngram of cardinality {} from String '{}' in line '{}' from file '{}'.",
                                        n, StringUtils.abbreviate(line, 100), lc, src_file.getAbsolutePath(), e);
                                continue;
                            }
                            for (List<String> ngram : ngrams) {
                                pw.println(StringUtils.join(ngram, " "));
                            }
                            pw.flush();
                            num_ngrams_f += ngrams.length;
                        }
                    }
                }
            } finally {
                // Release the file handle even if reading or sentence splitting fails midway.
                liter.close();
            }
        } catch (Exception e) {
            LOG.warn("Could not read file '{}'.", src_file.getAbsolutePath(), e);
        }
        return num_ngrams_f;
    }

}