de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java Source code

Introduction

Here is the source code for de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java, a command-line tool that extracts ngrams from plain-text files, counts them (together with follower/preceder statistics) via external python and unix tools, and writes the results into Lucene indexes.
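
Before the listing, a minimal usage sketch (not part of the original page) showing how the static generate_index helper below might be called from your own code. The corpus path and the no-argument LtSegProvider constructor are assumptions; adjust the provider and paths to your setup.

import java.io.File;

import de.tudarmstadt.lt.lm.app.GenerateNgramIndex;
import de.tudarmstadt.lt.lm.service.LtSegProvider;

public class GenerateNgramIndexExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical corpus directory containing the '.txt' files to index.
        File src_dir = new File("/path/to/corpus");
        // Extract 1- to 5-grams, keep only ngrams occurring at least twice, overwrite any existing index.
        // Assumes LtSegProvider can be instantiated with a no-argument constructor.
        File index_dir = GenerateNgramIndex.generate_index(src_dir, new LtSegProvider(), 1, 5, 2, true);
        System.out.println("Ngram index written to " + index_dir.getAbsolutePath());
    }
}

The same run can also be started from the command line via the main method, e.g. with the options -f /path/to/corpus -n 1-5 -m 2 -w (see the option definitions in the constructor below).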

Source

/*
 *   Copyright 2014
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.lm.app;

import static de.tudarmstadt.lt.utilities.ProcessUtils.run_process;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.lm.service.AbstractStringProvider;
import de.tudarmstadt.lt.lm.service.LtSegProvider;
import de.tudarmstadt.lt.lm.util.Properties;
import de.tudarmstadt.lt.utilities.ArrayUtils;
import de.tudarmstadt.lt.utilities.cli.CliUtils;
import de.tudarmstadt.lt.utilities.cli.ExtendedGnuParser;

/**
 * Generates a raw ngram file for a text corpus (delegating to the {@code Ngrams} app),
 * derives count files and follower/preceder statistics from it, and writes the results
 * into Lucene indexes ("ngram" and "vocab") below the index directory.
 *
 * @author Steffen Remus
 */
public class GenerateNgramIndex implements Runnable {

    private static Logger LOG = LoggerFactory.getLogger(GenerateNgramIndex.class);
    private final static String USAGE_HEADER = "Options:";

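    /* Working directory for the external python map/reduce scripts (mr_ngram_*.py) invoked in count_ngrams(). */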
    private static File py_src = new File(Properties.pythonBinariesDirectory());

    public static void main(String[] args) throws IOException {
        new GenerateNgramIndex(args).run();
    }

    /**
     * Default constructor; pass options via double-brace initialization,
     * e.g. {@code new GenerateNgramIndex(){{ _mincount = 2; }}}.
     */
    public GenerateNgramIndex() {
        /* nothing to do here */
    }

    @SuppressWarnings("static-access")
    public GenerateNgramIndex(String[] args) {

        Options opts = new Options();
        opts.addOption(OptionBuilder.withLongOpt("help").withDescription("Display help message.").create("?"));
        opts.addOption(OptionBuilder.withLongOpt("mincount").withArgName("int").hasArg().withDescription(
                "specify the number of times an ngram must occur to be considered in further calculations. Ngrams with counts below mincount are filtered. (default: 1).")
                .create("m"));
        opts.addOption(OptionBuilder.withLongOpt("overwrite").hasOptionalArg().withArgName("{true|false}")
                .withDescription("Overwrite existing index (default: false).").create("w"));

        // options parsed and passed to Ngrams app
        opts.addOption(OptionBuilder.withLongOpt("ptype").withArgName("class").hasArg().withDescription(
                "specify the instance of the language model provider that you want to use: {LtSegProvider, BreakIteratorStringProvider, UimaStringProvider, PreTokenizedStringProvider} (default: LtSegProvider)")
                .create("p"));
        opts.addOption(OptionBuilder.withLongOpt("cardinality").withArgName("ngram-order").hasArg().withDescription(
                "Specify the cardinality of the ngrams (min. 1). Specify a range using 'from-to'. (Examples: 5 = extract 5grams; 1-5 = extract 1grams, 2grams, ..., 5grams; default: 1-5).")
                .create("n"));
        opts.addOption(OptionBuilder.withLongOpt("file").withArgName("filename").hasArg().withDescription(
                "Specify the file or directory that contains '.txt' file to read from. Specify '-' to read from stdin. (Note: when no output directory is provided this parameter must refer to a directory!)")
                .isRequired().create("f"));
        opts.addOption(OptionBuilder.withLongOpt("dest").withArgName("directory").hasArg().withDescription(
                "Specify the index directory for the related language model files. (default: <srcdir>/.lmindex)")
                .create("d"));
        opts.addOption(OptionBuilder.withLongOpt("accross_sentences").hasOptionalArg().withArgName("{true|false}")
                .withDescription("Generate Ngrams across sentence boundaries.").create("a"));

        try {
            CommandLine cmd = new ExtendedGnuParser(true).parse(opts, args);
            if (cmd.hasOption("help"))
                CliUtils.print_usage_quit(System.err, GenerateNgramIndex.class.getSimpleName(), opts, USAGE_HEADER, null, 0);

            _provider_type_ = cmd.getOptionValue("ptype", LtSegProvider.class.getSimpleName());
            _file_ = cmd.getOptionValue("file", "-");
            _out_ = cmd.getOptionValue("dest");

            if (_out_ != null)
                _index_dir = new File(_out_);
            else {
                if ("-".equals(_file_))
                    throw new IllegalArgumentException(
                            "An output directory must be provided when generating ngrams from stdin. Please provide a destination directory or generate ngrams from a directory.");
                File f_or_dir = new File(_file_);
                if (!f_or_dir.exists())
                    throw new IllegalArgumentException(String.format("Input file or directory '%s' does not exist.",
                            f_or_dir.getAbsolutePath()));
                if (f_or_dir.isFile())
                    throw new IllegalArgumentException(
                            "An output directory must be provided when generating ngrams from a single file. Please provide a destination directory or generate ngrams from a directory.");
                if (f_or_dir.isDirectory())
                    _index_dir = new File(f_or_dir, ".lmindex");
            }

            String order = cmd.getOptionValue("cardinality", "1-5");
            _accross_sentences_ = cmd.hasOption("accross_sentences");
            if (_accross_sentences_ && cmd.getOptionValue("accross_sentences") != null)
                _accross_sentences_ = Boolean.parseBoolean(cmd.getOptionValue("accross_sentences"));

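            // Parse the ngram order, given either as a single value ("5") or as a range ("1-5").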
            int dash_index = order.indexOf('-');
            _order_to_ = Integer.parseInt(order.substring(dash_index + 1, order.length()).trim());
            _order_from_ = _order_to_;
            if (dash_index == 0)
                _order_from_ = 1;
            if (dash_index > 0)
                _order_from_ = Math.max(1, Integer.parseInt(order.substring(0, dash_index).trim()));

            _overwrite = cmd.hasOption("overwrite");
            if (_overwrite && cmd.getOptionValue("overwrite") != null)
                _overwrite = Boolean.parseBoolean(cmd.getOptionValue("overwrite"));

            _mincount = Integer.parseInt(cmd.getOptionValue("mincount", "1"));

        } catch (Exception e) {
            CliUtils.print_usage_quit(System.err, GenerateNgramIndex.class.getSimpleName(), opts, USAGE_HEADER,
                    String.format("%s: %s%n", e.getClass().getSimpleName(), e.getMessage()), 1);
        }

    }

    boolean _overwrite;
    int _mincount;

    File _ngram_file;

    String _provider_type_;
    AbstractStringProvider _provider;
    String _file_;
    String _out_;
    File _index_dir;
    int _order_to_;
    int _order_from_;
    boolean _accross_sentences_;

    /* (non-Javadoc)
     * @see java.lang.Runnable#run()
     */
    @Override
    public void run() {
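        // Step 1: make sure the raw gzipped ngram file exists, delegating to the Ngrams app if necessary.
        // Step 2: build the count files and Lucene indexes from it via generate_index().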
        if (!_index_dir.exists())
            _index_dir.mkdir();
        _ngram_file = new File(_index_dir, "ngram.raw.txt.gz");
        if (_overwrite && _ngram_file.exists())
            _ngram_file.delete();

        if (!_ngram_file.exists()) {
            new Ngrams() {
                {

                    _provider_type = _provider_type_;
                    _prvdr = _provider;
                    _file = _file_;
                    _out = _ngram_file.getAbsolutePath();
                    _order_from = _order_from_;
                    _order_to = _order_to_;
                    _accross_sentences = _accross_sentences_;

                }
            }.run();
        }

        try {
            generate_index();
        } catch (IOException e) {
            LOG.error("Could not generate Ngram index on file '{}'. {}: {}", _file_, e.getClass().getName(),
                    e.getMessage());
        }

    }

    public static File generate_index(File src_dir, AbstractStringProvider prvdr, int order_from, int order_to,
            int mincount, boolean overwrite_existing_index) throws IOException {
        if (!src_dir.exists())
            throw new IOException(String.format("Input dir '%s' does not exist.", src_dir.getAbsolutePath()));
        GenerateNgramIndex g = new GenerateNgramIndex() {
            {
                _file_ = src_dir.getAbsolutePath();
                _index_dir = new File(src_dir, ".lmindex");
                _provider = prvdr;
                _order_from_ = order_from;
                _order_to_ = order_to;
                _mincount = mincount;
                _overwrite = overwrite_existing_index;
            }
        };
        g.run();
        return g._index_dir;
    }

    public File generate_index() throws IOException {
        if (!_index_dir.exists())
            _index_dir.mkdirs();

        LOG.info(String.format("Creating count files ..."));
        count_ngrams();

        LOG.info(String.format("Creating ngram index from count files ..."));
        LOG.info(String.format("Adding ngrams to index ..."));
        File ngram_joined_counts_file = new File(_index_dir, "ngram.counts.joined.txt.gz");
        create_ngram_index(ngram_joined_counts_file);

        LOG.info(String.format("Adding vocabulary to index ..."));
        File vocab_file = new File(_index_dir, "ngram.vocabulary.txt.gz");
        create_vocabulary_index(vocab_file);
        LOG.info(String.format("Finished."));

        return _index_dir;
    }

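    /**
     * Creates the intermediate count files (ngram counts, vocabulary, N_follow, N_precede,
     * N_follower_precede and their join) by piping the raw ngram file through the external
     * python scripts, gzip, sort and join in a subshell. Existing files are kept unless
     * --overwrite is given.
     */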
    public void count_ngrams() {
        String sort_extra_param = "";
        if (Properties.additionalSortParam() != null && !Properties.additionalSortParam().isEmpty()) {
            sort_extra_param = Properties.additionalSortParam();
            sort_extra_param = sort_extra_param.replace("${indexdir}", _index_dir.getAbsolutePath());
            LOG.info("Sort extra parameter = '{}'", sort_extra_param);
        }
        LOG.info("Current directory: '{}'", new File("").getAbsolutePath());

        String command = null;
        LOG.info(String.format("Counting ngrams."));
        File ngram_count_file = new File(_index_dir, "ngram.counts.txt.gz");
        if (ngram_count_file.exists() && !_overwrite)
            LOG.info("File '{}' already exists.", ngram_count_file);
        else {
            command = String.format(
                    "(export LC_ALL=C; cat %s | gzip -c -d | python mr_ngram_count.py -m | sort %s | python mr_ngram_count.py -r %d | gzip -c > %s)",
                    _ngram_file.getAbsolutePath(), sort_extra_param, _mincount, ngram_count_file.getAbsolutePath());
            run_process(command, py_src, true);
        }

        LOG.info(String.format("Extracting vocabulary."));
        File ngram_vocabulary = new File(_index_dir, "ngram.vocabulary.txt.gz");
        if (ngram_vocabulary.exists() && !_overwrite)
            LOG.info("File '{}' already exists.", ngram_vocabulary.getAbsolutePath());
        else {
            command = String.format(
                    "(export LC_ALL=C; cat %s | gzip -c -d | python mr_ngram_vocab.py -m | sort %s | python mr_ngram_vocab.py -r | gzip -c > %s)",
                    ngram_count_file.getAbsolutePath(), sort_extra_param, ngram_vocabulary.getAbsolutePath());
            run_process(command, py_src, true);
        }

        LOG.info(String.format("Counting N_follow = N(w,x)."));
        File words_following_ngram_count_file = new File(_index_dir, "ngram.counts.nfollow.txt.gz");
        if (words_following_ngram_count_file.exists() && !_overwrite)
            LOG.info("File '{}' already exists.", words_following_ngram_count_file.getAbsolutePath());
        else {
            command = String.format(
                    "(export LC_ALL=C; cat %s | gzip -c -d | python mr_ngram_nfollow.py -m | sort -k1,1 -t$'\t' %s | python mr_ngram_nfollow.py -r | gzip -c > %s)", // just sort is ok too
                    ngram_count_file.getAbsolutePath(), sort_extra_param,
                    words_following_ngram_count_file.getAbsolutePath());
            run_process(command, py_src, true);
        }

        LOG.info(String.format("Counting N_precede = N(x,w)."));
        File words_preceding_ngram_count_file = new File(_index_dir, "ngram.counts.nprecede.txt.gz");
        if (words_preceding_ngram_count_file.exists() && !_overwrite)
            LOG.info("File '{}' already exists.", words_preceding_ngram_count_file.getAbsolutePath());
        else {
            command = String.format(
                    "(export LC_ALL=C; cat %s | gzip -c -d | python mr_ngram_nprecede.py -m | sort -k1,1 -t$'\t' %s | python mr_ngram_nprecede.py -r | gzip -c > %s)", //-k1,1 -k2,2 -t $'\t'
                    ngram_count_file.getAbsolutePath(), sort_extra_param,
                    words_preceding_ngram_count_file.getAbsolutePath());
            run_process(command, py_src, true);
        }

        LOG.info(String.format("Counting N_follower_precede = N(x,ngram,x)."));
        File follower_preceding_count_file = new File(_index_dir, "ngram.counts.nfollowerprecede.txt.gz");
        if (follower_preceding_count_file.exists() && !_overwrite)
            LOG.info("File '{}' already exists.", follower_preceding_count_file.getAbsolutePath());
        else {
            command = String.format(
                    "(export LC_ALL=C; cat %s | gzip -c -d | python mr_ngram_nfollowerprecede.py -m | sort -k1,1 -t$'\t' %s | python mr_ngram_nfollowerprecede.py -r | gzip -c > %s)", //-k1,1 -k2,2 -t $'\t'
                    ngram_count_file.getAbsolutePath(), sort_extra_param,
                    follower_preceding_count_file.getAbsolutePath());
            run_process(command, py_src, true);
        }

        LOG.info(String.format("Joining ngram counts."));
        File ngram_joined_counts_file = new File(_index_dir, "ngram.counts.joined.txt.gz");
        if (ngram_joined_counts_file.exists() && !_overwrite)
            LOG.info("File '{}' already exists.", ngram_joined_counts_file.getAbsolutePath());
        else {
            command = String.format(
                    "(export LC_ALL=C; join -a1 -e '' -1 1 -2 1 -t $'\t' <(cat %s | gzip -c -d) <(cat %s | gzip -c -d) | join -a1 -e '' -1 1 -2 1 -t $'\t' - <(cat %s | gzip -c -d) | join -a1 -e '' -1 1 -2 1 -t $'\t' - <(cat %s | gzip -c -d) | gzip -c > %s)",
                    ngram_count_file.getAbsolutePath(), words_preceding_ngram_count_file.getAbsolutePath(),
                    words_following_ngram_count_file.getAbsolutePath(),
                    follower_preceding_count_file.getAbsolutePath(), ngram_joined_counts_file.getAbsolutePath());
            run_process(command, new File("."), true);
        }

    }

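    /**
     * Builds the Lucene "ngram" index from the joined counts file. Each input line is expected
     * to be tab separated: the ngram, its count, and up to three optional typed count lists
     * ("n_f:...", "n_p:...", "n_fp:..." with comma separated values). While indexing, the
     * N[n][k] / S[n] statistics are collected and written to the '__sum_ngrams__' file.
     */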
    public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
        File index_dir = new File(_index_dir, "ngram");
        if (index_dir.exists()) {
            LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
            if (_overwrite) {
                LOG.info("Overwriting index '{}',", index_dir);
                index_dir.delete();
            } else
                return;
        }
        index_dir.mkdirs();

        Analyzer analyzer = new KeywordAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        // size Lucene's RAM buffer as a configured percentage of the JVM's maximum memory
        double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
        double percentage_ram_buffer = Properties.ramBufferPercentage();
        if (percentage_ram_buffer > 0) {
            double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
            LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                    percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
            iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
        }

        Directory directory = new MMapDirectory(index_dir);
        IndexWriter writer_ngram = new IndexWriter(directory, iwc);

        InputStream in = new FileInputStream(ngram_joined_counts_file);
        if (ngram_joined_counts_file.getName().endsWith(".gz"))
            in = new GZIPInputStream(in);
        LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

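        // A single Document with pre-created fields is reused for every ngram; only the
        // field values are updated before each addDocument() call.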
        Document doc = new Document();
        Field f_ngram = new StringField("ngram", "", Store.YES);
        doc.add(f_ngram);
        Field f_n = new IntField("cardinality", 0, Store.YES);
        doc.add(f_n);
        Field f_word = new StringField("word", "", Store.YES);
        doc.add(f_word);
        Field f_hist = new StringField("history", "", Store.YES);
        doc.add(f_hist);
        Field f_lower = new StringField("lower", "", Store.YES);
        doc.add(f_lower);
        Field f_count = new StoredField("num", 0L);
        doc.add(f_count);

        Field[] f_follow = new Field[4];
        f_follow[0] = new StoredField("nf_s", 0L);
        doc.add(f_follow[0]);
        f_follow[1] = new StoredField("nf_N1", 0L);
        doc.add(f_follow[1]);
        f_follow[2] = new StoredField("nf_N2", 0L);
        doc.add(f_follow[2]);
        f_follow[3] = new StoredField("nf_N3", 0L);
        doc.add(f_follow[3]);
        Field[] f_precede = new Field[4];
        f_precede[0] = new StoredField("np_s", 0L);
        doc.add(f_precede[0]);
        f_precede[1] = new StoredField("np_N1", 0L);
        doc.add(f_precede[1]);
        f_precede[2] = new StoredField("np_N2", 0L);
        doc.add(f_precede[2]);
        f_precede[3] = new StoredField("np_N3", 0L);
        doc.add(f_precede[3]);
        Field[] f_followerprecede = new Field[4];
        f_followerprecede[0] = new StoredField("nfp_s", 0L);
        doc.add(f_followerprecede[0]);
        f_followerprecede[1] = new StoredField("nfp_N1", 0L);
        doc.add(f_followerprecede[1]);
        f_followerprecede[2] = new StoredField("nfp_N2", 0L);
        doc.add(f_followerprecede[2]);
        f_followerprecede[3] = new StoredField("nfp_N3", 0L);
        doc.add(f_followerprecede[3]);

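        // N[n][k] counts ngrams of order n occurring exactly k times (k = 1..4; index 5 = five or
        // more, index 0 = total number of distinct ngrams); S[n] sums the raw counts per order.
        // Both arrays grow on demand as higher-order ngrams are encountered.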
        Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
        Long[] S = new Long[] { 0L };
        long c = 0;
        while (iter.hasNext()) {
            if (++c % 100000 == 0)
                LOG.info("Adding {}'th ngram.", c);
            String line = iter.next();
            try {
                String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
                String ngram_str = splits[0];
                if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                    LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line,
                            ngram_joined_counts_file);
                    continue;
                }

                List<String> ngram = Arrays.asList(ngram_str.split(" "));
                long num = Long.parseLong(splits[1]);
                int n = ngram.size();

                f_ngram.setStringValue(ngram_str);
                f_n.setIntValue(n);
                f_word.setStringValue(ngram.get(ngram.size() - 1));
                f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
                f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
                f_count.setLongValue(num);

                for (int j = 0; j < f_follow.length; j++) {
                    f_follow[j].setLongValue(0L);
                    f_precede[j].setLongValue(0L);
                    f_followerprecede[j].setLongValue(0L);
                }

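                // The join may leave the typed count lists in any of columns 3-5 (empty columns are
                // possible), so each column is inspected and dispatched by its type prefix.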
                if (splits.length > 2 && !splits[2].isEmpty()) {
                    // precede or follow or followerprecede
                    String[] splits_ = splits[2].split(":");
                    String type = splits_[0];
                    String[] count_values = splits_[1].split(",");
                    if (count_values.length > 0) {
                        if ("n_f".equals(type))
                            f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                        else if ("n_p".equals(type))
                            f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                        else if ("n_fp".equals(type))
                            f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                    }
                    for (int i = 1; i < count_values.length; i++) {
                        if ("n_f".equals(type))
                            f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                        else if ("n_p".equals(type))
                            f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                        else if ("n_fp".equals(type))
                            f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                    }
                }
                if (splits.length > 3 && !splits[3].isEmpty()) {
                    // should be follow or followerprecede
                    String[] splits_ = splits[3].split(":");
                    String type = splits_[0];
                    String[] count_values = splits_[1].split(",");
                    if (count_values.length > 0) {
                        if ("n_f".equals(type))
                            f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                        else if ("n_p".equals(type))
                            f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                        else if ("n_fp".equals(type))
                            f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                    }
                    for (int i = 1; i < count_values.length; i++) {
                        if ("n_f".equals(type))
                            f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                        else if ("n_p".equals(type))
                            f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                        else if ("n_fp".equals(type))
                            f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                    }
                }
                if (splits.length > 4 && !splits[4].isEmpty()) {
                    // should be followerprecede
                    String[] splits_ = splits[4].split(":");
                    String type = splits_[0];
                    String[] count_values = splits_[1].split(",");
                    if (count_values.length > 0) {
                        if ("n_f".equals(type))
                            f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                        else if ("n_p".equals(type))
                            f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                        else if ("n_fp".equals(type))
                            f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                    }
                    for (int i = 1; i < count_values.length; i++) {
                        if ("n_f".equals(type))
                            f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                        else if ("n_p".equals(type))
                            f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                        else if ("n_fp".equals(type))
                            f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                    }
                }

                writer_ngram.addDocument(doc);

                while (N.length <= n) {
                    N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                    S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
                }

                if (num == 1L)
                    N[n][1]++;
                else if (num == 2L)
                    N[n][2]++;
                else if (num == 3L)
                    N[n][3]++;
                else if (num == 4L)
                    N[n][4]++;
                else
                    N[n][5]++;
                N[n][0]++;
                S[n] += num;

            } catch (Exception e) {
                LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                        ngram_joined_counts_file, c, e);
            }
        }
        iter.close();

        writer_ngram.forceMergeDeletes();
        writer_ngram.commit();
        writer_ngram.close();

        StringBuilder b = new StringBuilder(String.format(
                "#%n# Number of times where an ngram occurred: %n#  at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
                N.length - 1));
        for (int n = 1; n < N.length; n++)
            b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
        for (int n = 1; n < S.length; n++)
            b.append(String.format("s%d=%d%n", n, S[n]));
        FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());

    }

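    /**
     * Builds the Lucene "vocab" index: one document per line of the vocabulary file,
     * with the trimmed line stored in the "word" field.
     */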
    public void create_vocabulary_index(File vocabulary_file) throws IOException {
        File index_dir = new File(_index_dir, "vocab");
        if (index_dir.exists()) {
            LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
            if (_overwrite) {
                LOG.info("Overwriting index '{}',", index_dir);
                index_dir.delete();
            } else
                return;
        }
        index_dir.mkdirs();
        Analyzer analyzer = new KeywordAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setRAMBufferSizeMB(1024.0);
        Directory directory = new MMapDirectory(index_dir);
        IndexWriter writer_vocab = new IndexWriter(directory, iwc);

        InputStream in = new FileInputStream(vocabulary_file);
        if (vocabulary_file.getName().endsWith(".gz"))
            in = new GZIPInputStream(in);
        LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));
        Document doc = new Document();
        Field f_word = new StringField("word", "", Field.Store.YES);
        doc.add(f_word);
        long c = 0;
        while (iter.hasNext()) {
            if (++c % 10000 == 0)
                LOG.info("Adding {}'th word.", c);
            String line = iter.next();
            try {
                String word = line.trim();
                f_word.setStringValue(word);
                writer_vocab.addDocument(doc);
            } catch (Exception e) {
                LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
            }
        }
        iter.close();

        writer_vocab.forceMergeDeletes();
        writer_vocab.commit();
        writer_vocab.close();
    }

}