Java tutorial
/*
 *   Copyright 2014
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.lm.app;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.lt.lm.AbstractLanguageModel;
import de.tudarmstadt.lt.lm.service.AbstractStringProvider;
import de.tudarmstadt.lt.lm.service.LtSegProvider;

/**
 * Command line tool that reads the '.txt' files of a directory, splits every line into
 * sentences and tokens using an {@link AbstractStringProvider}, and writes the resulting
 * ngrams (one per line, tokens separated by a single space) to a gzip-compressed file
 * inside the same directory.
 *
 * @author Steffen Remus
 */
public class GenerateNgrams {

    private static final Logger LOG = LoggerFactory.getLogger(GenerateNgrams.class);

    /**
     * Prints an optional message and the option help to stdout, then terminates the JVM
     * with exit status 1. This method never returns normally.
     *
     * @param opts    the options to describe; the help text is skipped if {@code null}
     * @param message a message printed before the help text; skipped if {@code null}
     */
    static void print_usage(Options opts, String message) {
        if (message != null)
            System.out.println(message);
        if (opts != null)
            new HelpFormatter().printHelp(new PrintWriter(System.out, true), 80,
                    "... \n" + GenerateNgrams.class.getName() + " <options>", "Options", opts, 2, 2, "======");
        System.exit(1);
    }

    /**
     * Builds the command line options understood by {@link #main(String[])}.
     *
     * @return the option set (help, ptype, cardinality, dir, overwrite)
     */
    @SuppressWarnings("static-access")
    private static Options buildOptions() {
        Options opts = new Options();
        opts.addOption(OptionBuilder.withLongOpt("help").withDescription("Display help message.").create("?"));
        opts.addOption(OptionBuilder.withLongOpt("ptype").withArgName("class").hasArg().withDescription(
                "specify the instance of the language model provider that you want to use: {LtSegProvider, BreakIteratorStringProvider, UimaStringProvider, PreTokenizedStringProvider} (default: LtSegProvider)")
                .create("p"));
        opts.addOption(OptionBuilder.withLongOpt("cardinality").withArgName("ngram-order").hasArg().withDescription(
                "Specify the cardinality of the ngrams (min. 1). Specify a range using 'from-to'. (Examples: 5 = extract 5grams; 1-5 = extract 1grams, 2grams, ..., 5grams; default: 1-5).")
                .create("n"));
        opts.addOption(OptionBuilder.withLongOpt("dir").withArgName("directory").isRequired().hasArg()
                .withDescription(
                        "specify the directory that contains '.txt' files that are used as source for generating ngrams.")
                .create("d"));
        opts.addOption(OptionBuilder.withLongOpt("overwrite").withDescription("Overwrite existing ngram file.")
                .create("w"));
        return opts;
    }

    /**
     * Parses a cardinality specification of the form {@code "n"}, {@code "-to"} or
     * {@code "from-to"}.
     * <ul>
     * <li>{@code "5"} yields the range 5..5,</li>
     * <li>{@code "-5"} yields the range 1..5,</li>
     * <li>{@code "2-5"} yields the range 2..5; a lower bound below 1 is raised to 1.</li>
     * </ul>
     *
     * @param spec the textual specification
     * @return a two element array {@code {from, to}}
     * @throws IllegalArgumentException if a bound is not an integer, or the upper bound is
     *                                  smaller than 1 or smaller than the lower bound
     */
    private static int[] parseCardinalityRange(String spec) {
        int dash_index = spec.indexOf('-');
        int from;
        int to;
        try {
            // without a dash, dash_index is -1 and the whole string is the upper bound
            to = Integer.parseInt(spec.substring(dash_index + 1).trim());
            if (dash_index > 0)
                from = Math.max(1, Integer.parseInt(spec.substring(0, dash_index).trim()));
            else if (dash_index == 0)
                from = 1;
            else
                from = to;
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    String.format("Invalid cardinality '%s': expected 'n' or 'from-to'.", spec), e);
        }
        if (to < 1 || to < from)
            throw new IllegalArgumentException(String.format(
                    "Invalid cardinality '%s': upper bound must be at least 1 and not smaller than the lower bound.",
                    spec));
        return new int[] { from, to };
    }

    /**
     * Command line entry point: parses the options, instantiates the string provider and
     * runs {@link #generateNgrams(File, AbstractStringProvider, int, int, boolean)}.
     * Any invalid option leads to the usage message being printed and the JVM exiting
     * (see {@link #print_usage(Options, String)}).
     *
     * @param args the raw command line arguments
     */
    public static void main(String[] args) {
        Options opts = buildOptions();

        CommandLine cli = null;
        try {
            cli = new GnuParser().parse(opts, args);
        } catch (Exception e) {
            print_usage(opts, e.getMessage()); // exits the JVM
        }
        if (cli.hasOption("?"))
            print_usage(opts, null);

        String ptype = cli.getOptionValue("ptype", LtSegProvider.class.getSimpleName());
        AbstractStringProvider prvdr = null;
        try {
            prvdr = StartLM.getStringProviderInstance(ptype);
        } catch (Exception e) {
            print_usage(opts, String.format("Could not instantiate LmProvider '%s': %s", ptype, e.getMessage()));
        }

        int[] range = null;
        try {
            range = parseCardinalityRange(cli.getOptionValue("cardinality", "1-5"));
        } catch (IllegalArgumentException e) {
            // report malformed input as a usage error instead of crashing with a stack trace
            print_usage(opts, e.getMessage());
        }

        final File src_dir = new File(cli.getOptionValue("dir"));
        // 'overwrite' is a flag without an argument, so its presence (not its value) is what counts
        boolean overwrite = cli.hasOption("overwrite");

        generateNgrams(src_dir, prvdr, range[0], range[1], overwrite);
    }

    /**
     * Generates ngrams of the cardinalities {@code from_cardinality..to_cardinality}
     * (inclusive) from all '.txt' files directly inside {@code src_dir} and writes them to
     * the gzip-compressed, UTF-8 encoded file {@code <src_dir>/<src_dir-name>.ngrams.txt.gz}.
     * <p>
     * If the output file already exists it is returned untouched unless {@code overwrite}
     * is set. Source files that cannot be read, and sentences whose ngrams cannot be
     * computed, are logged and skipped.
     *
     * @param src_dir          the directory containing the '.txt' source files
     * @param prvdr            the provider used for sentence splitting and tokenization
     * @param from_cardinality the smallest ngram cardinality to generate
     * @param to_cardinality   the largest ngram cardinality to generate
     * @param overwrite        whether an existing output file is replaced
     * @return the ngram file, or {@code null} if the source directory could not be listed
     *         or the output file could not be opened for writing
     */
    public static File generateNgrams(File src_dir, AbstractStringProvider prvdr, int from_cardinality,
            int to_cardinality, boolean overwrite) {
        final File ngram_file = new File(src_dir, String.format("%s.%s", src_dir.getName(), "ngrams.txt.gz"));

        if (ngram_file.exists()) {
            LOG.info("Output file already exists: '{}'.", ngram_file.getAbsolutePath());
            if (!overwrite)
                return ngram_file;
            if (!ngram_file.delete())
                LOG.warn("Could not delete existing file: '{}'.", ngram_file.getAbsolutePath());
            LOG.info("Overwriting file: '{}'.", ngram_file.getAbsolutePath());
        }

        File[] src_files = src_dir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File f) {
                return f.isFile() && f.getName().endsWith(".txt") && (!f.equals(ngram_file));
            }
        });
        if (src_files == null) {
            // listFiles yields null if src_dir is not a directory or cannot be read
            LOG.error("Could not list files in directory: '{}'.", src_dir.getAbsolutePath());
            return null;
        }

        String[] basenames = new String[src_files.length];
        for (int i = 0; i < basenames.length; i++)
            basenames[i] = src_files[i].getName();

        LOG.info(String.format("Reading txt files from dir: '%s'; Files: %s.", src_dir.getAbsolutePath(),
                StringUtils.abbreviate(Arrays.toString(basenames), 200)));
        LOG.info(String.format("Writing ngrams to file: '%s'.", ngram_file.getAbsolutePath()));

        PrintWriter pw = null;
        try {
            pw = new PrintWriter(
                    new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(ngram_file)), "UTF-8"));
        } catch (IOException e) {
            LOG.error("Could not open writer for file: '{}'.", ngram_file.getAbsolutePath(), e);
            return null;
        }

        long num_ngrams = 0L;
        try {
            for (int i = 0; i < src_files.length; i++) {
                File src_file = src_files[i];
                LOG.info("Processing file {} / {} ('{}')", i + 1, src_files.length, src_file.getAbsolutePath());
                long num_ngrams_f = 0L;
                try {
                    LineIterator liter = new LineIterator(
                            new BufferedReader(new InputStreamReader(new FileInputStream(src_file), "UTF-8")));
                    try {
                        int lc = 0;
                        while (liter.hasNext()) {
                            if (++lc % 1000 == 0)
                                LOG.debug("Processing line {} ({})", lc, src_file);
                            String line = liter.next();
                            for (String sentence : prvdr.splitSentences(line)) {
                                // NOTE(review): the sentence is re-tokenized for every cardinality;
                                // this could be hoisted if tokenizeSentence is side-effect free - confirm.
                                for (int n = from_cardinality; n <= to_cardinality; n++) {
                                    List<String>[] ngrams = null;
                                    try {
                                        List<String> tokens = prvdr.tokenizeSentence(sentence);
                                        if (tokens.isEmpty())
                                            continue;
                                        ngrams = AbstractLanguageModel.getNgramSequence(tokens, n);
                                    } catch (Exception e) {
                                        LOG.warn(
                                                "Could not get ngram of cardinality {} from String '{}' in line '{}' from file '{}'.",
                                                n, StringUtils.abbreviate(line, 100), lc, src_file.getAbsolutePath(),
                                                e);
                                        continue;
                                    }
                                    for (List<String> ngram : ngrams)
                                        pw.println(StringUtils.join(ngram, " "));
                                    num_ngrams_f += ngrams.length;
                                }
                            }
                        }
                    } finally {
                        // release the underlying file handle even if processing failed mid-file
                        liter.close();
                    }
                } catch (Exception e) {
                    LOG.warn("Could not read file '{}'.", src_file.getAbsolutePath(), e);
                }
                // flush once per source file instead of once per generated ngram batch
                pw.flush();
                LOG.debug("Generated {} ngrams from file {}.", num_ngrams_f, src_file);
                num_ngrams += num_ngrams_f;
            }
        } finally {
            pw.close();
        }

        // PrintWriter never throws on write failures; it only records them
        if (pw.checkError())
            LOG.error("Errors occurred while writing to file: '{}'.", ngram_file.getAbsolutePath());

        LOG.info("Generated {} ngrams.", num_ngrams);
        return ngram_file;
    }

}