Java tutorial: tokenizing folders of text files with Stanford CoreNLP
/* Wrapper around Stanford CoreNLP to easily tokenize folders of text files.
 * Copyright (C) 2015 Hugo "m09" Mougard
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
package eu.crydee.stanfordcorenlp;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.annotation.Arg;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;

/**
 * Wrapper around the Stanford CoreNLP suite to tokenize documents.
 *
 * @author Hugo "m09" Mougard
 */
public class Tokenizer {

    /**
     * StanfordCoreNLP pipeline initialized with tokenize and ssplit as
     * annotators.
     */
    protected StanfordCoreNLP pipelineWithSS;

    /**
     * StanfordCoreNLP pipeline initialized with tokenize as only annotator.
     */
    protected StanfordCoreNLP pipelineWithoutSS;

    /**
     * Constructor.
     */
    public Tokenizer() {
        Properties props1 = new Properties();
        props1.setProperty("annotators", "tokenize, ssplit");
        pipelineWithSS = new StanfordCoreNLP(props1);
        Properties props2 = new Properties();
        props2.setProperty("annotators", "tokenize");
        pipelineWithoutSS = new StanfordCoreNLP(props2);
    }

    /**
     * Tokenize and sentence split.
     *
     * @param input the String to tokenize and sentence split.
     * @return a String of the tokenized text, one sentence per line, with
     * space separated words.
     */
    public String tokenizeAndSentenceSplit(String input) {
        Annotation annotation = new Annotation(input);
        pipelineWithSS.annotate(annotation);
        return annotation.get(SentencesAnnotation.class).stream()
                .map(s -> s.get(TokensAnnotation.class).stream()
                        .map(t -> t.get(TextAnnotation.class))
                        .collect(Collectors.joining(" ")))
                .collect(Collectors.joining("\n"));
    }

    /**
     * Tokenize only.
     *
     * @param input the String to tokenize.
     * @return a String of the tokenized text with space separated words.
     */
    public String tokenize(String input) {
        Annotation annotation = new Annotation(input);
        pipelineWithoutSS.annotate(annotation);
        return annotation.get(TokensAnnotation.class).stream()
                .map(ann -> ann.get(TextAnnotation.class))
                .collect(Collectors.joining(" "));
    }

    private static class Params {

        @Arg(dest = "input_dir")
        public String inDirPath;

        @Arg(dest = "output_dir")
        public String outDirPath;
    }

    /**
     * Wrapper around Stanford CoreNLP to tokenize text.
     *
     * Give it an input dir of text files with --input-dir and it'll output
     * tokenized versions, one sentence per line with space separated words,
     * to --output-dir (defaults to out/).
     *
     * @param args CLI args. Example: --input-dir my-input --output-dir
     * my-output.
     */
    public static void main(String[] args) {
        ArgumentParser parser = ArgumentParsers
                .newArgumentParser("stanford-corenlp-tokenizer-wrapper")
                .description("Tokenizes folders of text files with Stanford "
                        + "CoreNLP.");
        parser.addArgument("-i", "--input-dir")
                .required(true)
                .help("Path of the input text files directory.");
        parser.addArgument("-o", "--output-dir")
                .help("Path of the output text files directory.")
                .setDefault("out");
        Params params = new Params();
        try {
            parser.parseArgs(args, params);
        } catch (ArgumentParserException ex) {
            System.err.println("Could not parse arguments: "
                    + ex.getMessage());
            System.exit(1);
        }
        Tokenizer tokenizer = new Tokenizer();
        try (Stream<Path> files = Files.list(Paths.get(params.inDirPath))) {
            files.filter(Files::isRegularFile)
                    .map(Path::toFile)
                    .map(f -> {
                        try {
                            return Pair.of(
                                    f.getName(),
                                    FileUtils.readFileToString(
                                            f, StandardCharsets.UTF_8));
                        } catch (IOException ex) {
                            System.err.println("Could not read input text "
                                    + "file: " + ex.getLocalizedMessage());
                            throw new UncheckedIOException(ex);
                        }
                    })
                    .forEach(p -> {
                        String text = tokenizer.tokenizeAndSentenceSplit(
                                p.getRight());
                        try {
                            FileUtils.writeStringToFile(
                                    Paths.get(params.outDirPath, p.getLeft())
                                            .toFile(),
                                    text,
                                    StandardCharsets.UTF_8);
                        } catch (IOException ex) {
                            System.err.println("Could not write output text "
                                    + "file: " + ex.getLocalizedMessage());
                        }
                    });
        } catch (UncheckedIOException ex) {
            // The failing file was already reported inside the stream above.
            System.exit(1);
        } catch (IOException ex) {
            System.err.println("Could not read from input directory: "
                    + ex.getLocalizedMessage());
        }
    }
}
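Used as a library rather than from the command line, the two public methods can be called directly. Below is a minimal sketch of that usage; the TokenizerDemo class name and the sample sentences are made up for illustration, and the exact token output can vary with the CoreNLP version:

package eu.crydee.stanfordcorenlp;

public class TokenizerDemo {

    public static void main(String[] args) {
        Tokenizer tokenizer = new Tokenizer();

        // Tokenize and sentence split: one sentence per line,
        // tokens separated by single spaces.
        System.out.println(tokenizer.tokenizeAndSentenceSplit(
                "Mr. Smith went to Washington. He liked it."));
        // Prints something like:
        // Mr. Smith went to Washington .
        // He liked it .

        // Tokenize only: all tokens on a single line, no sentence splitting.
        System.out.println(tokenizer.tokenize("Hello, world!"));
        // Prints something like: Hello , world !
    }
}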
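From the command line, the class is run through its main method. A typical invocation might look like the line below; the jar name is hypothetical, and the classpath must also include CoreNLP, argparse4j, commons-io and commons-lang3:

java -cp stanford-corenlp-tokenizer-wrapper.jar eu.crydee.stanfordcorenlp.Tokenizer --input-dir my-input --output-dir my-output

Each regular file in my-input is then written under the same name to my-output, with one tokenized sentence per line; when --output-dir is omitted, results go to out/.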