Java tutorial: tokenizing folders of text files with Stanford CoreNLP
/* Wrapper around Stanford CoreNLP to easily tokenize folders of text files.
 * Copyright (C) 2015 Hugo "m09" Mougard
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
package eu.crydee.stanfordcorenlp;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.annotation.Arg;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;

/**
 * Wrapper around the Stanford CoreNLP suite to tokenize documents.
 *
 * @author Hugo "m09" Mougard
 */
public class Tokenizer {

    /**
     * StanfordCoreNLP pipeline initialized with tokenize and ssplit as
     * annotators.
     */
    protected StanfordCoreNLP pipelineWithSS;

    /**
     * StanfordCoreNLP pipeline initialized with tokenize as only annotator.
     */
    protected StanfordCoreNLP pipelineWithoutSS;

    /**
     * Constructor.
     */
    public Tokenizer() {
        Properties props1 = new Properties();
        props1.setProperty("annotators", "tokenize, ssplit");
        pipelineWithSS = new StanfordCoreNLP(props1);
        Properties props2 = new Properties();
        props2.setProperty("annotators", "tokenize");
        pipelineWithoutSS = new StanfordCoreNLP(props2);
    }

    /**
     * Tokenize and sentence split.
     *
     * @param input the String to tokenize and sentence split.
     * @return a String of the tokenized text, one sentence per line, with
     * space separated words.
     */
    public String tokenizeAndSentenceSplit(String input) {
        Annotation annotation = new Annotation(input);
        pipelineWithSS.annotate(annotation);
        return annotation.get(SentencesAnnotation.class).stream()
                .map(s -> s.get(TokensAnnotation.class).stream()
                        .map(t -> t.get(TextAnnotation.class))
                        .collect(Collectors.joining(" ")))
                .collect(Collectors.joining("\n"));
    }

    /**
     * Tokenize only.
     *
     * @param input the String to tokenize.
     * @return a String of the tokenized text with space separated words.
     */
    public String tokenize(String input) {
        Annotation annotation = new Annotation(input);
        pipelineWithoutSS.annotate(annotation);
        return annotation.get(TokensAnnotation.class).stream()
                .map(ann -> ann.get(TextAnnotation.class))
                .collect(Collectors.joining(" "));
    }

    private static class Params {

        @Arg(dest = "input_dir")
        public String inDirPath;

        @Arg(dest = "output_dir")
        public String outDirPath;
    }

    /**
     * Wrapper around Stanford CoreNLP to tokenize text.
     *
     * Give it an input dir of text files with --input-dir and it'll output
     * tokenized versions, one sentence per line with space separated words,
     * to --output-dir (defaults to out/).
     *
     * @param args CLI args. Example: --input-dir my-input --output-dir
     * my-output.
     */
    public static void main(String[] args) {
        ArgumentParser parser = ArgumentParsers
                .newArgumentParser("stanford-corenlp-tokenizer-wrapper")
                .description("Tokenizes folders of text files with Stanford "
                        + "CoreNLP.");
        parser.addArgument("-i", "--input-dir")
                .required(true)
                .help("Path of the input text files directory.");
        parser.addArgument("-o", "--output-dir")
                .help("Path of the output text files directory.")
                .setDefault("out");
        Params params = new Params();
        try {
            parser.parseArgs(args, params);
        } catch (ArgumentParserException ex) {
            System.err.println("Could not parse arguments: "
                    + ex.getMessage());
            System.exit(1);
        }
        Tokenizer tokenizer = new Tokenizer();
        try (Stream<Path> files = Files.list(Paths.get(params.inDirPath))) {
            files.filter(Files::isRegularFile)
                    .map(Path::toFile)
                    .map(f -> {
                        try {
                            return Pair.of(
                                    f.getName(),
                                    FileUtils.readFileToString(
                                            f, StandardCharsets.UTF_8));
                        } catch (IOException ex) {
                            System.err.println("Could not read input text "
                                    + "file: " + ex.getLocalizedMessage());
                            throw new UncheckedIOException(ex);
                        }
                    })
                    .forEach(p -> {
                        String text = tokenizer.tokenizeAndSentenceSplit(
                                p.getRight());
                        try {
                            FileUtils.writeStringToFile(
                                    Paths.get(params.outDirPath, p.getLeft())
                                            .toFile(),
                                    text,
                                    StandardCharsets.UTF_8);
                        } catch (IOException ex) {
                            System.err.println("Could not write output text "
                                    + "file: " + ex.getLocalizedMessage());
                        }
                    });
        } catch (UncheckedIOException ex) {
            // The failing file was already reported inside the stream above.
            System.exit(1);
        } catch (IOException ex) {
            System.err.println("Could not read from input directory: "
                    + ex.getLocalizedMessage());
        }
    }
}
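Used as a library rather than from the command line, the two public methods can be called directly. Below is a minimal sketch of that usage; the TokenizerDemo class name and the sample sentences are made up for illustration, and the exact token output can vary with the CoreNLP version:

package eu.crydee.stanfordcorenlp;

public class TokenizerDemo {

    public static void main(String[] args) {
        Tokenizer tokenizer = new Tokenizer();

        // Tokenize and sentence split: one sentence per line,
        // tokens separated by single spaces.
        System.out.println(tokenizer.tokenizeAndSentenceSplit(
                "Mr. Smith went to Washington. He liked it."));
        // Prints something like:
        // Mr. Smith went to Washington .
        // He liked it .

        // Tokenize only: all tokens on a single line, no sentence splitting.
        System.out.println(tokenizer.tokenize("Hello, world!"));
        // Prints something like: Hello , world !
    }
}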
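From the command line, the class is run through its main method. A typical invocation might look like the line below; the jar name is hypothetical, and the classpath must also include CoreNLP, argparse4j, commons-io and commons-lang3:

java -cp stanford-corenlp-tokenizer-wrapper.jar eu.crydee.stanfordcorenlp.Tokenizer --input-dir my-input --output-dir my-output

Each regular file in my-input is then written under the same name to my-output, with one tokenized sentence per line; when --output-dir is omitted, results go to out/.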