package edu.stanford.nlp.process;

// Stanford English Tokenizer -- a deterministic, fast high-quality tokenizer
// Copyright (c) 2002-2019 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see http://www.gnu.org/licenses/ .
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 2A
//    Stanford CA 94305-9020
//    USA
//    java-nlp-support@lists.stanford.edu
//    http://nlp.stanford.edu/software/

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * A fast, rule-based tokenizer implementation, which produces Penn Treebank
 * style tokenization of English text. It was initially written to conform
 * to Penn Treebank tokenization conventions over ASCII text, but now provides
 * a range of tokenization options over a broader space of Unicode text.
 * It reads raw text and outputs tokens of classes that implement
 * edu.stanford.nlp.ling.HasWord (typically a Word or a CoreLabel).
 * It can optionally return end-of-line as a token.
 * <p>
 * New code is encouraged to use the {@link #PTBTokenizer(Reader,LexedTokenFactory,String)}
 * constructor. The other constructors are historical.
 * You specify the type of result tokens with a LexedTokenFactory, and can
 * specify the treatment of tokens by mainly boolean options given in a
 * comma-separated options String (e.g., "invertible,normalizeParentheses=true").
 * If the String is {@code null} or empty, you get the traditional
 * PTB3 normalization behaviour (i.e., you get ptb3Escaping=true). If you
 * want no normalization, then you should pass in the String
 * "ptb3Escaping=false". The known option names are:
 * <ol>
 * <li>invertible: Store enough information about the original form of the
 *     token and the whitespace around it that a list of tokens can be
 *     faithfully converted back to the original String. Valid only if the
 *     LexedTokenFactory is an instance of CoreLabelTokenFactory. The
 *     keys used in it are: TextAnnotation for the tokenized form,
 *     OriginalTextAnnotation for the original string, BeforeAnnotation and
 *     AfterAnnotation for the whitespace before and after a token, and
 *     perhaps CharacterOffsetBeginAnnotation and CharacterOffsetEndAnnotation to record
 *     token begin/after end character offsets, if they were specified to be recorded
 *     in TokenFactory construction. (Like the String class, begin and end
 *     are done so end - begin gives the token length.)
 *     Default is false.
 * <li>tokenizeNLs: Whether end-of-lines should become tokens (or just
 *     be treated as part of whitespace). Default is false.
 * <li>tokenizePerLine: Run the tokenizer separately on each line of a file.
 *     This has the following consequences: (i) A token (currently only SGML tokens)
 *     cannot span multiple lines of the original input, and (ii) The tokenizer will not
 *     examine/wait for input from the next line before making tokenization decisions on
 *     this line. The latter property affects whether periods after acronyms are treated
 *     as end-of-sentence markers. Having this true is necessary to stop the tokenizer
 *     blocking and waiting for input after a newline is seen when the previous line
 *     ends with an abbreviation.</li>
 * <li>ptb3Escaping: Enable all traditional PTB3 token transforms
 *     (like parentheses becoming -LRB-, -RRB-). This is a macro flag that
 *     sets or clears all the options below. Note that because properties are set in a Map,
 *     if you specify both this flag and flags it sets, the resulting behaviour is
 *     non-deterministic (sorry!). (The default setting of the various properties below
 *     that this flag controls is equivalent to it being set to true.)
 * <li>ud: [From CoreNLP 4.0] Enable options that make tokenization like what is used in UD v2.
 *     This is a macro flag that sets various of the options below. It ignores a value for this key.
 *     Note that because properties are set in a Map, if you specify both this flag and flags it sets,
 *     the resulting behaviour is non-deterministic (sorry!).</li>
 * <li>americanize: Whether to rewrite common British English spellings
 *     as American English spellings. (This is useful if your training
 *     material uses American English spelling, such as the Penn Treebank.)
 *     Default is true.
 * <li>normalizeSpace: Whether any spaces in tokens (phone numbers, fractions)
 *     get turned into U+00A0 (non-breaking space). It's dangerous to turn
 *     this off for most of our Stanford NLP software, which assumes no
 *     spaces in tokens. Default is true.
 * <li>normalizeAmpersandEntity: Whether to map the XML entity {@code &amp;} to an
 *     ampersand. Default is true.
 * <li>normalizeFractions: Whether to map certain common composed
 *     fraction characters to spelled out letter forms like "1/2".
 *     Default is true.
 * <li>normalizeParentheses: Whether to map round parentheses to -LRB-,
 *     -RRB-, as in the Penn Treebank. Default is true.
 * <li>normalizeOtherBrackets: Whether to map other common bracket characters
 *     to -LCB-, -LRB-, -RCB-, -RRB-, roughly as in the Penn Treebank.
 *     Default is true.
 * <li>quotes: [From CoreNLP 4.0] Select a style of mapping quotes. An enum with possible values
 *     (case insensitive): latex, unicode, ascii, not_cp1252, original.
 *     "ascii" maps all quote characters to the traditional ' and ".
 *     "latex" maps quotes to ``, `, ', '', as in LaTeX and the PTB3 WSJ (though this is now
 *     heavily frowned on in Unicode). "unicode" maps quotes to the range U+2018 to U+201D,
 *     the preferred Unicode encoding of single and double quotes.
 *     "original" leaves all quotes as they were. "not_cp1252" only remaps invalid cp1252
 *     quotes to Unicode. The default is "not_cp1252".</li>
 * <li>ellipses: [From CoreNLP 4.0] Select a style for mapping ellipses (3 dots). An enum with
 *     possible values (case insensitive): unicode, ptb3, not_cp1252, original.
 *     "ptb3" maps ellipses to three dots (...), the old PTB3 WSJ coding of an ellipsis.
 *     "unicode" maps three dot and space three dot sequences to U+2026, the Unicode
 *     ellipsis character. "not_cp1252" only remaps invalid cp1252 ellipses to Unicode.
* "original" leaves all ellipses as they were. The default is "not_cp1252". </li> * <li>dashes: [From CoreNLP 4.0] Select a style for mapping dashes. An enum with possible values * (case insensitive): unicode, ptb3, not_cp1252, original. "ptb3" maps dashes to "--", the * most prevalent old PTB3 WSJ coding of a dash (though some are just "-" HYPHEN-MINUS). * "unicode" maps "-", "--", and "---" HYPHEN-MINUS sequences and CP1252 dashes to Unicode en and em dashes. * "not_cp1252" only remaps invalid cp1252 dashes to unicode. * "original" leaves all dashes as they were. The default is "not_cp1252". </li> * <li>splitAssimilations: true to tokenize "gonna", false to tokenize "gon na". Default is true. </li> * <li>escapeForwardSlashAsterisk: Whether to put a backslash escape in front * of / and * as the old PTB3 WSJ does for some reason (something to do * with Lisp readers??). Default is false. This flag is no longer set * by ptb3Escaping. * <li>normalizeCurrency: Whether to do some awful lossy currency mappings * to turn common currency characters into $, #, or "cents", reflecting * the fact that nothing else appears in the old PTB3 WSJ. (No Euro!) * Default is false. (Note: The default was true through CoreNLP v3.8.0, but we're * gradually inching our way towards the modern world!) This flag is no longer set * by ptb3Escaping. * <li>untokenizable: What to do with untokenizable characters (ones not * known to the tokenizer). Six options combining whether to log a * warning for none, the first, or all, and whether to delete them or * to include them as single character tokens in the output: noneDelete, * firstDelete, allDelete, noneKeep, firstKeep, allKeep. * The default is "firstDelete". * <li>strictTreebank3: PTBTokenizer deliberately deviates from strict PTB3 * WSJ tokenization in two cases. Setting this improves compatibility * for those cases. They are: (i) When an acronym is followed by a * sentence end, such as "U.K." at the end of a sentence, the PTB3 * has tokens of "Corp" and ".", while by default PTBTokenizer duplicates * the period returning tokens of "Corp." and ".", and (ii) PTBTokenizer * will return numbers with a whole number and a fractional part like * "5 7/8" as a single token, with a non-breaking space in the middle, * while the PTB3 separates them into two tokens "5" and "7/8". * (Exception: for only "U.S." the treebank does have the two tokens * "U.S." and "." like our default; strictTreebank3 now does that too.) * The default is false. * <li>splitHyphenated: whether or not to tokenize segments of hyphenated words * separately ("school" "-" "aged", "frog" "-" "lipped"), keeping the exceptions * in Supplementary Guidelines for ETTB 2.0 by Justin Mott, Colin Warner, Ann Bies, * Ann Taylor and CLEAR guidelines (Bracketing Biomedical Text) by Colin Warner et al. (2012). * Default is false, which maintains old treebank tokenizer behavior. * <li>splitForwardSlash: [From CoreNLP 4.0] Whether to tokenize segments of slashed tokens separately * ("Asian" "/" "Indian", "and" "/" "or"). Default is false. </li> * </ol> * <p> * A single instance of a PTBTokenizer is not thread safe, as it uses * a non-threadsafe JFlex object to do the processing. Multiple * instances can be created safely, though. A single instance of a * PTBTokenizerFactory is also not thread safe, as it keeps its * options in a local variable. 
 *
 * @author Tim Grow (his tokenizer is a Java implementation of Professor
 *         Chris Manning's Flex tokenizer, pgtt-treebank.l)
 * @author Teg Grenager (grenager@stanford.edu)
 * @author Jenny Finkel (integrating in invertible PTB tokenizer)
 * @author Christopher Manning (redid API, added many options, maintenance)
 */
public class PTBTokenizer<T extends HasWord> extends AbstractTokenizer<T> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(PTBTokenizer.class);

  // the underlying lexer
  private final PTBLexer lexer;


  /**
   * Constructs a new PTBTokenizer that returns Word tokens and which treats
   * carriage returns as normal whitespace.
   *
   * @param r The Reader whose contents will be tokenized
   * @return A PTBTokenizer that tokenizes a stream to objects of type
   *          {@link Word}
   */
  public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
    return new PTBTokenizer<>(r, new WordTokenFactory(), "invertible=false");
  }

  /**
   * Constructs a new PTBTokenizer that makes CoreLabel tokens.
   * It optionally returns carriage returns
   * as their own token. CRs come back as CoreLabels whose text is
   * the value of {@code AbstractTokenizer.NEWLINE_TOKEN}.
   *
   * @param r The Reader to read tokens from
   * @param tokenizeNLs Whether to return newlines as separate tokens
   *         (otherwise they normally disappear as whitespace)
   * @param invertible if set to true, then will produce CoreLabels which
   *         will have fields for the string before and after, and the
   *         character offsets
   * @return A PTBTokenizer which returns CoreLabel objects
   */
  public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
    return new PTBTokenizer<>(r, tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
  }

  /**
   * Constructs a new PTBTokenizer that optionally returns carriage returns
   * as their own token, and has a custom LexedTokenFactory.
   * If asked for, CRs come back as Words whose text is
   * the value of {@code PTBLexer.cr}. This constructor translates
   * between the traditional boolean options of PTBTokenizer and the new
   * options String.
   *
   * @param r The Reader to read tokens from
   * @param tokenizeNLs Whether to return newlines as separate tokens
   *         (otherwise they normally disappear as whitespace)
   * @param invertible if set to true, then will produce CoreLabels which
   *         will have fields for the string before and after, and the
   *         character offsets
   * @param suppressEscaping If true, all the traditional Penn Treebank
   *         normalizations are turned off. Otherwise, they all happen.
   * @param tokenFactory The LexedTokenFactory to use to create
   *         tokens from the text.
   */
  private PTBTokenizer(final Reader r,
                       final boolean tokenizeNLs,
                       final boolean invertible,
                       final boolean suppressEscaping,
                       final LexedTokenFactory<T> tokenFactory) {
    StringBuilder options = new StringBuilder();
    if (suppressEscaping) {
      options.append("ptb3Escaping=false");
    } else {
      options.append("ptb3Escaping=true"); // i.e., turn on all the historical PTB normalizations
    }
    if (tokenizeNLs) {
      options.append(",tokenizeNLs");
    }
    if (invertible) {
      options.append(",invertible");
    }
    lexer = new PTBLexer(r, tokenFactory, options.toString());
  }
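
  /*
   * For illustration, a sketch of the "invertible" option described in the class javadoc
   * (the sample text is only an example): the Before/OriginalText annotations on each token,
   * plus the After annotation of the last token, are enough to rebuild the input exactly.
   *
   *   PTBTokenizer<CoreLabel> toke = new PTBTokenizer<>(
   *       new StringReader("Hasn't the  dog eaten?"), new CoreLabelTokenFactory(), "invertible=true");
   *   StringBuilder rebuilt = new StringBuilder();
   *   for (CoreLabel tok : toke.tokenize()) {
   *     rebuilt.append(tok.get(CoreAnnotations.BeforeAnnotation.class));
   *     rebuilt.append(tok.get(CoreAnnotations.OriginalTextAnnotation.class));
   *   }
   */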

  /**
   * Constructs a new PTBTokenizer with a custom LexedTokenFactory.
   * Many options for tokenization and what is returned can be set via
   * the options String. See the class documentation for details on
   * the options String. This is the new recommended constructor!
   *
   * @param r The Reader to read tokens from.
   * @param tokenFactory The LexedTokenFactory to use to create
   *         tokens from the text.
   * @param options Options to the lexer. See the extensive documentation
   *         in the class javadoc. The String may be null or empty,
   *         which means that all traditional PTB normalizations are
   *         done. You can pass in "ptb3Escaping=false" and have no
   *         normalizations done (that is, the behavior of the old
   *         suppressEscaping=true option).
   */
  public PTBTokenizer(final Reader r, final LexedTokenFactory<T> tokenFactory, final String options) {
    lexer = new PTBLexer(r, tokenFactory, options);
  }

  /**
   * Internally fetches the next token.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @Override
  protected T getNext() {
    // if (lexer == null) {
    //   return null;
    // }
    try {
      return (T) lexer.next();
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    // cdm 2007: this shouldn't be necessary: PTBLexer decides for itself whether to return CRs based on the same flag!
    // get rid of CRs if necessary
    // while (!tokenizeNLs && PTBLexer.cr.equals(((HasWord) token).word())) {
    //   token = (T) lexer.next();
    // }
    // horatio: we used to catch exceptions here, which led to broken
    // behavior and made it very difficult to debug whatever the
    // problem was.
  }

  /**
   * Returns the string literal inserted for newlines when the -tokenizeNLs
   * option is set.
   *
   * @return string literal inserted for "\n".
   */
  public static String getNewlineToken() {
    return NEWLINE_TOKEN;
  }

  /**
   * Returns a presentable version of the given PTB-tokenized text.
   * PTB tokenization splits up punctuation and does various other things
   * that make simply joining the tokens with spaces look bad. So join
   * the tokens with space and run it through this method to produce nice
   * looking text. It's not perfect, but it works pretty well.
   * <p>
   * <b>Note:</b> If your tokens have maintained the OriginalTextAnnotation and
   * the BeforeAnnotation and the AfterAnnotation, then rather than doing
   * this you can actually precisely reconstruct the text they were made
   * from!
   *
   * @param ptbText A String in PTB3-escaped form
   * @return An approximation to the original String
   */
  public static String ptb2Text(String ptbText) {
    StringBuilder sb = new StringBuilder(ptbText.length()); // probably an overestimate
    PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
    try {
      for (String token; (token = lexer.next()) != null; ) {
        sb.append(token);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return sb.toString();
  }

  /**
   * Returns a presentable version of a given PTB token. For instance,
   * it transforms -LRB- into (.
   */
  public static String ptbToken2Text(String ptbText) {
    return ptb2Text(' ' + ptbText + ' ').trim();
  }
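
  /*
   * For illustration (the input string is only an example):
   *
   *   PTBTokenizer.ptb2Text("-LRB- Kroger -RRB- 's stock rose .")
   *
   * should return something close to "(Kroger)'s stock rose." -- PTB escapes such as
   * -LRB-/-RRB- are turned back into brackets, and punctuation is reattached to the
   * preceding word.
   */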

  /**
   * Writes a presentable version of the given PTB-tokenized text.
   * PTB tokenization splits up punctuation and does various other things
   * that make simply joining the tokens with spaces look bad. So join
   * the tokens with space and run it through this method to produce nice
   * looking text. It's not perfect, but it works pretty well.
   */
  public static int ptb2Text(Reader ptbText, Writer w) throws IOException {
    int numTokens = 0;
    PTB2TextLexer lexer = new PTB2TextLexer(ptbText);
    for (String token; (token = lexer.next()) != null; ) {
      numTokens++;
      w.write(token);
    }
    return numTokens;
  }

  private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
    final long start = System.nanoTime();
    int numTokens = 0;
    int sz = inputFileList.size();
    if (sz == 0) {
      Reader r = new InputStreamReader(System.in, charset);
      BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
      numTokens = ptb2Text(r, writer);
      writer.close();
    } else {
      for (int j = 0; j < sz; j++) {
        try (Reader r = IOUtils.readerFromString(inputFileList.get(j), charset)) {
          BufferedWriter writer;
          if (outputFileList == null) {
            writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
          } else {
            writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset));
          }
          try {
            numTokens += ptb2Text(r, writer);
          } finally {
            writer.close();
          }
        }
      }
    }
    final long duration = System.nanoTime() - start;
    final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
    System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
  }

  /**
   * Returns a presentable version of the given PTB-tokenized words.
   * Pass in a List of Strings and this method will
   * join the words with spaces and call {@link #ptb2Text(String)} on the
   * output.
   *
   * @param ptbWords A list of String
   * @return A presentable version of the given PTB-tokenized words
   */
  public static String ptb2Text(List<String> ptbWords) {
    return ptb2Text(StringUtils.join(ptbWords));
  }

  /**
   * Returns a presentable version of the given PTB-tokenized words.
   * Pass in a List of Words or a Document and this method will
   * take the word() values (to prevent additional text from creeping in, e.g., POS tags),
   * and call {@link #ptb2Text(String)} on the output.
   *
   * @param ptbWords A list of HasWord objects
   * @return A presentable version of the given PTB-tokenized words
   */
  public static String labelList2Text(List<? extends HasWord> ptbWords) {
    List<String> words = new ArrayList<>();
    for (HasWord hw : ptbWords) {
      words.add(hw.word());
    }
    return ptb2Text(words);
  }

  private static void tok(List<String> inputFileList, List<String> outputFileList, String charset,
                          Pattern parseInsidePattern, Pattern filterPattern, String options,
                          boolean preserveLines, boolean oneLinePerElement, boolean dump,
                          boolean lowerCase) throws IOException {
    final long start = System.nanoTime();
    long numTokens = 0;
    int numFiles = inputFileList.size();
    if (numFiles == 0) {
      Reader stdin = IOUtils.readerFromStdin(charset);
      BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
      numTokens += tokReader(stdin, writer, parseInsidePattern, filterPattern, options,
          preserveLines, oneLinePerElement, dump, lowerCase);
      IOUtils.closeIgnoringExceptions(writer);
    } else {
      BufferedWriter out = null;
      if (outputFileList == null) {
        out = new BufferedWriter(new OutputStreamWriter(System.out, charset));
      }
      for (int j = 0; j < numFiles; j++) {
        try (Reader r = IOUtils.readerFromString(inputFileList.get(j), charset)) {
          if (outputFileList != null) {
            out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset));
          }
          numTokens += tokReader(r, out, parseInsidePattern, filterPattern, options,
              preserveLines, oneLinePerElement, dump, lowerCase);
        }
        if (outputFileList != null) {
          IOUtils.closeIgnoringExceptions(out);
        }
      } // end for j going through inputFileList
      if (outputFileList == null) {
        IOUtils.closeIgnoringExceptions(out);
      }
    }
    final long duration = System.nanoTime() - start;
    final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
    System.err.printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
  }

  private static int tokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern,
                               Pattern filterPattern, String options, boolean preserveLines,
                               boolean oneLinePerElement, boolean dump, boolean lowerCase) throws IOException {
    int numTokens = 0;
    boolean beginLine = true;
    boolean printing = (parseInsidePattern == null); // start off printing, unless you're looking for a start entity
    Matcher m = null;
    if (parseInsidePattern != null) {
      m = parseInsidePattern.matcher(""); // create once as performance hack
      // System.err.printf("parseInsidePattern is: |%s|%n", parseInsidePattern);
    }
    for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) {
      CoreLabel obj = tokenizer.next();
      // String origStr = obj.get(CoreAnnotations.TextAnnotation.class).replaceFirst("\n+$", ""); // DanC added this to fix a lexer bug, hopefully now corrected
      String origStr = obj.get(CoreAnnotations.TextAnnotation.class);
      String str;
      if (lowerCase) {
        str = origStr.toLowerCase(Locale.ENGLISH);
        obj.set(CoreAnnotations.TextAnnotation.class, str);
      } else {
        str = origStr;
      }
      if (m != null && m.reset(origStr).matches()) {
        printing = m.group(1).isEmpty(); // turn on printing if no end element slash, turn it off if there is
        // System.err.printf("parseInsidePattern matched against: |%s|, printing is %b.%n", origStr, printing);
        if (!printing) { // true only if matched a stop
          beginLine = true;
          if (oneLinePerElement) {
            writer.newLine();
          }
        }
      } else if (printing) {
        if (dump) {
          // after having checked for tags, change str to be exhaustive
          str = obj.toShorterString();
        }
        if (filterPattern != null && filterPattern.matcher(origStr).matches()) {
          // skip
        } else if (preserveLines) {
          if (NEWLINE_TOKEN.equals(origStr)) {
            beginLine = true;
            writer.newLine();
          } else {
            if (!beginLine) {
              writer.write(' ');
            } else {
              beginLine = false;
            }
            // writer.write(str.replace("\n", ""));
            writer.write(str);
          }
        } else if (oneLinePerElement) {
          if (!beginLine) {
            writer.write(' ');
          } else {
            beginLine = false;
          }
          writer.write(str);
        } else {
          writer.write(str);
          writer.newLine();
        }
      }
      numTokens++;
    }
    return numTokens;
  }

  /** This is a historical factory method that returns Word tokens.
   *  Note that Word tokens don't support the extra fields needed to make an invertible tokenizer.
   *
   *  @return A PTBTokenizerFactory that vends Word tokens.
   */
  public static TokenizerFactory<Word> factory() {
    return PTBTokenizerFactory.newTokenizerFactory();
  }

  /** @return A PTBTokenizerFactory that vends CoreLabel tokens. */
  public static TokenizerFactory<CoreLabel> factory(boolean tokenizeNLs, boolean invertible) {
    return PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeNLs, invertible);
  }

  /** @return A PTBTokenizerFactory that vends CoreLabel tokens with default tokenization. */
  public static TokenizerFactory<CoreLabel> coreLabelFactory() {
    return coreLabelFactory("");
  }

  /** @return A PTBTokenizerFactory that vends CoreLabel tokens with default tokenization. */
  public static TokenizerFactory<CoreLabel> coreLabelFactory(String options) {
    return PTBTokenizerFactory.newPTBTokenizerFactory(new CoreLabelTokenFactory(), options);
  }

  /** Get a TokenizerFactory that does Penn Treebank tokenization.
   *  This is now the recommended factory method to use.
   *
   *  @param factory A TokenFactory that determines what form of token is returned by the Tokenizer
   *  @param options A String specifying options (see the class javadoc for details)
   *  @param <T> The type of the tokens built by the LexedTokenFactory
   *  @return A TokenizerFactory that does Penn Treebank tokenization
   */
  public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) {
    return new PTBTokenizerFactory<>(factory, options);
  }


  /** This class provides a factory which will vend instances of PTBTokenizer
   *  which wrap a provided Reader. See the documentation for
   *  {@link PTBTokenizer} for details of the parameters and options.
   *
   *  @see PTBTokenizer
   *  @param <T> The class of the returned tokens
   */
  public static class PTBTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {

    private static final long serialVersionUID = -8859638719818931606L;

    protected final LexedTokenFactory<T> factory;
    protected String options;

    /**
     * Constructs a new TokenizerFactory that returns Word objects and
     * treats carriage returns as normal whitespace.
     * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
     * CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A
     * TokenizerFactory.
     *
     * @return A TokenizerFactory that returns Word objects
     */
    public static TokenizerFactory<Word> newTokenizerFactory() {
      return newPTBTokenizerFactory(new WordTokenFactory(), "invertible=false");
    }

    /**
     * Constructs a new PTBTokenizerFactory that returns Word objects and
     * uses the options passed in.
     * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
     * CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A
     * TokenizerFactory.
     *
     * @param options A String of options
     * @return A TokenizerFactory that returns Word objects
     */
    public static PTBTokenizerFactory<Word> newWordTokenizerFactory(String options) {
      return new PTBTokenizerFactory<>(new WordTokenFactory(), "invertible=false," + options);
    }
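
    /*
     * Usage sketch (the option string and sample text are only examples):
     *
     *   TokenizerFactory<CoreLabel> tf =
     *       PTBTokenizerFactory.newCoreLabelTokenizerFactory("invertible=true");
     *   List<CoreLabel> tokens = tf.getTokenizer(new StringReader("I can't do that.")).tokenize();
     *
     * The same factory can then be handed to other components that accept a TokenizerFactory,
     * so that a single set of options governs all tokenization.
     */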

    /**
     * Constructs a new PTBTokenizerFactory that returns CoreLabel objects and
     * uses the options passed in.
     *
     * @param options A String of options. For the default, recommended
     *           options for PTB-style tokenization compatibility, pass
     *           in an empty String.
     * @return A TokenizerFactory that returns CoreLabel objects
     */
    public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
      return new PTBTokenizerFactory<>(new CoreLabelTokenFactory(), options);
    }

    /**
     * Constructs a new PTBTokenizerFactory that uses the LexedTokenFactory and
     * options passed in.
     *
     * @param tokenFactory The LexedTokenFactory
     * @param options A String of options
     * @return A TokenizerFactory that returns objects of the type of the
     *         LexedTokenFactory
     */
    public static <T extends HasWord> PTBTokenizerFactory<T> newPTBTokenizerFactory(
        LexedTokenFactory<T> tokenFactory, String options) {
      return new PTBTokenizerFactory<>(tokenFactory, options);
    }

    public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
      return new PTBTokenizerFactory<>(tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
    }

    // Constructors

    // This one is historical
    private PTBTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
      this.factory = factory;
      StringBuilder optionsSB = new StringBuilder();
      if (suppressEscaping) {
        optionsSB.append("ptb3Escaping=false");
      } else {
        optionsSB.append("ptb3Escaping=true"); // i.e., turn on all the historical PTB normalizations
      }
      if (tokenizeNLs) {
        optionsSB.append(",tokenizeNLs");
      }
      if (invertible) {
        optionsSB.append(",invertible");
      }
      this.options = optionsSB.toString();
    }

    /** Make a factory for PTBTokenizers.
     *
     *  @param tokenFactory A factory for the token type that the tokenizer will return
     *  @param options Options to the tokenizer (see the class documentation for details)
     */
    private PTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
      this.factory = tokenFactory;
      this.options = options;
    }

    /** Returns a tokenizer wrapping the given Reader. */
    @Override
    public Iterator<T> getIterator(Reader r) {
      return getTokenizer(r);
    }

    /** Returns a tokenizer wrapping the given Reader. */
    @Override
    public Tokenizer<T> getTokenizer(Reader r) {
      return new PTBTokenizer<>(r, factory, options);
    }

    @Override
    public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
      if (options == null || options.isEmpty()) {
        return new PTBTokenizer<>(r, factory, extraOptions);
      } else {
        return new PTBTokenizer<>(r, factory, options + ',' + extraOptions);
      }
    }

    @Override
    public void setOptions(String options) {
      this.options = options;
    }

  } // end static class PTBTokenizerFactory


  /**
   * Command-line option specification.
   */
  private static Map<String, Integer> optionArgDefs() {
    Map<String, Integer> optionArgDefs = Generics.newHashMap();
    optionArgDefs.put("options", 1);
    optionArgDefs.put("ioFileList", 0);
    optionArgDefs.put("fileList", 0);
    optionArgDefs.put("lowerCase", 0);
    optionArgDefs.put("dump", 0);
    optionArgDefs.put("untok", 0);
    optionArgDefs.put("encoding", 1);
    optionArgDefs.put("parseInside", 1);
    optionArgDefs.put("filter", 1);
    optionArgDefs.put("preserveLines", 0);
    optionArgDefs.put("oneLinePerElement", 0);
    return optionArgDefs;
  }

  /**
   * Reads files given as arguments and prints their tokens, by default as
   * one per line. This is useful either for testing or to run
   * standalone to turn a corpus into a one-token-per-line file of tokens.
   * This main method assumes that the input file is in utf-8 encoding,
   * unless an encoding is specified.
   * <p>
   * Usage: {@code java edu.stanford.nlp.process.PTBTokenizer [options] filename+ }
   * <p>
   * Options:
   * <ul>
   * <li> -options options Set various tokenization options
   *       (see the documentation in the class javadoc).
   * <li> -preserveLines Produce space-separated tokens, except
   *       when the original had a line break, not one-token-per-line.
   * <li> -oneLinePerElement Print the tokens of an element space-separated on one line.
   *       An "element" is either a file or one of the elements matched by the
   *       parseInside regex. </li>
   * <li> -filter regex Delete any token that matches() (in its entirety) the given regex. </li>
   * <li> -encoding encoding Specifies a character encoding. If you do not
   *       specify one, the default is utf-8 (not the platform default).
   * <li> -lowerCase Lowercase all tokens (on tokenization).
   * <li> -parseInside regex Names an XML-style element or a regular expression
   *       over such elements. The tokenizer will only tokenize inside elements
   *       that match this regex. (This is done by regex matching, not an XML
   *       parser, but works well for simple XML documents, or other SGML-style
   *       documents, such as Linguistic Data Consortium releases, which adopt
   *       the convention that a line of a file is either XML markup or
   *       character data but never both.)
   * <li> -ioFileList file* The remaining command-line arguments are treated as
   *       filenames that themselves contain lists of pairs of input-output
   *       filenames (2 columns, whitespace separated). Alternatively, if there is only
   *       one filename per line, the output filename is the input filename with ".tok" appended.
   * <li> -fileList file* The remaining command-line arguments are treated as
   *       filenames that contain filenames, one per line. The output of tokenization is sent to
   *       stdout.
   * <li> -dump Print the whole of each CoreLabel, not just the value (word).
   * <li> -untok Heuristically untokenize tokenized text.
   * <li> -h, -help Print usage info.
   * </ul>
   * <p>
   * A note on {@code -preserveLines}: Basically, if you use this option, your output file should have
   * the same number of lines as your input file. If not, there is a bug. But the truth of this statement
   * depends on how you count lines. Unicode includes "line separator" and "paragraph separator" characters,
   * and Unicode says that you should accept them. See e.g., http://unicode.org/standard/reports/tr13/tr13-5.html
   * <p>
   * However, Unix and Linux utilities, etc. don't recognize them and count only the traditional \n|\r|\r\n.
   * And PTBTokenizer does normalize line separation. Hence, if your input text contains, say, U+2028 Line Separator
   * characters, the Unix wc utility will report more lines after tokenization than before,
   * even though line breaks have been preserved, according to Unicode. It may be useful to compare results with the
   * Perl uniwc script from https://raw.githubusercontent.com/briandfoy/Unicode-Tussle/master/script/uniwc
   * <p>
   * If it reports the same number of input and output lines, then the difference is only an artifact of how
   * lines are counted, and in a certain Unicode sense, our tokenizer did indeed preserve the line count.
   * If not, please send us a bug report. At present there is no way to disable this processing of Unicode separator
   * characters. If you don't want this anomaly, you'll need to either delete these two characters or to map them
   * to conventional Unix newline characters. Or to some other weirdo character.
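   * <p>
   * For example, an invocation along these lines (the filenames and option values are only
   * illustrative) writes one token per line to the redirected output file:
   * <pre>{@code
   * java edu.stanford.nlp.process.PTBTokenizer -options "ptb3Escaping=false" -encoding utf-8 input.txt > input.tok
   * }</pre>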
   *
   * @param args Command line arguments
   * @throws IOException If any file I/O problem
   */
  public static void main(String[] args) throws IOException {
    Properties options = StringUtils.argsToProperties(args, optionArgDefs());
    boolean showHelp = PropertiesUtils.getBool(options, "help", false);
    showHelp = PropertiesUtils.getBool(options, "h", showHelp);
    if (showHelp) {
      log.info("Usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
      log.info("  options: -h|-help|-options tokenizerOptions|-encoding encoding|-dump|");
      log.info("           -lowerCase|-preserveLines|-oneLinePerElement|-filter regex|");
      log.info("           -parseInside regex|-fileList|-ioFileList|-untok");
      return;
    }

    StringBuilder optionsSB = new StringBuilder();
    String tokenizerOptions = options.getProperty("options", null);
    if (tokenizerOptions != null) {
      optionsSB.append(tokenizerOptions);
    }
    boolean preserveLines = PropertiesUtils.getBool(options, "preserveLines", false);
    if (preserveLines) {
      optionsSB.append(",tokenizeNLs");
    }
    boolean oneLinePerElement = PropertiesUtils.getBool(options, "oneLinePerElement", false);
    boolean inputOutputFileList = PropertiesUtils.getBool(options, "ioFileList", false);
    boolean fileList = PropertiesUtils.getBool(options, "fileList", false);
    boolean lowerCase = PropertiesUtils.getBool(options, "lowerCase", false);
    boolean dump = PropertiesUtils.getBool(options, "dump", false);
    boolean untok = PropertiesUtils.getBool(options, "untok", false);
    String charset = options.getProperty("encoding", "utf-8");
    String parseInsideValue = options.getProperty("parseInside", null);

    Pattern parseInsidePattern = null;
    if (parseInsideValue != null) {
      try {
        // We still allow space, but PTBTokenizer will change space to U+00A0, so we need to also match it
        parseInsidePattern = Pattern.compile("<(/?)(?:" + parseInsideValue + ")(?:(?:\\s|\u00A0)[^>]*?)?>");
      } catch (PatternSyntaxException e) {
        // just go with null parseInsidePattern
      }
    }
    String filterValue = options.getProperty("filter", null);
    Pattern filterPattern = null;
    if (filterValue != null) {
      try {
        filterPattern = Pattern.compile(filterValue);
      } catch (PatternSyntaxException e) {
        // just go with null filterPattern
      }
    }

    // Other arguments are filenames
    String parsedArgStr = options.getProperty("", null);
    String[] parsedArgs = (parsedArgStr == null) ? null : parsedArgStr.split("\\s+");

    ArrayList<String> inputFileList = new ArrayList<>();
    ArrayList<String> outputFileList = null;
    if (parsedArgs != null) {
      if (fileList || inputOutputFileList) {
        outputFileList = new ArrayList<>();
        for (String fileName : parsedArgs) {
          BufferedReader r = IOUtils.readerFromString(fileName, charset);
          for (String inLine; (inLine = r.readLine()) != null; ) {
            String[] fields = inLine.split("\\s+");
            inputFileList.add(fields[0]);
            if (fields.length > 1) {
              outputFileList.add(fields[1]);
            } else {
              outputFileList.add(fields[0] + ".tok");
            }
          }
          r.close();
        }
        if (fileList) {
          // We're not actually going to use the outputFileList!
          outputFileList = null;
        }
      } else {
        // Concatenate input files into a single output file
        inputFileList.addAll(Arrays.asList(parsedArgs));
      }
    }

    if (untok) {
      untok(inputFileList, outputFileList, charset);
    } else {
      tok(inputFileList, outputFileList, charset, parseInsidePattern, filterPattern, optionsSB.toString(),
          preserveLines, oneLinePerElement, dump, lowerCase);
    }
  } // end main

} // end PTBTokenizer