Java tutorial: the StanfordCoreNLP annotation pipeline (edu.stanford.nlp.pipeline.StanfordCoreNLP)
// StanfordCoreNLP -- a suite of NLP tools.
// Copyright (c) 2009-2017 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 2A
//    Stanford CA 94305-9020
//    USA
//

package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.FileSequentialCollection;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.logging.StanfordRedwoodConfiguration;
// import static edu.stanford.nlp.util.logging.Redwood.Util.*;

import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.function.BiConsumer;
import java.util.function.BiFunction;
import java.util.function.Consumer;
import java.util.regex.Pattern;

/**
 * This is a pipeline that takes in a string and returns various analyzed
 * linguistic forms.
 * The String is tokenized via a tokenizer (using a TokenizerAnnotator), and
 * then other sequence-model-style annotators can add things like
 * lemmas, POS tags, and named entities. These are returned as a list of CoreLabels.
 * Other analysis components build and store parse trees, dependency graphs, etc.
 *
 * This class is designed to apply multiple Annotators
 * to an Annotation. The idea is that you first
 * build up the pipeline by adding Annotators, and then
 * you take the objects you wish to annotate and pass
 * them in and get in return a fully annotated object.
 * At the command-line level you can, e.g., tokenize text with StanfordCoreNLP with a command like:
 * <br><pre>
 * java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit -file document.txt
 * </pre><br>
 * Please see the package-level javadoc for sample usage
 * and a more complete description.
 *
 * The main entry point for the API is StanfordCoreNLP.process().
 *
 * <i>Implementation note:</i> There are other annotation pipelines, but they
 * don't extend this one. Look for classes that implement Annotator and which
 * have "Pipeline" in their name.
 *
 * @author Jenny Finkel
 * @author Anna Rafferty
 * @author Christopher Manning
 * @author Mihai Surdeanu
 * @author Steven Bethard
 */
public class StanfordCoreNLP extends AnnotationPipeline {

  public enum OutputFormat { TEXT, TAGGED, XML, JSON, CONLL, CONLLU, SERIALIZED, CUSTOM }

  private static String getDefaultExtension(OutputFormat outputFormat) {
    switch (outputFormat) {
      case XML: return ".xml";
      case JSON: return ".json";
      case CONLL: return ".conll";
      case CONLLU: return ".conllu";
      case TEXT: return ".out";
      case TAGGED: return ".tag";
      case SERIALIZED: return ".ser.gz";
      case CUSTOM: return ".out";
      default: throw new IllegalArgumentException("Unknown output format " + outputFormat);
    }
  }

  /**
   * An annotator name and its associated signature.
   * Used in {@link #GLOBAL_ANNOTATOR_CACHE}.
   */
  public static class AnnotatorSignature {
    public final String name;
    public final String signature;

    public AnnotatorSignature(String name, String signature) {
      this.name = name;
      this.signature = signature;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;
      AnnotatorSignature that = (AnnotatorSignature) o;
      return Objects.equals(name, that.name) &&
          Objects.equals(signature, that.signature);
    }

    @Override
    public int hashCode() {
      return Objects.hash(name, signature);
    }

    @Override
    public String toString() {
      return "AnnotatorSignature{name='" + name + "', signature='" + signature + "'}";
    }
  } // end static class AnnotatorSignature

  /**
   * A global cache of annotators, so we don't have to re-create one if there's enough memory floating around.
   */
  public static final Map<AnnotatorSignature, Lazy<Annotator>> GLOBAL_ANNOTATOR_CACHE = new ConcurrentHashMap<>();

  // other constants
  public static final String CUSTOM_ANNOTATOR_PREFIX = "customAnnotatorClass.";
  private static final String PROPS_SUFFIX = ".properties";
  public static final String NEWLINE_SPLITTER_PROPERTY = "ssplit.eolonly";
  public static final String NEWLINE_IS_SENTENCE_BREAK_PROPERTY = "ssplit.newlineIsSentenceBreak";
  public static final String DEFAULT_NEWLINE_IS_SENTENCE_BREAK = "never";
  public static final String DEFAULT_OUTPUT_FORMAT = "text";

  /** A logger for this class */
  private static final Redwood.RedwoodChannels logger = Redwood.channels(StanfordCoreNLP.class);

  /** Stores the overall number of words processed. */
  private int numWords;

  /** Stores the time (in milliseconds) required to construct the pipeline, for later statistics reporting. */
  private final long pipelineSetupTime;

  /** Properties for this pipeline. Always non-null. */
  private final Properties properties;

  private final Semaphore availableProcessors;

  /** The annotator pool we should be using to get annotators. */
  public final AnnotatorPool pool;

  /**
   * Constructs a pipeline using as properties the properties file found in the classpath.
   */
  public StanfordCoreNLP() {
    this((Properties) null);
  }

  /**
   * Construct a basic pipeline. The Properties will be used to determine
   * which annotators to create, and a default AnnotatorPool will be used
   * to create the annotators.
   */
  public StanfordCoreNLP(Properties props) {
    this(props, (props == null || PropertiesUtils.getBool(props, "enforceRequirements", true)));
  }

  public StanfordCoreNLP(Properties props, boolean enforceRequirements) {
    this(props, enforceRequirements, null);
  }

  /**
   * Constructs a pipeline with the properties read from this file, which must be found in the classpath.
   *
   * @param propsFileNamePrefix Filename/resource name of properties file without extension
   */
  public StanfordCoreNLP(String propsFileNamePrefix) {
    this(propsFileNamePrefix, true);
  }

  public StanfordCoreNLP(String propsFileNamePrefix, boolean enforceRequirements) {
    this(loadPropertiesOrException(propsFileNamePrefix), enforceRequirements);
  }
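
  /*
   * Example (a minimal sketch): typical programmatic use of the constructors
   * above. It assumes the standard English models jar is on the classpath;
   * the annotator list and text are illustrative only.
   */
  private static void exampleBasicUsage() {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    CoreDocument doc = new CoreDocument("Stanford is in California. It has a linguistics department.");
    pipeline.annotate(doc);
    for (CoreLabel token : doc.tokens()) {
      System.out.println(token.word() + "\t" + token.tag() + "\t" + token.lemma());
    }
  }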
  /**
   * Construct a CoreNLP with a custom Annotator Pool.
   */
  public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorPool annotatorPool) {
    Timing tim = new Timing();
    this.numWords = 0;

    if (props == null) {
      // if undefined, find the properties file in the classpath; this method returns non-null (it throws if none is found)
      props = loadPropertiesFromClasspath();
    } else if (props.getProperty("annotators") == null) {
      // this happens when some command line options are specified (e.g. just "-filelist") but no properties file is.
      // we use the options that are given and let them override the default properties from the classpath properties.
      Properties fromClassPath = loadPropertiesFromClasspath();
      fromClassPath.putAll(props);
      props = fromClassPath;
    }
    // handle new fileList by making sure filelist is also set [cdm2018: do in constructor so everyone gets the love]
    if (props.containsKey("fileList")) {
      props.setProperty("filelist", props.getProperty("fileList"));
    }
    this.properties = props; // from now on we use this.properties

    // cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
    this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());

    // Set threading
    if (this.properties.containsKey("threads")) {
      ArgumentParser.threads = PropertiesUtils.getInt(this.properties, "threads");
      this.availableProcessors = new Semaphore(ArgumentParser.threads);
    } else {
      this.availableProcessors = new Semaphore(1);
    }

    // now construct the annotators from the given properties in the given order
    String[] annoNames = getRequiredProperty(this.properties, "annotators").split("[, \t]+");
    Set<String> alreadyAddedAnnoNames = Generics.newHashSet();
    Set<Class<? extends CoreAnnotation>> requirementsSatisfied = Generics.newHashSet();
    for (String name : annoNames) {
      name = name.trim();
      if (name.isEmpty()) { continue; }
      logger.info("Adding annotator " + name);

      Annotator an = pool.get(name);
      this.addAnnotator(an);

      if (enforceRequirements) {
        Set<Class<? extends CoreAnnotation>> allRequirements = an.requires();
        for (Class<? extends CoreAnnotation> requirement : allRequirements) {
          if (!requirementsSatisfied.contains(requirement)) {
            String fmt = "annotator \"%s\" requires annotation \"%s\". The usual requirements for this annotator are: %s";
            throw new IllegalArgumentException(String.format(fmt,
                name, requirement.getSimpleName(),
                StringUtils.join(Annotator.DEFAULT_REQUIREMENTS.getOrDefault(name, Collections.singleton("unknown")), ",")));
          }
        }
        requirementsSatisfied.addAll(an.requirementsSatisfied());
      }

      alreadyAddedAnnoNames.add(name);
    }

    // Sanity check
    if (!alreadyAddedAnnoNames.contains(STANFORD_SSPLIT)) {
      System.setProperty(NEWLINE_SPLITTER_PROPERTY, "false");
    }

    this.pipelineSetupTime = tim.report();
  }
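
  /*
   * Example (sketch) of the requirement check in the constructor above:
   * with enforceRequirements left at its default of true, an annotator list
   * that omits a prerequisite (here "lemma" without "pos") fails fast with
   * an IllegalArgumentException instead of producing half-annotated output.
   */
  private static void exampleEnforceRequirements() {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,lemma"); // missing "pos", which lemma requires
    try {
      new StanfordCoreNLP(props);
    } catch (IllegalArgumentException e) {
      System.err.println("Rejected as expected: " + e.getMessage());
    }
  }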
  //
  // @Override-able methods to change pipeline behavior
  //

  /**
   * Get the implementation of each relevant annotator in the pipeline.
   * The primary use of this method is to be overwritten by subclasses of StanfordCoreNLP
   * to call different annotators that obey the exact same contract as the default
   * annotator.
   * <p>
   * The canonical use case for this is as an implementation of the Curator server,
   * where the annotators make server calls rather than calling each annotator locally.
   *
   * @return A class which specifies the actual implementation of each of the annotators called
   *         when creating the annotator pool. The canonical annotators are defaulted to in
   *         {@link edu.stanford.nlp.pipeline.AnnotatorImplementations}.
   */
  protected AnnotatorImplementations getAnnotatorImplementations() {
    return new AnnotatorImplementations();
  }

  //
  // property-specific methods
  //

  private static String getRequiredProperty(Properties props, String name) {
    String val = props.getProperty(name);
    if (val == null) {
      logger.error("Missing property \"" + name + "\"!");
      printRequiredProperties(System.err);
      throw new RuntimeException("Missing property: \"" + name + '\"');
    }
    return val;
  }

  /**
   * Finds the properties file in the classpath and loads the properties from there.
   *
   * @return The found properties object (must be not-null)
   * @throws RuntimeException If no properties file can be found on the classpath
   */
  private static Properties loadPropertiesFromClasspath() {
    List<String> validNames = Arrays.asList("StanfordCoreNLP", "edu.stanford.nlp.pipeline.StanfordCoreNLP");
    for (String name : validNames) {
      Properties props = loadProperties(name);
      if (props != null) return props;
    }
    throw new RuntimeException("ERROR: Could not find properties file in the classpath!");
  }

  private static Properties loadPropertiesOrException(String propsFileNamePrefix) {
    Properties props = loadProperties(propsFileNamePrefix);
    if (props == null) {
      throw new RuntimeIOException("ERROR: cannot find properties file \"" + propsFileNamePrefix + "\" in the classpath!");
    }
    return props;
  }

  private static Properties loadProperties(String name) {
    return loadProperties(name, Thread.currentThread().getContextClassLoader());
  }

  private static Properties loadProperties(String name, ClassLoader loader) {
    // check if name represents a Stanford CoreNLP supported language
    if (LanguageInfo.isStanfordCoreNLPSupportedLang(name))
      name = LanguageInfo.getLanguagePropertiesFile(name);
    if (name.endsWith(PROPS_SUFFIX)) name = name.substring(0, name.length() - PROPS_SUFFIX.length());
    name = name.replace('.', '/');
    name += PROPS_SUFFIX;
    Properties result = null;

    // Returns null on lookup failures
    InputStream in = loader.getResourceAsStream(name);
    try {
      if (in != null) {
        InputStreamReader reader = new InputStreamReader(in, "utf-8");
        result = new Properties();
        result.load(reader); // Can throw IOException
      }
    } catch (IOException e) {
      result = null;
    } finally {
      IOUtils.closeIgnoringExceptions(in);
    }
    if (result != null) {
      logger.info("Searching for resource: " + name + " ... found.");
    } else {
      logger.info("Searching for resource: " + name + " ... not found.");
    }
    return result;
  }

  /** Fetches the Properties object used to construct this Annotator. */
  public Properties getProperties() { return properties; }

  public String getEncoding() {
    return properties.getProperty("encoding", "UTF-8");
  }
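
  /*
   * Example (sketch) of how the String constructor resolves names through
   * loadProperties above. "french" assumes the French models jar is on the
   * classpath (and builds the full French pipeline, which is expensive);
   * "my.pkg.mypipeline" is a hypothetical resource resolved as
   * my/pkg/mypipeline.properties.
   */
  private static void examplePropertiesLookup() {
    // Resolved via LanguageInfo to the French properties file:
    StanfordCoreNLP french = new StanfordCoreNLP("french");
    System.out.println(french.getProperties().getProperty("annotators"));
    // A dotted prefix is mapped to a classpath path:
    StanfordCoreNLP custom = new StanfordCoreNLP("my.pkg.mypipeline");
    System.out.println(custom.getProperties().getProperty("annotators"));
  }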
  /**
   * Take a collection of requested annotators, and produce a list of annotators such that all of the
   * prerequisites for each of the annotators in the input are met.
   * For example, if the user requests lemma, ensure that pos is also run because lemma depends on
   * pos. As a side effect, this function orders the annotators in the proper order.
   * Note that this is not guaranteed to return a valid set of annotators,
   * as properties passed to the annotators can change their requirements.
   *
   * @param annotators The annotators the user has requested.
   * @param props The properties, which may be tweaked as a side effect (e.g., setting coref.md.type when no parser is present).
   * @return A sanitized annotators string with all prerequisites met.
   */
  public static String ensurePrerequisiteAnnotators(String[] annotators, Properties props) {
    // Get an unordered set of annotators
    Set<String> unorderedAnnotators = new LinkedHashSet<>(); // linked to preserve order
    Collections.addAll(unorderedAnnotators, annotators);
    for (String annotator : annotators) {
      // Add the annotator
      if (!getNamedAnnotators().containsKey(annotator.toLowerCase())) {
        throw new IllegalArgumentException("Unknown annotator: " + annotator);
      }
      // Add its transitive dependencies
      unorderedAnnotators.add(annotator.toLowerCase());
      if (!Annotator.DEFAULT_REQUIREMENTS.containsKey(annotator.toLowerCase())) {
        throw new IllegalArgumentException("Cannot infer requirements for annotator: " + annotator);
      }
      Queue<String> fringe = new LinkedList<>(Annotator.DEFAULT_REQUIREMENTS.get(annotator.toLowerCase()));
      int ticks = 0;
      while (!fringe.isEmpty()) {
        ticks += 1;
        if (ticks == 1000000) {
          throw new IllegalStateException("[INTERNAL ERROR] Annotators have a circular dependency.");
        }
        String prereq = fringe.poll();
        unorderedAnnotators.add(prereq);
        fringe.addAll(Annotator.DEFAULT_REQUIREMENTS.get(prereq.toLowerCase()));
      }
    }

    // Order the annotators
    List<String> orderedAnnotators = new ArrayList<>();
    while (!unorderedAnnotators.isEmpty()) {
      boolean somethingAdded = false; // to make sure the dependencies are satisfiable
      // Loop over candidate annotators to add
      Iterator<String> iter = unorderedAnnotators.iterator();
      while (iter.hasNext()) {
        String candidate = iter.next();
        // Are the requirements satisfied?
        boolean canAdd = true;
        for (String prereq : Annotator.DEFAULT_REQUIREMENTS.get(candidate.toLowerCase())) {
          if (!orderedAnnotators.contains(prereq)) {
            canAdd = false;
            break;
          }
        }
        // If so, add the annotator
        if (canAdd) {
          orderedAnnotators.add(candidate);
          iter.remove();
          somethingAdded = true;
        }
      }
      // Make sure we're making progress every iteration, to prevent an infinite loop
      if (!somethingAdded) {
        throw new IllegalArgumentException("Unsatisfiable annotator list: " + StringUtils.join(annotators, ","));
      }
    }

    // Remove depparse if parse is present and depparse was not explicitly requested --
    // the parse annotator already produces dependencies, so running both is redundant
    if (orderedAnnotators.contains(STANFORD_PARSE) && !ArrayUtils.contains(annotators, STANFORD_DEPENDENCIES)) {
      orderedAnnotators.remove(STANFORD_DEPENDENCIES);
    }

    // Tweak the properties, if necessary
    // (set the mention annotator to use dependency trees, if appropriate)
    if ((orderedAnnotators.contains(Annotator.STANFORD_COREF_MENTION) || orderedAnnotators.contains(Annotator.STANFORD_COREF)) &&
        !orderedAnnotators.contains(Annotator.STANFORD_PARSE) &&
        !props.containsKey("coref.md.type")) {
      props.setProperty("coref.md.type", "dep");
    }
    // (ensure regexner is after ner)
    if (orderedAnnotators.contains(Annotator.STANFORD_NER) && orderedAnnotators.contains(STANFORD_REGEXNER)) {
      orderedAnnotators.remove(STANFORD_REGEXNER);
      int nerIndex = orderedAnnotators.indexOf(Annotator.STANFORD_NER);
      orderedAnnotators.add(nerIndex + 1, STANFORD_REGEXNER);
    }
    // (ensure coref is before openie)
    if (orderedAnnotators.contains(Annotator.STANFORD_COREF) && orderedAnnotators.contains(STANFORD_OPENIE)) {
      int maxIndex = Math.max(orderedAnnotators.indexOf(STANFORD_OPENIE), orderedAnnotators.indexOf(STANFORD_COREF));
      if (Objects.equals(orderedAnnotators.get(maxIndex), STANFORD_OPENIE)) {
        orderedAnnotators.add(maxIndex, STANFORD_COREF);
        orderedAnnotators.remove(STANFORD_COREF);
      } else {
        orderedAnnotators.add(maxIndex + 1, STANFORD_OPENIE);
        orderedAnnotators.remove(STANFORD_OPENIE);
      }
    }

    // Return
    return StringUtils.join(orderedAnnotators, ",");
  }

  /**
   * Check if we can construct an XML outputter.
   *
   * @return Whether we can construct an XML outputter.
   */
  private static boolean isXMLOutputPresent() {
    try {
      Class.forName("edu.stanford.nlp.pipeline.XMLOutputter");
    } catch (ClassNotFoundException | NoClassDefFoundError ex) {
      return false;
    }
    return true;
  }

  //
  // AnnotatorPool construction support
  //

  /**
   * Call this if you are no longer using StanfordCoreNLP and want to
   * release the memory associated with the annotators.
   */
  public static synchronized void clearAnnotatorPool() {
    logger.warn("Clearing CoreNLP annotation pool; this should be unnecessary in production");
    GLOBAL_ANNOTATOR_CACHE.clear();
  }

  /**
   * This function defines the list of named annotators in CoreNLP, along with how to construct them.
   *
   * @return A map from annotator name, to the function which constructs that annotator.
   */
  private static Map<String, BiFunction<Properties, AnnotatorImplementations, Annotator>> getNamedAnnotators() {
    Map<String, BiFunction<Properties, AnnotatorImplementations, Annotator>> pool = new HashMap<>();
    pool.put(STANFORD_TOKENIZE, (props, impl) -> impl.tokenizer(props));
    pool.put(STANFORD_CLEAN_XML, (props, impl) -> impl.cleanXML(props));
    pool.put(STANFORD_SSPLIT, (props, impl) -> impl.wordToSentences(props));
    pool.put(STANFORD_MWT, (props, impl) -> impl.multiWordToken(props));
    pool.put(STANFORD_DOCDATE, (props, impl) -> impl.docDate(props));
    pool.put(STANFORD_POS, (props, impl) -> impl.posTagger(props));
    pool.put(STANFORD_LEMMA, (props, impl) -> impl.morpha(props, false));
    pool.put(STANFORD_NER, (props, impl) -> impl.ner(props));
    pool.put(STANFORD_TOKENSREGEX, (props, impl) -> impl.tokensregex(props, STANFORD_TOKENSREGEX));
    pool.put(STANFORD_REGEXNER, (props, impl) -> impl.tokensRegexNER(props, STANFORD_REGEXNER));
    pool.put(STANFORD_ENTITY_MENTIONS, (props, impl) -> impl.entityMentions(props, STANFORD_ENTITY_MENTIONS));
    pool.put(STANFORD_GENDER, (props, impl) -> impl.gender(props, STANFORD_GENDER));
    pool.put(STANFORD_TRUECASE, (props, impl) -> impl.trueCase(props));
    pool.put(STANFORD_PARSE, (props, impl) -> impl.parse(props));
    pool.put(STANFORD_COREF_MENTION, (props, impl) -> impl.corefMention(props));
    pool.put(STANFORD_DETERMINISTIC_COREF, (props, impl) -> impl.dcoref(props));
    pool.put(STANFORD_COREF, (props, impl) -> impl.coref(props));
    pool.put(STANFORD_RELATION, (props, impl) -> impl.relations(props));
    pool.put(STANFORD_SENTIMENT, (props, impl) -> impl.sentiment(props, STANFORD_SENTIMENT));
    pool.put(STANFORD_COLUMN_DATA_CLASSIFIER, (props, impl) -> impl.columnData(props));
    pool.put(STANFORD_DEPENDENCIES, (props, impl) -> impl.dependencies(props));
    pool.put(STANFORD_NATLOG, (props, impl) -> impl.natlog(props));
    pool.put(STANFORD_OPENIE, (props, impl) -> impl.openie(props));
    pool.put(STANFORD_QUOTE, (props, impl) -> impl.quote(props));
    pool.put(STANFORD_QUOTE_ATTRIBUTION, (props, impl) -> impl.quoteattribution(props));
    pool.put(STANFORD_UD_FEATURES, (props, impl) -> impl.udfeats(props));
    pool.put(STANFORD_LINK, (props, impl) -> impl.link(props));
    pool.put(STANFORD_KBP, (props, impl) -> impl.kbp(props));
    return pool;
  }
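
  /*
   * Example (sketch): expanding a user's annotator list with
   * ensurePrerequisiteAnnotators above. Requesting only "lemma" pulls in its
   * transitive prerequisites from Annotator.DEFAULT_REQUIREMENTS and orders
   * them; the expansion shown in the comment is indicative, not guaranteed.
   */
  private static void examplePrerequisiteExpansion() {
    Properties props = new Properties();
    String expanded = ensurePrerequisiteAnnotators(new String[]{ "lemma" }, props);
    System.out.println(expanded); // e.g. "tokenize,ssplit,pos,lemma"
  }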
  /**
   * Construct the default annotator pool, and save it as the static annotator pool for CoreNLP.
   *
   * @see StanfordCoreNLP#constructAnnotatorPool(Properties, AnnotatorImplementations)
   */
  public static synchronized AnnotatorPool getDefaultAnnotatorPool(final Properties inputProps, final AnnotatorImplementations annotatorImplementation) {
    // if the pool already exists reuse!
    AnnotatorPool pool = AnnotatorPool.SINGLETON;
    for (Map.Entry<String, BiFunction<Properties, AnnotatorImplementations, Annotator>> entry : getNamedAnnotators().entrySet()) {
      AnnotatorSignature key = new AnnotatorSignature(entry.getKey(), PropertiesUtils.getSignature(entry.getKey(), inputProps));
      pool.register(entry.getKey(), inputProps, GLOBAL_ANNOTATOR_CACHE.computeIfAbsent(key,
          (sig) -> Lazy.cache(() -> entry.getValue().apply(inputProps, annotatorImplementation))));
    }
    registerCustomAnnotators(pool, annotatorImplementation, inputProps);
    return pool;
  }

  /**
   * Register any custom annotators defined in the input properties, and add them to the pool.
   *
   * @param pool The annotator pool to add the new custom annotators to.
   * @param annotatorImplementation The implementation thunk to use to create any new annotators.
   * @param inputProps The properties to read new annotator definitions from.
   */
  private static void registerCustomAnnotators(AnnotatorPool pool, AnnotatorImplementations annotatorImplementation, Properties inputProps) {
    // add annotators loaded via reflection from class names specified in the properties
    for (String property : inputProps.stringPropertyNames()) {
      if (property.startsWith(CUSTOM_ANNOTATOR_PREFIX)) {
        final String customName = property.substring(CUSTOM_ANNOTATOR_PREFIX.length());
        final String customClassName = inputProps.getProperty(property);
        logger.info("Registering annotator " + customName + " with class " + customClassName);
        AnnotatorSignature key = new AnnotatorSignature(customName, PropertiesUtils.getSignature(customName, inputProps));
        pool.register(customName, inputProps, GLOBAL_ANNOTATOR_CACHE.computeIfAbsent(key,
            (sig) -> Lazy.cache(() -> annotatorImplementation.custom(inputProps, property))));
      }
    }
  }
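
  /*
   * Example (sketch): wiring a custom annotator through the
   * customAnnotatorClass. prefix handled by registerCustomAnnotators above.
   * com.example.WordCountAnnotator is hypothetical; a custom annotator must
   * implement Annotator and provide a (String, Properties) constructor so it
   * can be built reflectively.
   */
  private static void exampleCustomAnnotator() {
    Properties props = new Properties();
    props.setProperty("customAnnotatorClass.wordcount", "com.example.WordCountAnnotator");
    props.setProperty("annotators", "tokenize,ssplit,wordcount");
    // enforceRequirements=false sidesteps the prerequisite check for the custom stage
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props, false);
    pipeline.annotate(new Annotation("One two three."));
  }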
  /**
   * Construct the default annotator pool from the passed-in properties, overwriting annotators
   * which have changed since the last call.
   *
   * @param inputProps Properties to determine behavior of annotators
   * @param annotatorImplementation Source of annotator implementations
   * @return A populated AnnotatorPool
   */
  private static AnnotatorPool constructAnnotatorPool(final Properties inputProps, final AnnotatorImplementations annotatorImplementation) {
    AnnotatorPool pool = new AnnotatorPool();
    for (Map.Entry<String, BiFunction<Properties, AnnotatorImplementations, Annotator>> entry : getNamedAnnotators().entrySet()) {
      AnnotatorSignature key = new AnnotatorSignature(entry.getKey(), PropertiesUtils.getSignature(entry.getKey(), inputProps));
      pool.register(entry.getKey(), inputProps, GLOBAL_ANNOTATOR_CACHE.computeIfAbsent(key,
          (sig) -> Lazy.cache(() -> entry.getValue().apply(inputProps, annotatorImplementation))));
    }
    registerCustomAnnotators(pool, annotatorImplementation, inputProps);
    return pool;
  }

  public static synchronized Annotator getExistingAnnotator(String name) {
    Optional<Annotator> annotator = GLOBAL_ANNOTATOR_CACHE.entrySet().stream()
        .filter(entry -> name.equals(entry.getKey().name))
        .map(entry -> Optional.ofNullable(entry.getValue().getIfDefined()))
        .filter(Optional::isPresent)
        .map(Optional::get)
        .findFirst();
    if (annotator.isPresent()) {
      return annotator.get();
    } else {
      logger.error("Attempted to fetch annotator \"" + name + "\" but the annotator pool does not store any such type!");
      return null;
    }
  }

  /** Annotate the CoreDocument wrapper. */
  public void annotate(CoreDocument document) {
    // annotate the underlying Annotation
    this.annotate(document.annotationDocument);
    // wrap the sentences and entity mentions post-annotation
    document.wrapAnnotations();
  }

  /** {@inheritDoc} */
  @Override
  public void annotate(Annotation annotation) {
    super.annotate(annotation);
    List<CoreLabel> words = annotation.get(CoreAnnotations.TokensAnnotation.class);
    if (words != null) {
      numWords += words.size();
    }
  }

  public void annotate(final Annotation annotation, final Consumer<Annotation> callback) {
    if (PropertiesUtils.getInt(properties, "threads", 1) == 1) {
      annotate(annotation);
      callback.accept(annotation);
    } else {
      try {
        availableProcessors.acquire();
      } catch (InterruptedException e) {
        throw new RuntimeInterruptedException(e);
      }
      new Thread(() -> {
        try {
          annotate(annotation);
        } catch (Throwable t) {
          annotation.set(CoreAnnotations.ExceptionAnnotation.class, t);
        }
        callback.accept(annotation);
        availableProcessors.release();
      }).start();
    }
  }

  /**
   * Determines whether the parser annotator should default to
   * producing binary trees. Currently there is only one condition
   * under which this is true: the sentiment annotator is used.
   */
  public static boolean usesBinaryTrees(Properties props) {
    Set<String> annoNames = Generics.newHashSet(Arrays.asList(props.getProperty("annotators", "").split("[, \t]+")));
    return annoNames.contains(STANFORD_SENTIMENT);
  }

  /**
   * Runs the entire pipeline on the content of the given text passed in.
   *
   * @param text The text to process
   * @return An Annotation object containing the output of all annotators
   */
  public Annotation process(String text) {
    Annotation annotation = new Annotation(text);
    annotate(annotation);
    return annotation;
  }
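
  /*
   * Example (sketch): the asynchronous annotate overload above. When the
   * "threads" property is set on the pipeline, up to that many documents are
   * annotated concurrently; each callback fires when its document finishes,
   * and any Throwable is parked on ExceptionAnnotation rather than thrown.
   */
  private static void exampleAsyncAnnotate(StanfordCoreNLP pipeline, List<String> texts) {
    for (String text : texts) {
      pipeline.annotate(new Annotation(text), finished -> {
        Throwable err = finished.get(CoreAnnotations.ExceptionAnnotation.class);
        if (err != null) {
          System.err.println("Annotation failed: " + err);
        } else {
          System.out.println("Annotated " + finished.get(CoreAnnotations.TokensAnnotation.class).size() + " tokens");
        }
      });
    }
  }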
  //
  // output and formatting methods (including XML-specific methods)
  //

  /**
   * Displays the output of all annotators in a format easily readable by people.
   *
   * @param annotation Contains the output of all annotators
   * @param os The output stream
   */
  public void prettyPrint(Annotation annotation, OutputStream os) {
    TextOutputter.prettyPrint(annotation, os, this);
  }

  /**
   * Displays the output of all annotators in a format easily readable by people.
   *
   * @param annotation Contains the output of all annotators
   * @param os The output stream
   */
  public void prettyPrint(Annotation annotation, PrintWriter os) {
    TextOutputter.prettyPrint(annotation, os, this);
  }

  /**
   * Wrapper around xmlPrint(Annotation, OutputStream).
   * Added for backward compatibility.
   *
   * @param annotation The Annotation to print
   * @param w The Writer to send the output to
   * @throws IOException If any IO problem
   */
  public void xmlPrint(Annotation annotation, Writer w) throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    xmlPrint(annotation, os); // this builds it as the encoding specified in the properties
    w.write(new String(os.toByteArray(), getEncoding()));
    w.flush();
  }

  /**
   * Displays the output of all annotators in XML format.
   *
   * @param annotation Contains the output of all annotators
   * @param os The output stream
   * @throws IOException If any IO problem
   */
  public void xmlPrint(Annotation annotation, OutputStream os) throws IOException {
    try {
      // XMLOutputter is loaded reflectively so the XML support stays an optional dependency
      Class<?> clazz = Class.forName("edu.stanford.nlp.pipeline.XMLOutputter");
      Method method = clazz.getMethod("xmlPrint", Annotation.class, OutputStream.class, StanfordCoreNLP.class);
      method.invoke(null, annotation, os, this);
    } catch (NoSuchMethodException | IllegalAccessException | ClassNotFoundException | InvocationTargetException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Displays the output of all annotators in JSON format.
   *
   * @param annotation Contains the output of all annotators
   * @param w The Writer to send the output to
   * @throws IOException If any IO problem
   */
  public void jsonPrint(Annotation annotation, Writer w) throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    JSONOutputter.jsonPrint(annotation, os, this);
    w.write(new String(os.toByteArray(), getEncoding()));
    w.flush();
  }
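
  /*
   * Example (sketch): capturing JSON output in memory with jsonPrint above.
   * The same Writer-based pattern works for prettyPrint and xmlPrint (the
   * latter only when the XML outputter is on the classpath).
   */
  private static String exampleJsonString(StanfordCoreNLP pipeline, String text) throws IOException {
    Annotation annotation = pipeline.process(text);
    StringWriter json = new StringWriter();
    pipeline.jsonPrint(annotation, json);
    return json.toString();
  }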
  /**
   * Displays the output of many annotators in CoNLL format.
   * (Only used by CoreNLPServlet.)
   *
   * @param annotation Contains the output of all annotators
   * @param w The Writer to send the output to
   * @throws IOException If any IO problem
   */
  public void conllPrint(Annotation annotation, Writer w) throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    CoNLLOutputter.conllPrint(annotation, os, this);
    w.write(new String(os.toByteArray(), getEncoding()));
    w.flush();
  }

  //
  // runtime, shell-specific, and help menu methods
  //

  /**
   * Prints help about a given topic, or the list of properties required to run the pipeline.
   *
   * @param os PrintStream to print usage to
   * @param helpTopic a topic to print help about (or null for general options)
   */
  protected static void printHelp(PrintStream os, String helpTopic) {
    if (helpTopic.toLowerCase().startsWith("pars")) {
      os.println("StanfordCoreNLP currently supports the following parsers:");
      os.println("\tstanford - Stanford lexicalized parser (default)");
      os.println("\tcharniak - Charniak and Johnson reranking parser (sold separately)");
      os.println();
      os.println("General options: (all parsers)");
      os.println("\tparse.type - selects the parser to use");
      os.println("\tparse.model - path to model file for parser");
      os.println("\tparse.maxlen - maximum sentence length");
      os.println();
      os.println("Stanford Parser-specific options:");
      os.println("(In general, you shouldn't need to set these flags)");
      os.println("\tparse.flags - extra flags to the parser (default: -retainTmpSubcategories)");
      os.println("\tparse.debug - set to true to make the parser slightly more verbose");
      os.println();
      os.println("Charniak and Johnson parser-specific options:");
      os.println("\tparse.executable - path to the parseIt binary or parse.sh script");
    } else {
      // argsToProperties will set the value of a -h or -help to "true" if no arguments are given
      if (!helpTopic.equalsIgnoreCase("true")) {
        os.println("Unknown help topic: " + helpTopic);
        os.println("See -help for a list of all help topics.");
      } else {
        printRequiredProperties(os);
      }
    }
  }

  /**
   * Prints the list of properties required to run the pipeline.
   *
   * @param os PrintStream to print usage to
   */
  private static void printRequiredProperties(PrintStream os) {
    // TODO some annotators (ssplit, regexner, gender, some parser options, dcoref?) are not documented
    os.println("The following properties can be defined:");
    os.println("(if -props or -annotators is not passed in, default properties will be loaded via the classpath)");
    os.println("\t\"props\" - path to file with configuration properties");
    os.println("\t\"annotators\" - comma separated list of annotators");
    os.println("\tThe following annotators are supported: cleanxml, tokenize, quote, ssplit, pos, lemma, ner, truecase, parse, hcoref, relation");
    os.println();
    os.println("\tIf annotator \"tokenize\" is defined:");
    os.println("\t\"tokenize.options\" - PTBTokenizer options (see edu.stanford.nlp.process.PTBTokenizer for details)");
    os.println("\t\"tokenize.whitespace\" - If true, just use whitespace tokenization");
    os.println();
    os.println("\tIf annotator \"cleanxml\" is defined:");
    os.println("\t\"clean.xmltags\" - regex of tags to extract text from");
    os.println("\t\"clean.sentenceendingtags\" - regex of tags which mark sentence endings");
    os.println("\t\"clean.allowflawedxml\" - if set to true, don't complain about XML errors");
    os.println();
    os.println("\tIf annotator \"pos\" is defined:");
    os.println("\t\"pos.maxlen\" - maximum length of sentence to POS tag");
    os.println("\t\"pos.model\" - path towards the POS tagger model");
    os.println();
    os.println("\tIf annotator \"ner\" is defined:");
    os.println("\t\"ner.model\" - paths for the ner models. By default, the English 3 class, 7 class, and 4 class models are used.");
    os.println("\t\"ner.useSUTime\" - Whether or not to use sutime (English specific)");
    os.println("\t\"ner.applyNumericClassifiers\" - whether or not to use any numeric classifiers (English specific)");
    os.println();
    os.println("\tIf annotator \"truecase\" is defined:");
    os.println("\t\"truecase.model\" - path towards the true-casing model; default: " + DefaultPaths.DEFAULT_TRUECASE_MODEL);
    os.println("\t\"truecase.bias\" - class bias of the true case model; default: " + TrueCaseAnnotator.DEFAULT_MODEL_BIAS);
    os.println("\t\"truecase.mixedcasefile\" - path towards the mixed case file; default: " + DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
    os.println();
    os.println("\tIf annotator \"relation\" is defined:");
    os.println("\t\"sup.relation.verbose\" - whether verbose or not");
    os.println("\t\"sup.relation.model\" - path towards the relation extraction model");
    os.println();
    os.println("\tIf annotator \"parse\" is defined:");
    os.println("\t\"parse.model\" - path towards the PCFG parser model");

    /* XXX: unstable, do not use for now
    os.println();
    os.println("\tIf annotator \"srl\" is defined:");
    os.println("\t\"srl.verb.args\" - path to the file listing verbs and their core arguments (\"verbs.core_args\")");
    os.println("\t\"srl.model.id\" - path prefix for the role identification model (adds \".model.gz\" and \".fe\" to this prefix)");
    os.println("\t\"srl.model.cls\" - path prefix for the role classification model (adds \".model.gz\" and \".fe\" to this prefix)");
    os.println("\t\"srl.model.jic\" - path to the directory containing the joint model's \"model.gz\", \"fe\" and \"je\" files");
    os.println("\t (if not specified, the joint model will not be used)");
    */

    os.println();
    os.println("Command line properties:");
    os.println("\t\"file\" - run the pipeline on the content of this file, or on the content of the files in this directory");
    os.println("\t XML output is generated for every input file \"file\" as file.xml");
    os.println("\t\"extension\" - if -file used with a directory, process only the files with this extension");
    os.println("\t\"fileList\" - run the pipeline on the list of files given in this file");
    os.println("\t output is generated for every input file as file.outputExtension");
    os.println("\t\"outputDirectory\" - where to put output (defaults to the current directory)");
    os.println("\t\"outputExtension\" - extension to use for the output file (defaults to \".xml\" for XML, \".ser.gz\" for serialized). Don't forget the dot!");
    os.println("\t\"outputFormat\" - \"text\" (default), \"tagged\", \"json\", \"conll\", \"conllu\", \"serialized\", \"xml\" or \"custom\"");
    os.println("\t\"customOutputter\" - specify a class to a custom outputter instead of a pre-defined output format");
    os.println("\t\"serializer\" - Class of annotation serializer to use when outputFormat is \"serialized\". By default, uses ProtobufAnnotationSerializer.");
    os.println("\t\"replaceExtension\" - flag to chop off the last extension before adding outputExtension to file");
    os.println("\t\"noClobber\" - don't automatically override (clobber) output files that already exist");
    os.println("\t\"isOneDocument\" - (for piped input only) treat the text till eof as one document rather than one document per line");
    os.println("\t\"threads\" - multithread on this number of threads");
    os.println();
    os.println("If none of the above are present, run the pipeline in an interactive shell (default properties will be loaded from the classpath).");
    os.println("The shell accepts input from stdin and displays the output at stdout.");
    os.println();
    os.println("Run with -help [topic] for more help on a specific topic.");
    os.println("Current topics include: parser");
    os.println();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public String timingInformation() {
    StringBuilder sb = new StringBuilder(super.timingInformation());
    if (TIME && numWords >= 0) {
      long total = this.getTotalTime();
      sb.append(" for ").append(this.numWords).append(" tokens at ");
      sb.append(String.format("%.1f", numWords / (((double) total) / 1000)));
      sb.append(" tokens/sec.");
    }
    return sb.toString();
  }
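
  /*
   * Example (sketch): driving the batch entry point programmatically, which
   * is equivalent to the command line shown in the class javadoc. input.txt
   * is a placeholder; with these arguments the output lands in ./input.txt.json.
   */
  private static void exampleCommandLineEquivalent() throws IOException {
    main(new String[]{
        "-annotators", "tokenize,ssplit,pos",
        "-outputFormat", "json",
        "-file", "input.txt"
    });
  }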
Type q RETURN or EOF to quit."); } while (true) { if (isTty) { System.err.print("NLP> "); } String line; if (oneDocument) { line = IOUtils.slurpReader(r); } else { line = r.readLine(); } if (line == null || isTty && line.equalsIgnoreCase("q")) { break; } if (!line.isEmpty()) { Annotation anno = process(line); outputAnnotation(System.out, anno, properties, options); } if (oneDocument) { break; } } } protected static Collection<File> readFileList(String fileName) { return ObjectBank.getLineIterator(fileName, new ObjectBank.PathToFileFunction()); } private static AnnotationSerializer loadSerializer(String serializerClass, String name, Properties properties) { AnnotationSerializer serializer; // initialized below try { // Try loading with properties serializer = ReflectionLoading.loadByReflection(serializerClass, name, properties); } catch (ReflectionLoading.ReflectionLoadingException ex) { // Try loading with just default constructor serializer = ReflectionLoading.loadByReflection(serializerClass); } return serializer; } /** * Create an outputter to be passed into {@link StanfordCoreNLP#processFiles(String, Collection, int, Properties, BiConsumer, BiConsumer, OutputFormat, boolean)}. * * @param properties The properties file to use. * * @return A consumer that can be passed into the processFiles method. */ public static BiConsumer<Annotation, OutputStream> createOutputter(Properties properties, AnnotationOutputter.Options options) { return (Annotation annotation, OutputStream fos) -> { try { outputAnnotation(fos, annotation, properties, options); } catch (IOException e) { throw new RuntimeIOException(e); } }; } private static void outputAnnotation(OutputStream fos, Annotation annotation, Properties properties, AnnotationOutputter.Options outputOptions) throws IOException { final OutputFormat outputFormat = OutputFormat .valueOf(properties.getProperty("outputFormat", DEFAULT_OUTPUT_FORMAT).toUpperCase()); switch (outputFormat) { case XML: AnnotationOutputter outputter = MetaClass.create("edu.stanford.nlp.pipeline.XMLOutputter") .createInstance(); outputter.print(annotation, fos, outputOptions); break; case JSON: new JSONOutputter().print(annotation, fos, outputOptions); break; case CONLL: new CoNLLOutputter().print(annotation, fos, outputOptions); break; case TEXT: new TextOutputter().print(annotation, fos, outputOptions); break; case TAGGED: new TaggedTextOutputter().print(annotation, fos, outputOptions); break; case SERIALIZED: final String serializerClass = properties.getProperty("serializer", ProtobufAnnotationSerializer.class.getName()); final String outputSerializerClass = properties.getProperty("outputSerializer", serializerClass); final String outputSerializerName = (serializerClass.equals(outputSerializerClass)) ? 
"serializer" : "outputSerializer"; if (outputSerializerClass != null) { AnnotationSerializer outputSerializer = loadSerializer(outputSerializerClass, outputSerializerName, properties); outputSerializer.write(annotation, fos); } break; case CONLLU: new CoNLLUOutputter(properties).print(annotation, fos, outputOptions); break; case CUSTOM: AnnotationOutputter customOutputter = ReflectionLoading .loadByReflection(properties.getProperty("customOutputter")); customOutputter.print(annotation, fos, outputOptions); break; default: throw new IllegalArgumentException("Unknown output format " + outputFormat); } } /** * Helper method for printing out timing info after an annotation run * * @param pipeline the StanfordCoreNLP pipeline to log timing info for * @param tim the Timing object to log timing info */ private static void logTimingInfo(StanfordCoreNLP pipeline, Timing tim) { logger.info(""); // puts blank line in logging output logger.info(pipeline.timingInformation()); logger.info("Pipeline setup: " + Timing.toSecondsString(pipeline.pipelineSetupTime) + " sec."); logger.info("Total time for StanfordCoreNLP pipeline: " + Timing.toSecondsString(pipeline.pipelineSetupTime + tim.report()) + " sec."); } /** * Process a collection of files. * * @param base The base input directory to process from. * @param files The files to process. * @param numThreads The number of threads to annotate on. * @param clearPool Whether or not to clear pool when process is done * * @throws IOException */ public void processFiles(String base, final Collection<File> files, int numThreads, boolean clearPool, Optional<Timing> tim) throws IOException { AnnotationOutputter.Options options = AnnotationOutputter.getOptions(properties); StanfordCoreNLP.OutputFormat outputFormat = StanfordCoreNLP.OutputFormat .valueOf(properties.getProperty("outputFormat", DEFAULT_OUTPUT_FORMAT).toUpperCase()); processFiles(base, files, numThreads, properties, this::annotate, createOutputter(properties, options), outputFormat, clearPool, Optional.of(this), tim); } protected static void processFiles(String base, final Collection<File> files, int numThreads, Properties properties, BiConsumer<Annotation, Consumer<Annotation>> annotate, BiConsumer<Annotation, OutputStream> print, OutputFormat outputFormat, boolean clearPool) throws IOException { processFiles(base, files, numThreads, properties, annotate, print, outputFormat, clearPool, Optional.empty(), Optional.empty()); } /** * A common method for processing a set of files, used in both {@link StanfordCoreNLP} as well as * {@link StanfordCoreNLPClient}. * * @param base The base input directory to process from. * @param files The files to process. * @param numThreads The number of threads to annotate on. * @param properties The properties file to use during annotation. * This should match the properties file used in the implementation of the annotate function. * @param annotate The function used to annotate a document. * @param print The function used to print a document. 
  /**
   * A common method for processing a set of files, used in both {@link StanfordCoreNLP} as well as
   * {@link StanfordCoreNLPClient}.
   *
   * @param base The base input directory to process from.
   * @param files The files to process.
   * @param numThreads The number of threads to annotate on.
   * @param properties The properties file to use during annotation.
   *                   This should match the properties file used in the implementation of the annotate function.
   * @param annotate The function used to annotate a document.
   * @param print The function used to print a document.
   * @param outputFormat The format used for printing out documents
   * @param clearPool Whether or not to clear the pool when done
   * @param pipeline the pipeline annotating the objects
   * @param tim the Timing object for this annotation run
   *
   * @throws IOException If any IO problem
   */
  protected static void processFiles(String base, final Collection<File> files, int numThreads,
                                     Properties properties, BiConsumer<Annotation, Consumer<Annotation>> annotate,
                                     BiConsumer<Annotation, OutputStream> print, OutputFormat outputFormat, boolean clearPool,
                                     Optional<StanfordCoreNLP> pipeline, Optional<Timing> tim) throws IOException {
    // List<Runnable> toRun = new LinkedList<>();

    // Process properties here
    final String baseOutputDir = properties.getProperty("outputDirectory", ".");
    final String baseInputDir = properties.getProperty("inputDirectory", base);

    // Set of files to exclude
    final String excludeFilesParam = properties.getProperty("excludeFiles");
    final Set<String> excludeFiles = new HashSet<>();
    if (excludeFilesParam != null) {
      Iterable<String> lines = IOUtils.readLines(excludeFilesParam);
      for (String line : lines) {
        String name = line.trim();
        if (!name.isEmpty()) excludeFiles.add(name);
      }
    }

    // (file info)
    final String serializerClass = properties.getProperty("serializer", GenericAnnotationSerializer.class.getName());
    final String inputSerializerClass = properties.getProperty("inputSerializer", serializerClass);
    final String inputSerializerName = (serializerClass.equals(inputSerializerClass)) ? "serializer" : "inputSerializer";

    final String extension = properties.getProperty("outputExtension", getDefaultExtension(outputFormat));
    final boolean replaceExtension = Boolean.parseBoolean(properties.getProperty("replaceExtension", "false"));
    final boolean continueOnAnnotateError = Boolean.parseBoolean(properties.getProperty("continueOnAnnotateError", "false"));
    final boolean noClobber = Boolean.parseBoolean(properties.getProperty("noClobber", "false"));
    // final boolean randomize = Boolean.parseBoolean(properties.getProperty("randomize", "false"));

    final MutableInteger totalProcessed = new MutableInteger(0);
    final MutableInteger totalSkipped = new MutableInteger(0);
    final MutableInteger totalErrorAnnotating = new MutableInteger(0);

    // for each file...
    for (final File file : files) {
      // Determine if there is anything to be done....
      if (excludeFiles.contains(file.getName())) {
        logger.err("Skipping excluded file " + file.getName());
        totalSkipped.incValue(1);
        continue;
      }

      //--Get Output File Info
      // (filename)
      String outputDir = baseOutputDir;
      if (baseInputDir != null) {
        // Get input file name relative to base
        String relDir = file.getParent().replaceFirst(Pattern.quote(baseInputDir), "");
        outputDir = outputDir + File.separator + relDir;
      }
      // Make sure output directory exists
      new File(outputDir).mkdirs();
      String outputFilename = new File(outputDir, file.getName()).getPath();
      if (replaceExtension) {
        int lastDot = outputFilename.lastIndexOf('.');
        // for paths like "./zzz", lastDot will be 0
        if (lastDot > 0) {
          outputFilename = outputFilename.substring(0, lastDot);
        }
      }
      // ensure we don't make filenames with doubled extensions like .xml.xml
      if (!outputFilename.endsWith(extension)) {
        outputFilename += extension;
      }
      // normalize filename for the upcoming comparison
      outputFilename = new File(outputFilename).getCanonicalPath();

      //--Conditions For Skipping The File
      // TODO this could fail if there are softlinks, etc. -- need some sort of sameFile tester
      //      Java 7 has a Files.isSymbolicLink(file) method
      if (outputFilename.equals(file.getCanonicalPath())) {
        logger.err("Skipping " + file.getName() + ": output file " + outputFilename + " has the same filename as the input file -- assuming you don't actually want to do this.");
        totalSkipped.incValue(1);
        continue;
      }
      if (noClobber && new File(outputFilename).exists()) {
        logger.err("Skipping " + file.getName() + ": output file " + outputFilename + " as it already exists. Don't use the noClobber option to override this.");
        totalSkipped.incValue(1);
        continue;
      }

      final String finalOutputFilename = outputFilename;

      // register a task...
      // catching exceptions...
      try {
        // Check whether this file should be skipped again
        if (noClobber && new File(finalOutputFilename).exists()) {
          logger.err("Skipping " + file.getName() + ": output file " + finalOutputFilename + " as it already exists. Don't use the noClobber option to override this.");
          synchronized (totalSkipped) {
            totalSkipped.incValue(1);
          }
          // skip just this file; a bare return here would silently abandon all remaining files
          continue;
        }
        logger.info("Processing file " + file.getAbsolutePath() + " ... writing to " + finalOutputFilename);

        //--Process File
        Annotation annotation = null;
        if (file.getAbsolutePath().endsWith(".ser.gz")) {
          // maybe they want to continue processing a partially processed annotation
          try {
            // Create serializers
            if (inputSerializerClass != null) {
              AnnotationSerializer inputSerializer = loadSerializer(inputSerializerClass, inputSerializerName, properties);
              InputStream is = new BufferedInputStream(new FileInputStream(file));
              Pair<Annotation, InputStream> pair = inputSerializer.read(is);
              pair.second.close();
              annotation = pair.first;
              IOUtils.closeIgnoringExceptions(is);
            } else {
              annotation = IOUtils.readObjectFromFile(file);
            }
          } catch (IOException e) {
            // guess that's not what they wanted
            // We hide IOExceptions because ones such as file not
            // found will be thrown again in a moment. Note that
            // we are intentionally letting class cast exceptions
            // and class not found exceptions go through.
          } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
          }
        }

        // (read file)
        if (annotation == null) {
          String encoding = properties.getProperty("encoding", "UTF-8");
          String text = IOUtils.slurpFile(file.getAbsoluteFile(), encoding);
          annotation = new Annotation(text);
          annotation.set(CoreAnnotations.DocIDAnnotation.class, file.getName());
        }

        Timing timing = new Timing();
        annotate.accept(annotation, finishedAnnotation -> {
          timing.done(logger, "Annotating file " + file.getAbsoluteFile());
          Throwable ex = finishedAnnotation.get(CoreAnnotations.ExceptionAnnotation.class);
          if (ex == null) {
            try {
              //--Output File
              OutputStream fos = new BufferedOutputStream(new FileOutputStream(finalOutputFilename));
              print.accept(finishedAnnotation, fos);
              fos.close();
            } catch (IOException e) {
              throw new RuntimeIOException(e);
            }

            synchronized (totalProcessed) {
              totalProcessed.incValue(1);
              if (totalProcessed.intValue() % 1000 == 0) {
                logger.info("Processed " + totalProcessed + " documents");
              }
              // check we've processed or errored on every file, handle tasks to run after last document
              if ((totalProcessed.intValue() + totalErrorAnnotating.intValue()) == files.size()) {
                // clear pool if necessary
                if (clearPool) GLOBAL_ANNOTATOR_CACHE.clear();
                // print out timing info
                if (TIME && pipeline.isPresent() && tim.isPresent()) logTimingInfo(pipeline.get(), tim.get());
              }
            }
          } else if (continueOnAnnotateError) {
            // Error annotating but still wanna continue
            // (maybe in the middle of long job and maybe next one will be okay)
            logger.err("Error annotating " + file.getAbsoluteFile() + ": " + ex);
            synchronized (totalErrorAnnotating) {
              totalErrorAnnotating.incValue(1);
              // check we've processed or errored on every file, handle tasks to run after last document
              if ((totalProcessed.intValue() + totalErrorAnnotating.intValue()) == files.size()) {
                // clear pool if necessary
                if (clearPool) GLOBAL_ANNOTATOR_CACHE.clear();
                // print out timing info
                if (TIME && pipeline.isPresent() && tim.isPresent()) logTimingInfo(pipeline.get(), tim.get());
              }
            }
          } else {
            // if stopping due to error, make sure to clear the pool
            if (clearPool) {
              GLOBAL_ANNOTATOR_CACHE.clear();
            }
            throw new RuntimeException("Error annotating " + file.getAbsoluteFile(), ex);
          }
        });
      } catch (IOException e) {
        throw new RuntimeIOException(e);
      }
    }
  }

  public void processFiles(final Collection<File> files, int numThreads, boolean clearPool, Optional<Timing> tim) throws IOException {
    processFiles(null, files, numThreads, clearPool, tim);
  }

  public void processFiles(final Collection<File> files, boolean clearPool, Optional<Timing> tim) throws IOException {
    processFiles(files, 1, clearPool, tim);
  }

  public void run() throws IOException {
    run(false);
  }

  public void run(boolean clearPool) throws IOException {
    Timing tim = new Timing();
    StanfordRedwoodConfiguration.minimalSetup();

    // multithreading thread count
    String numThreadsString = this.properties.getProperty("threads");
    int numThreads = 1;
    try {
      if (numThreadsString != null) {
        numThreads = Integer.parseInt(numThreadsString);
      }
    } catch (NumberFormatException e) {
      logger.err("-threads [number]: was not given a valid number: " + numThreadsString);
    }

    // blank line after all the loading statements to make output more readable
    logger.info("");

    //
    // Process one file or a directory of files
    //
    if (properties.containsKey("file") || properties.containsKey("textFile")) {
      String fileName = properties.getProperty("file");
      if (fileName == null) {
        fileName = properties.getProperty("textFile");
      }
      Collection<File> files = new FileSequentialCollection(new File(fileName), properties.getProperty("extension"), true);
      this.processFiles(null, files, numThreads, clearPool, Optional.of(tim));
    }
    //
    // Process a list of files
    //
    else if (properties.containsKey("filelist")) {
      String fileName = properties.getProperty("filelist");
      Collection<File> inputFiles = readFileList(fileName);
      Collection<File> files = new ArrayList<>(inputFiles.size());
      for (File file : inputFiles) {
        if (file.isDirectory()) {
          // expand a directory entry into the files it contains (rather than re-reading the filelist path)
          files.addAll(new FileSequentialCollection(file, properties.getProperty("extension"), true));
        } else {
          files.add(file);
        }
      }
      this.processFiles(null, files, numThreads, clearPool, Optional.of(tim));
    }
    //
    // Run as a filter or as the interactive shell, depending on whether we are attached to a console
    //
    else {
      this.shell();
    }

    // clear the pool if not running in multi-thread mode
    if (clearPool && numThreads == 1) {
      pool.clear();
    }
  }

  /**
   * This can be used just for testing or for command-line text processing.
   * This runs the pipeline you specify on the
   * text in the file that you specify and sends some results to stdout.
   * The current code in this main method assumes that each line of the file
   * is to be processed separately as a single sentence.
   * <p>
   * Example usage:<br>
   * java -mx6g edu.stanford.nlp.pipeline.StanfordCoreNLP properties
   *
   * @param args List of required properties
   * @throws java.io.IOException If IO problem
   */
  public static void main(String[] args) throws IOException {
    //
    // process the arguments
    //
    // Extract all the properties from the command line.
    // As well as command-line properties, the processor will search for the properties file in the classpath
    Properties props = new Properties();
    if (args.length > 0) {
      props = StringUtils.argsToProperties(args);
      String helpValue = props.getProperty("h", props.getProperty("help"));
      if (helpValue != null) {
        printHelp(System.err, helpValue);
        return;
      }
    }

    // Run the pipeline
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.run(true);
  }

}