// Java tutorial
/* * $Id: Pep.java 1812 2010-02-08 22:06:32Z scott $ * Copyright (C) 2007 Scott Martin * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the * Free Software Foundation; either version 2.1 of the License, or (at your * option) any later version. The GNU Lesser General Public License is * distributed with this software in the file COPYING. */ package edu.osu.ling.pep; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.EnumMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Scanner; import java.util.Set; import java.util.regex.Pattern; import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamSource; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import javax.xml.validation.Validator; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import com.google.common.base.Objects; /** * Command line front end for {@link EarleyParser Earley parsers}. 
* <p> * In addition to tokenizing input strings and loading the {@link Grammar} that * a parser will use, Pep also a {@link ParserListener listens} for events from * the Earley parsers it invokes. An example invocation might be something like: * <ol> * <li> * <code>pep -g samples/miniscule.xml -s S the boy left -v</code></li> * <li> * <code>echo "the boy left" | pep -g samples/miniscule.xml -v 2 -s S - * </code></li> * </ol> * * The above commands cause Pep to create an Earley parser using the grammar * specified in the file <code>samples/miniscule.xml</code> for the * {@link Parse#getSeed() seed category} <code>S</code> and the input string * "<code>the boy left</code>". Example (2) shows how to configure Pep * to read input from the standard input stream. For either of these commands, * Pep prints out the result of the parse as well as all {@link ParseTree parse * trees} for the specified string. * </p> * * @author <a href="http://www.ling.osu.edu/~scott/">Scott Martin</a> * @version $LastChangedRevision: 1812 $ * @see EarleyParser */ public class Pep implements ParserListener { static final float VERSION = 0.4f; static final int V_ALL = 0, V_RECOGNITION = 0, V_WARN = 1, V_PARSE = 1, V_CHART = 2, V_GRAMMAR = 3, V_STATS = 3, V_DEBUG = 3; static int verbosity; public static void setVerbosity(final int verbosity) { Pep.verbosity = verbosity; } EarleyParser earleyParser; Map<ParserOption, Boolean> parserOptions; long lastParseStart; public Parse parse; /** * Do not allow this class to be instantiated except by its own * {@link Pep#main(String[]) main} method. */ public Pep(final Map<ParserOption, Boolean> parserOptions) { this.parserOptions = parserOptions; } /** * Parses a string (list of tokens) using the specified grammar and seed * category. * * @param grammar * The grammar to use in parsing. * @param tokens * The string to parse. * @param seed * The category to seed the parser with. 
*/ synchronized public Parse parse(final Grammar grammar, final List<String> tokens, final Category seed) throws PepException { if (earleyParser == null) { earleyParser = new EarleyParser(grammar, this); if (parserOptions != null) { // configure parser if options present for (final Map.Entry<ParserOption, Boolean> entry : parserOptions.entrySet()) { earleyParser.setOption(entry.getKey(), entry.getValue()); } } } if (Pep.verbosity >= Pep.V_STATS) { Pep.printMessage("Parsing " + tokens + " for category " + seed, Pep.V_STATS); } return earleyParser.parse(tokens, seed); } /** * Consumes events generated when options are set on the parser. */ public void optionSet(final ParserOptionEvent optionEvent) { if (Pep.verbosity >= Pep.V_DEBUG) { Pep.printMessage("Option set: " + optionEvent.option.name() + "=" + optionEvent.value, Pep.V_DEBUG); } } /** * Consumes events generated when the parser is seeded. */ public void parserSeeded(final EdgeEvent edgeEvent) { Pep.printParser(edgeEvent.index, "seed ", edgeEvent.edge); } /** * Consumes events generated when edges are added to the parser's chart * because of completion. */ public void edgeCompleted(final EdgeEvent edgeEvent) { Pep.printParser(edgeEvent.index, "complete", edgeEvent.edge); } /** * Consumes events generated when edges are added to the parser's chart * because of prediction. */ public void edgePredicted(final EdgeEvent edgeEvent) { Pep.printParser(edgeEvent.index, "predict ", edgeEvent.edge); } /** * Consumes events generated when the parser scans a token from the input * string. */ public void edgeScanned(final EdgeEvent edgeEvent) { Pep.printParser(edgeEvent.index, "scan ", edgeEvent.edge); } /** * Consumes events generated when the parser completes a parse. 
*/ public void parseComplete(final ParseEvent parseEvent) { final long now = System.currentTimeMillis(); parse = parseEvent.parse; Pep.print("", Pep.V_CHART); // only if other output above Pep.print(parse, Pep.V_RECOGNITION); if (Pep.verbosity >= Pep.V_PARSE) { final Set<ParseTree> parseTrees = parse.getParseTrees(); if (!parseTrees.isEmpty()) { int count = 1; for (final ParseTree pt : parseTrees) { Pep.print(count++ + ". " + pt.toString(), Pep.V_PARSE); } } } if (Pep.verbosity >= Pep.V_STATS) { Pep.printMessage("", Pep.V_STATS); Pep.printMessage("Parse complete: " + parse.chart.countEdges() + " edges added to chart in " + (now - lastParseStart) + " ms", Pep.V_STATS); } } public void parseMessage(final ParseEvent parseEvent, final String message) { Pep.printParser(parseEvent.index, "message ", message); } public void parseError(final ParseErrorEvent parseErrorEvent) throws PepException { Pep.printParser(parseErrorEvent.index, "error ", parseErrorEvent.cause); } /** * Prints an object to System.out. * * @param line * The object to print out. * @param requiredVerbosity * The required verbosity level for this message to actually be * printed out. */ private static void print(final Object line, final int requiredVerbosity) { if (Pep.verbosity >= requiredVerbosity) { System.out.println(line); } } /** * Prints an object to System.err. * * @param line * The object to print out. * @param requiredVerbosity * The required verbosity level for this message to actually be * printed out. */ static void printMessage(final String message, final int requiredVerbosity) { if (Pep.verbosity >= requiredVerbosity) { System.err.println(message); } } /** * Prints an error message to System.err. */ private static void printError(final String msg) { Pep.printMessage("Error: " + msg, Pep.V_ALL); } /** * Prints a throwable. * * @param error * The throwable that was intercepted. 
* @see #printError(String) */ private static void printError(final Throwable error) { if (error instanceof SAXParseException) { final SAXParseException spe = (SAXParseException) error; Pep.printError("line " + spe.getLineNumber() + ": " + spe.getMessage()); } else { String msg = error.getMessage(); final Throwable cause = error.getCause(); if (cause != null && !cause.equals(error)) { msg += ": " + cause.getMessage(); } Pep.printError(msg); } } /** * Prints a warning string. * * @see #print(Object, int) */ static void printWarning(final String msg) { Pep.printMessage("Warning: " + msg, Pep.V_WARN); } /** * Prints usage information for the Pep executable. */ private static void printHelp(final Options options) { final String name = Pep.class.getSimpleName(); Pep.printMessage(name + " is an Earley Parser, version " + Pep.VERSION, Pep.V_ALL); Pep.printMessage("", Pep.V_ALL); Pep.printMessage(name + " is free software, copyright (C) 2007 Scott Martin.", Pep.V_ALL); Pep.printMessage("See the COPYING file for details.", Pep.V_ALL); Pep.printMessage("", Pep.V_ALL); final HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("pep [options] [string] (use - for stdin)", options); Pep.printMessage("", Pep.V_ALL); Pep.printMessage("where the default OPTION=value pairs are:", Pep.V_ALL); for (final ParserOption option : ParserOption.values()) { Pep.printMessage(' ' + option.toString(), Pep.V_ALL); } } /** * Auxiliary method used by event handlers to print out information about * what the parser is doing in a readable format. 
*/ private static void printParser(final int index, final String label, final Object obj) { if (Pep.verbosity >= Pep.V_CHART) { final StringBuilder sb = new StringBuilder(index + ": " + label + "\t"); if (obj instanceof String) { sb.append('\''); sb.append(obj); sb.append('\''); } else if (obj instanceof Throwable) { sb.append(((Throwable) obj).getMessage()); } else { sb.append(obj); } Pep.print(sb.toString(), Pep.V_CHART); } } /** * Invokes Pep from the command line. * <p> * The main work this method does, apart from tokenizing the arguments and * input tokens, is to load and parse the XML grammar file (as specified by * <code>-g</code> or <code>--grammar</code>). If any of the arguments * <code>-g</code>, <code>--grammar</code>, <code>-s</code>, * <code>--seed</code>, <code>-o</code>, <code>--option</code>, occur with * no argument following, this method prints an error notifying the user. * * @param args * The expected arguments are as follows, and can occur in any * particular order: * <ul> * <li><code>-g|--grammar <grammar file></code></li> <li> * <code>-s|--seed <seed category></code></li> <li><code> * -v|--verbose {verbosity level}</code></li> <li><code> * -o|--option <OPTION_NAME=value></code></li> <li><code> * -h|--help (prints usage information)</code></li> <li><code> * <token1 ... token<em>n</em>></code> (or <code>-</code> * for standard input)</li> * </ul> * <code>OPTION_NAME</code> must be the name of one of the * recognized {@link ParserOption options}. If <code>-h</code> or * <code>--help</code> occur anywhere in the arguments, usage * information is printed and no parsing takes place. 
*/ @SuppressWarnings("static-access") public static final void main(final String[] args) { try { final Options opts = new Options(); opts.addOption(OptionBuilder.withLongOpt("grammar").withDescription("the grammar to use").hasArg() .isRequired().withArgName("grammar file").create('g')); opts.addOption(OptionBuilder.withLongOpt("seed").withDescription("the seed category to parse for") .hasArg().isRequired().withArgName("seed category").create('s')); opts.addOption(OptionBuilder.withLongOpt("verbose").withDescription("0-3").hasOptionalArg() .withArgName("verbosity level").create('v')); opts.addOption(OptionBuilder.withLongOpt("option").withDescription("sets parser options") .withArgName("OPTION=value").hasArgs(2).withValueSeparator() .withDescription("use value for given property").create("o")); opts.addOption(OptionBuilder.withLongOpt("help").withDescription("prints this message").create('h')); final CommandLineParser parser = new GnuParser(); try { final CommandLine line = parser.parse(opts, args); if (line.hasOption('h')) { Pep.printHelp(opts); } else { final int v = Integer.parseInt(line.getOptionValue('v', Integer.toString(Pep.V_PARSE))); if (v < 0) { throw new PepException("verbosity < 0: " + v); } Pep.verbosity = v; final Map<ParserOption, Boolean> options = new EnumMap<ParserOption, Boolean>( ParserOption.class); final Properties props = line.getOptionProperties("o"); for (final Object key : props.keySet()) { try { options.put(ParserOption.valueOf(key.toString()), Boolean.valueOf(props.get(key).toString())); } catch (final IllegalArgumentException iae) { Pep.printError("no option named " + key.toString()); Pep.printHelp(opts); return; } } final Pep pep = new Pep(options); // final Grammar grammar = // new GrammarParser(Pep.findGrammar(line // .getOptionValue('g'))).t.parse(); final List<?> ts = line.getArgList(); List<String> tokens = null; if (ts.isEmpty() || ts.get(0).equals("-")) { tokens = Pep.readTokens(new Scanner(System.in)); } else { tokens = new 
ArrayList<String>(ts.size()); for (final Object t : ts) { tokens.add(t.toString()); } } pep.lastParseStart = System.currentTimeMillis(); // try { // pep.parse(grammar, tokens, // new Category(line.getOptionValue('s'))); // } catch (final PepException ignore) { // // ignore here, we're listening // } } } catch (final ParseException pe) { Pep.printError("command-line syntax problem: " + pe.getMessage()); Pep.printHelp(opts); } } catch (final PepException pe) { final Throwable cause = pe.getCause(); Pep.printError((cause == null) ? pe : cause); } catch (final RuntimeException re) { Pep.printError(re); } } /** * Locates the grammar file as specified on the command line. Attempts to * find the file based on the current directory. * * @param grammarLoc * The grammar location string specified on the command line. * @return The located file. * @throws PepException * If the file does not exist or is a directory. */ public static File findGrammar(final String grammarLoc) throws PepException { final File g = new File(System.getProperty("user.dir"), grammarLoc); if (!g.exists()) { throw new PepException("grammar file does not exist"); } if (!g.isFile()) { throw new PepException("specified grammar is not a file"); } return g; } /** * Tokenizes the string input that occurs on the command line, removing * " characters. * * @param args * The strings that occurred after <code>-t</code> or * <code>--tokens</code> on the command line. * @return A list of tokens suitable for use in parsing. 
*/ private static List<String> readTokens(final Iterator<String> args) { final List<String> tokens = new ArrayList<String>(); final Pattern q = Pattern.compile("\""); while (args.hasNext()) { final String t = args.next(); if (t == null) { // let parser deal with this tokens.add(t); } else { if (tokens.isEmpty()) { // it's the first token if (t.startsWith("\"")) { // strip off leading quote tokens.add(q.matcher(t).replaceAll("")); } else { tokens.add(t); } } else { if (t.endsWith("\"")) { // it's the last token // strip off trailing quote tokens.add(q.matcher(t).replaceAll("")); break; // done parsing } tokens.add(t); // no quotes to strip } } } return tokens; } /** * Parses XML grammar files. */ public static class GrammarParser implements ErrorHandler { File grammarFile; DocumentBuilder documentBuilder; /** * Create a grammar parser for the specified file. */ public GrammarParser() throws PepException { this.grammarFile = grammarFile; try { documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); documentBuilder.setErrorHandler(this); } catch (final ParserConfigurationException pce) { throw new PepException("problem instantiating parser", pce); } } /** * Parse the XML file specified when this grammar parser was created, * returning a grammar. *TODO* add categories for case, gender etc. 
*/ public Grammar parse(final String xml) throws PepException { Grammar g = null; try { StringReader sr = new StringReader(xml); final Document d = documentBuilder.parse(new InputSource(sr)); // validate final SchemaFactory sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); final StreamSource src = new StreamSource( getClass().getClassLoader().getResourceAsStream("grammar.xsd")); if (src.getInputStream() == null) { Pep.printMessage("Unable to locate grammar.xsd", Pep.V_ALL); } else { final Schema schema = sf.newSchema(src); final Validator vlad = schema.newValidator(); try { vlad.validate(new DOMSource(d)); } catch (final SAXException se) { throw new PepException("invalid grammar", se); } } final Element root = d.getDocumentElement(); g = new Grammar(root.getAttribute("name")); Pep.printMessage("Loading grammar " + g.name + " from " + "BaseX", Pep.V_GRAMMAR); final NodeList rules = root.getElementsByTagName("rule"); Element ruleEl; Category left; List<Category> right; for (int i = 0; i < rules.getLength(); i++) { ruleEl = (Element) rules.item(i); String addrules = ruleEl.getAttributeNode("attr") != null ? ruleEl.getAttributeNode("attr").getNodeValue() : ""; left = new Category(ruleEl.getAttribute("category"), false, addrules); final NodeList rightList = ruleEl.getChildNodes(); right = new ArrayList<Category>(rightList.getLength()); Node rightNode; Element rightEl; for (int j = 0; j < rightList.getLength(); j++) { rightNode = rightList.item(j); if (rightNode instanceof Element) { rightEl = (Element) rightNode; final Attr termAttr = rightEl.getAttributeNode("terminal"); final Attr attrs = rightEl.getAttributeNode("attr"); right.add(new Category(rightEl.getAttribute("name"), termAttr != null && termAttr.getTextContent().equals("true"), attrs != null ? 
attrs.getTextContent() : "")); } } g.addRule(new Rule(left, right.toArray(new Category[right.size()]))); } } catch (final IllegalArgumentException iae) { throw new PepException("problem loading grammar", iae); } catch (final SAXException se) { throw new PepException("problem parsing", se); } catch (final IOException io) { throw new PepException("problem reading grammar", io); } Pep.printMessage(g.toString(), Pep.V_GRAMMAR); Pep.printMessage("", Pep.V_GRAMMAR); return g; } /** * Does nothing because an exception are generated. Present for binary * compatibility with ErrorHandler. */ public void error(final SAXParseException e) throws SAXException { // do nothing } /** * Does nothing because an exception are generated. Present for binary * compatibility with ErrorHandler. */ public void fatalError(final SAXParseException e) throws SAXException { // do nothing } /** * Event handler for warnings that occur during parsing. Prints a * warning message to System.err. * * @see Pep#printWarning(String) */ public void warning(final SAXParseException e) throws SAXException { Pep.printWarning("line " + e.getLineNumber() + ": " + e.getMessage()); } } }