Java tutorial
/*
 *  POSTagger.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  HepTag was originally written by Mark Hepple, this version contains
 *  modifications by Valentin Tablan and Niraj Aswani.
 *
 *  $Id$
 */

/*
 * INSTRUCTIONS for STAND-ALONE USE
 *
 * SYNOPSIS
 *     java hepple.postag.POSTagger [options] file1 [file2 ...]
 * OPTIONS:
 *     -h, --help : displays this message
 *     -l, --lexicon <lexicon file> : uses specified lexicon
 *     -r, --rules <rules file> : uses specified rules
 *
 * NOTE: requires gnu.getopt package
 */

/**
 * Title:        HepTag
 * Description:  Mark Hepple's POS tagger
 * Copyright:    Copyright (c) 2001
 * Company:      University of Sheffield
 * @author Mark Hepple
 * @version 1.0
 */
package hepple.postag;

import gate.util.BomStrippingInputStreamReader;
import gnu.getopt.Getopt;
import gnu.getopt.LongOpt;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;

/**
 * A Java POS Tagger
 *
 * Author: Mark Hepple (hepple@dcs.shef.ac.uk)
 *
 * Input:  An ascii text file in "Brill input format", i.e. one
 *         sentence per line, tokens separated by spaces.
 *
 * Output: Same text with each token tagged, i.e. "token" -> "token/tag".
 *         Output is just streamed to std-output, so commonly will direct
 *         into some target file.
 *
 * Revision: 13/9/00. Version 1.0.
 *
 * Comments:
 *
 * Implements a version of the decision list based tagging method
 * described in:
 *
 * M. Hepple. 2000. Independence and Commitment: Assumptions for Rapid
 * Training and Execution of Rule-based Part-of-Speech Taggers.
 * Proceedings of the 38th Annual Meeting of the Association for
 * Computational Linguistics (ACL-2000). Hong Kong, October 2000.
 *
 * Modified by Niraj Aswani/Ian Roberts to allow explicit specification of the
 * character encoding to use when reading rules and lexicon files.
 *
 * Tagging mutates the public window buffers of the instance
 * ({@link #wordBuff}, {@link #tagBuff}, {@link #lexBuff}), so one instance
 * should not be driven from several threads at the same time.
 *
 * $Id$
 *
 */
public class POSTagger {

  /** Number of slots in the sliding context window. */
  private static final int WINDOW_SIZE = 7;

  /** Index at which a new word enters the window. */
  private static final int LAST = WINDOW_SIZE - 1;

  /** Index of the word the rules are applied to. */
  private static final int MIDDLE = WINDOW_SIZE / 2;

  /** Suffixes that make an out-of-lexicon word default to "JJ". */
  private static final String[] ADJECTIVE_SUFFIXES = {
    "ed", "us", "ic", "ble", "ive", "ary", "ful", "ical", "less"
  };

  /**
   * The loaded rules, grouped by each rule's <code>from</code> value. The
   * list order is the order in which the rules appear in the rules file.
   */
  protected Map<String, List<Rule>> rules;

  Lexicon lexicon;

  /** Encoding used to read the rules file; null means "not specified". */
  private final String encoding;

  /** Padding token that fills the window before and after a sentence. */
  static final String staart = "STAART";

  private final String[] staartLex = { staart };
  private final String[] deflex_NNP = { "NNP" };
  private final String[] deflex_JJ = { "JJ" };
  private final String[] deflex_CD = { "CD" };
  private final String[] deflex_NNS = { "NNS" };
  private final String[] deflex_RB = { "RB" };
  private final String[] deflex_VBG = { "VBG" };
  private final String[] deflex_NN = { "NN" };

  /** The words currently in the window (position 0 is the oldest). */
  public String[] wordBuff = { staart, staart, staart, staart, staart, staart, staart };

  /** The tag currently assigned to each word in the window. */
  public String[] tagBuff = { staart, staart, staart, staart, staart, staart, staart };

  /** The candidate categories of each word in the window. */
  public String[][] lexBuff = { staartLex, staartLex, staartLex, staartLex, staartLex,
      staartLex, staartLex };

  /**
   * Construct a POS tagger using the platform's native encoding to read the
   * lexicon and rules files.
   *
   * @param lexiconURL location of the lexicon
   * @param rulesURL location of the rules file
   * @throws InvalidRuleException if a line of the rules file cannot be turned
   *         into a rule
   * @throws IOException if a resource cannot be read
   */
  public POSTagger(URL lexiconURL, URL rulesURL) throws InvalidRuleException, IOException {
    this(lexiconURL, rulesURL, null);
  }

  /**
   * Construct a POS tagger using the specified encoding to read the lexicon
   * and rules files.
   *
   * @param lexiconURL location of the lexicon
   * @param rulesURL location of the rules file
   * @param encoding the character encoding name, or null for no explicit
   *        encoding
   * @throws InvalidRuleException if a line of the rules file cannot be turned
   *         into a rule
   * @throws IOException if a resource cannot be read
   */
  public POSTagger(URL lexiconURL, URL rulesURL, String encoding)
      throws InvalidRuleException, IOException {
    this.encoding = encoding;
    this.lexicon = new Lexicon(lexiconURL, encoding);
    rules = new HashMap<String, List<Rule>>();
    readRules(rulesURL);
  }

  /**
   * Creates a new rule of the required type according to the provided ID.
   * The rule class is looked up reflectively as
   * <code>hepple.postag.rules.Rule_&lt;ruleId&gt;</code>.
   *
   * @param ruleId the ID for the rule to be created
   * @return a fresh, not yet initialised rule instance
   * @throws InvalidRuleException if the class cannot be found or instantiated
   */
  public Rule createNewRule(String ruleId) throws InvalidRuleException {
    try {
      String className = "hepple.postag.rules.Rule_" + ruleId;
      Class<?> ruleClass = Class.forName(className);
      return (Rule) ruleClass.newInstance();
    } catch (Exception e) {
      throw new InvalidRuleException("Could not create rule " + ruleId + "!\n" + e.toString());
    }
  }

  /**
   * Runs the tagger over a set of sentences.
   *
   * @param sentences a {@link java.util.List} of {@link java.util.List}s
   * of words to be tagged. Each list is a sentence represented as a list of
   * words.
   * @return a {@link java.util.List} of {@link java.util.List}s of
   * {@link java.lang.String}[]. A list of tagged sentences, each sentence
   * being itself a list having pairs of strings as elements with
   * the word on the first position and the tag on the second.
   */
  public List<List<String[]>> runTagger(List<List<String>> sentences) {
    List<List<String[]>> output = new ArrayList<List<String[]>>();
    for (List<String> sentence : sentences) {
      List<String[]> taggedSentence = new ArrayList<String[]>();
      for (String word : sentence) {
        oneStep(word, taggedSentence);
      }
      // A word leaves the window six steps after it entered, so six padding
      // tokens flush every remaining word of the sentence out of the buffer.
      for (int i = 0; i < WINDOW_SIZE - 1; i++) {
        oneStep(staart, taggedSentence);
      }
      output.add(taggedSentence);
    }
    return output;
  }

  /**
   * Adds a new word to the window of 7 words (on the last position) and tags
   * the word currently in the middle (i.e. on position 3). This function
   * also reads the word on the first position and adds its tag to the
   * taggedSentence structure as this word would be lost at the next advance.
   * If this word completes a sentence then it returns true otherwise it
   * returns false.
   *
   * @param word the new word
   * @param taggedSentence a List of pairs of strings representing the results
   * of tagging the current sentence so far.
   * @return returns true if a full sentence is now tagged, otherwise false.
   */
  protected boolean oneStep(String word, List<String[]> taggedSentence) {
    // Slide the window one slot to the left ...
    System.arraycopy(wordBuff, 1, wordBuff, 0, LAST);
    System.arraycopy(tagBuff, 1, tagBuff, 0, LAST);
    System.arraycopy(lexBuff, 1, lexBuff, 0, LAST);

    // ... and put the new word in the freed last slot. Its initial tag is the
    // first of its candidate categories.
    wordBuff[LAST] = word;
    lexBuff[LAST] = classifyWord(word);
    tagBuff[LAST] = lexBuff[LAST][0];

    // Try the rules registered for the first category of the middle word, in
    // file order, stopping at the first one that reports it applied. It may
    // be the case that no rule applies.
    List<Rule> rulesToApply = rules.get(lexBuff[MIDDLE][0]);
    if (rulesToApply != null) {
      for (Rule rule : rulesToApply) {
        if (rule.apply(this)) {
          break;
        }
      }
    }

    // Emit the word about to fall out of the window.
    // NOTE(review): the comparisons against staart are by reference, as in
    // the original code. That way only the padding object itself is skipped,
    // and an input token that merely spells "STAART" is still emitted.
    String taggedWord = wordBuff[0];
    if (taggedWord != staart) {
      taggedSentence.add(new String[] { taggedWord, tagBuff[0] });
      if (wordBuff[1] == staart) {
        // the emitted word was the last one of its sentence
        return true;
      }
    }
    return false;
  }

  /**
   * Reads the rules from the rules input file. Each line is split on
   * whitespace; the third field names the rule type and the whole field list
   * is handed to the new rule's <code>initialise</code> method. Rules are
   * appended to {@link #rules} under their <code>from</code> value.
   *
   * @param rulesURL location of the rules file
   * @throws IOException if the file cannot be read
   * @throws InvalidRuleException if a line has fewer than three fields or its
   *         rule cannot be created
   */
  @SuppressWarnings("resource")
  public void readRules(URL rulesURL) throws IOException, InvalidRuleException {
    BufferedReader rulesReader = null;
    try {
      if (encoding == null) {
        rulesReader = new BomStrippingInputStreamReader(rulesURL.openStream());
      } else {
        rulesReader = new BomStrippingInputStreamReader(rulesURL.openStream(), this.encoding);
      }

      for (String line = rulesReader.readLine(); line != null; line = rulesReader.readLine()) {
        List<String> ruleParts = tokenise(line);
        if (ruleParts.size() < 3) {
          throw new InvalidRuleException(line);
        }

        Rule newRule = createNewRule(ruleParts.get(2));
        newRule.initialise(ruleParts);

        List<Rule> existingRules = rules.get(newRule.from);
        if (existingRules == null) {
          existingRules = new ArrayList<Rule>();
          rules.put(newRule.from, existingRules);
        }
        existingRules.add(newRule);
      }
    } finally {
      IOUtils.closeQuietly(rulesReader);
    }
  }

  /** Prints the loaded rule map to standard output. */
  public void showRules() {
    System.out.println(rules);
  }

  /**
   * Returns the candidate categories for a word. The padding token maps to
   * itself and a word found in the lexicon gets its lexicon categories. Any
   * other word is guessed from its surface form, first match wins:
   * leading capital A-Z gives NNP, an interior hyphen gives JJ, any digit
   * gives CD, an adjective-like suffix gives JJ, then final "s" gives NNS,
   * "ly" gives RB, "ing" gives VBG, and everything else is NN.
   *
   * @param wd the word to be classified
   * @return a non-empty array of categories; the first one is used as the
   *         word's initial tag
   */
  protected String[] classifyWord(String wd) {
    if (staart.equals(wd)) {
      return staartLex;
    }

    // An empty category list is treated like a missing entry, because the
    // caller reads element 0 of the result unconditionally.
    List<String> categories = lexicon.get(wd);
    if (categories != null && !categories.isEmpty()) {
      return categories.toArray(new String[categories.size()]);
    }

    // No lexical entry for the word. Try to guess.
    final int length = wd.length();

    // Guarded so that an empty token falls through to the default instead of
    // failing on charAt(0).
    if (length > 0) {
      char first = wd.charAt(0);
      if ('A' <= first && first <= 'Z') {
        return deflex_NNP;
      }
    }

    // A hyphen counts only strictly inside the word.
    for (int i = 1; i < length - 1; i++) {
      if (wd.charAt(i) == '-') {
        return deflex_JJ;
      }
    }

    for (int i = 0; i < length; i++) {
      char c = wd.charAt(i);
      if ('0' <= c && c <= '9') {
        return deflex_CD;
      }
    }

    for (String suffix : ADJECTIVE_SUFFIXES) {
      if (wd.endsWith(suffix)) {
        return deflex_JJ;
      }
    }

    if (wd.endsWith("s")) {
      return deflex_NNS;
    }
    if (wd.endsWith("ly")) {
      return deflex_RB;
    }
    if (wd.endsWith("ing")) {
      return deflex_VBG;
    }
    return deflex_NN;
  }

  /**
   * Main method. Runs the tagger using the arguments to find the resources
   * to be used for initialisation and the input file.
   *
   * @param args command line: options followed by the files to tag
   */
  public static void main(String[] args) {
    if (args.length == 0) {
      // Nothing to tag: show the usage and stop before loading any resource.
      help();
      return;
    }
    try {
      // --lexicon and --rules take a value, exactly like -l and -r.
      LongOpt[] options = new LongOpt[] {
          new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
          new LongOpt("lexicon", LongOpt.REQUIRED_ARGUMENT, null, 'l'),
          new LongOpt("rules", LongOpt.REQUIRED_ARGUMENT, null, 'r') };
      Getopt getopt = new Getopt("HepTag", args, "hl:r:", options);

      String lexiconUrlString = null;
      String rulesUrlString = null;
      int opt;
      while ((opt = getopt.getopt()) != -1) {
        switch (opt) {
          // -h
          case 'h': {
            help();
            System.exit(0);
            break;
          }
          // -l new lexicon
          case 'l': {
            lexiconUrlString = getopt.getOptarg();
            break;
          }
          // -r new rules
          case 'r': {
            rulesUrlString = getopt.getOptarg();
            break;
          }
          default: {
            System.err.println("Invalid option " + args[getopt.getOptind() - 1] + "!");
            System.exit(1);
          }
        }
      }

      URL lexiconURL = locate(lexiconUrlString, "/hepple/resources/sample_lexicon");
      URL rulesURL = locate(rulesUrlString, "/hepple/resources/sample_ruleset.big");
      POSTagger tagger = new POSTagger(lexiconURL, rulesURL);

      // Everything from the option index onwards is an input file.
      for (int i = getopt.getOptind(); i < args.length; i++) {
        tagFile(tagger, args[i]);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Turns a user supplied file path into a URL, or falls back to a resource
   * bundled on the class path when no path was given.
   */
  private static URL locate(String path, String defaultResource) throws IOException {
    if (path == null) {
      return POSTagger.class.getResource(defaultResource);
    }
    return new File(path).toURI().toURL();
  }

  /**
   * Tags one input file line by line, treating each line as one sentence,
   * and prints every tagged sentence to standard output.
   */
  private static void tagFile(POSTagger tagger, String file) throws IOException {
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new FileReader(file));
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        List<List<String>> sentences = new ArrayList<List<String>>();
        sentences.add(tokenise(line));
        for (List<String[]> tagged : tagger.runTagger(sentences)) {
          printSentence(tagged);
        }
      }
    } finally {
      IOUtils.closeQuietly(reader);
    }
  }

  /**
   * Prints one tagged sentence as space separated "word/tag" pairs followed
   * by a line break. An empty sentence prints nothing at all.
   */
  private static void printSentence(List<String[]> sentence) {
    for (Iterator<String[]> it = sentence.iterator(); it.hasNext();) {
      String[] pair = it.next();
      System.out.print(pair[0] + "/" + pair[1]);
      if (it.hasNext()) {
        System.out.print(" ");
      } else {
        System.out.println();
      }
    }
  }

  /** Splits a line into its whitespace separated tokens. */
  private static List<String> tokenise(String line) {
    List<String> parts = new ArrayList<String>();
    StringTokenizer tokens = new StringTokenizer(line);
    while (tokens.hasMoreTokens()) {
      parts.add(tokens.nextToken());
    }
    return parts;
  }

  /**
   * Prints the help message
   */
  private static void help() {
    System.out.println("NAME\n"
        + "HepTag - a Part-of-Speech tagger\n"
        + "see http://www.dcs.shef.ac.uk/~hepple/papers/acl00/abstract.html \n\n"
        + "SYNOPSIS\n\tjava hepple.postag.POSTagger [options] file1 [file2 ...]\n\n"
        + "OPTIONS:\n"
        + "-h, --help \n\tdisplays this message\n"
        + "-l, --lexicon <lexicon file>\n\tuses specified lexicon\n"
        + "-r, --rules <rules file>\n\tuses specified rules");
  }

  /**
   * Reads one input file and creates the structure needed by the tagger
   * for input: one list of tokens per line of the file.
   */
  @SuppressWarnings("unused")
  private static List<List<String>> readInput(String file) throws IOException {
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new FileReader(file));
      List<List<String>> result = new ArrayList<List<String>>();
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        result.add(tokenise(line));
      }
      return result;
    } finally {
      IOUtils.closeQuietly(reader);
    }
  }
}