hepple.postag.POSTagger.java Source code

Introduction

Here is the source code for hepple.postag.POSTagger.java
Source

/*
 *  POSTagger.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  HepTag was originally written by Mark Hepple, this version contains
 *  modifications by Valentin Tablan and Niraj Aswani.
 *
 *  $Id$
 */

/*
 * INSTRUCTIONS for STAND-ALONE USE
 *
 * SYNOPSIS
 *     java hepple.postag.POSTagger [options] file1 [file2 ...]
 * OPTIONS:
 *     -h, --help : displays this message
 *     -l, --lexicon <lexicon file> : uses specified lexicon
 *     -r, --rules <rules file> : uses specified rules
 *
 * NOTE: requires gnu.getopt package
 */

/**
 * Title:        HepTag
 * Description:  Mark Hepple's POS tagger
 * Copyright:    Copyright (c) 2001
 * Company:      University of Sheffield
 * @author Mark Hepple
 * @version 1.0
 */
package hepple.postag;

import gate.util.BomStrippingInputStreamReader;
import gnu.getopt.Getopt;
import gnu.getopt.LongOpt;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;

/**
 * A Java POS Tagger
 *
 * Author: Mark Hepple (hepple@dcs.shef.ac.uk)
 *
 * Input:  An ascii text file in "Brill input format", i.e. one
 *        sentence per line, tokens separated by spaces.
 *
 * Output: Same text with each token tagged, i.e. "token" -> "token/tag".
 *        Output is just streamed to std-output, so commonly will direct
 *        into some target file.
 *
 * Revision: 13/9/00. Version 1.0.
 *
 * Comments:
 *
 * Implements a version of the decision list based tagging method
 * described in:
 *
 * M. Hepple. 2000. Independence and Commitment: Assumptions for Rapid
 * Training and Execution of Rule-based Part-of-Speech Taggers.
 * Proceedings of the 38th Annual Meeting of the Association for
 * Computational Linguistics (ACL-2000). Hong Kong, October 2000.
 *
 * Modified by Niraj Aswani/Ian Roberts to allow explicit specification of the
 * character encoding to use when reading rules and lexicon files.
 *
 * $Id$
 *
 */

public class POSTagger {

    /** Number of slots in the sliding window of words, tags and lexical entries. */
    private static final int WINDOW_SIZE = 7;

    /** Index of the last window slot, where each new word enters. */
    private static final int NEWEST = WINDOW_SIZE - 1;

    /** Index of the window slot whose word has rules applied to it on each step. */
    private static final int CURRENT = 3;

    /**
     * Rules grouped by the value of their {@code from} field. The key is
     * matched against the first lexical category of the word currently in
     * the middle of the window.
     */
    protected Map<String, List<Rule>> rules;

    /** Lexicon consulted first when classifying a word. */
    Lexicon lexicon;

    /** Encoding used to read the rules file; {@code null} means no explicit encoding. */
    private final String encoding;

    /**
     * Sentinel used to pad the window. The tagger recognises padding by
     * object identity with this constant (see {@link #oneStep}).
     */
    static final String staart = "STAART";

    // Shared single-category lexical entries returned by classifyWord.
    private final String[] staartLex = { staart };
    private final String[] deflex_NNP = { "NNP" };
    private final String[] deflex_JJ = { "JJ" };
    private final String[] deflex_CD = { "CD" };
    private final String[] deflex_NNS = { "NNS" };
    private final String[] deflex_RB = { "RB" };
    private final String[] deflex_VBG = { "VBG" };
    private final String[] deflex_NN = { "NN" };

    /** Words currently in the window, oldest first. */
    public String[] wordBuff = { staart, staart, staart, staart, staart, staart, staart };

    /** Tag currently assigned to each word in the window. */
    public String[] tagBuff = { staart, staart, staart, staart, staart, staart, staart };

    /** Lexical categories of each word in the window. */
    public String[][] lexBuff = { staartLex, staartLex, staartLex, staartLex, staartLex, staartLex, staartLex };

    /**
     * Construct a POS tagger without specifying an encoding for the lexicon
     * and rules files.
     *
     * @param lexiconURL location of the lexicon
     * @param rulesURL location of the rules file
     * @throws InvalidRuleException if the rules file contains a malformed rule
     * @throws IOException if a resource cannot be read
     */
    public POSTagger(URL lexiconURL, URL rulesURL) throws InvalidRuleException, IOException {
        this(lexiconURL, rulesURL, null);
    }

    /**
     * Construct a POS tagger using the specified encoding to read the lexicon
     * and rules files.
     *
     * @param lexiconURL location of the lexicon
     * @param rulesURL location of the rules file
     * @param encoding name of the character encoding, or {@code null} to not
     * specify one
     * @throws InvalidRuleException if the rules file contains a malformed rule
     * @throws IOException if a resource cannot be read
     */
    public POSTagger(URL lexiconURL, URL rulesURL, String encoding) throws InvalidRuleException, IOException {
        this.encoding = encoding;
        this.lexicon = new Lexicon(lexiconURL, encoding);
        this.rules = new HashMap<String, List<Rule>>();
        readRules(rulesURL);
    }

    /**
     * Creates a new rule of the required type according to the provided ID.
     * The implementing class is looked up by name as
     * {@code hepple.postag.rules.Rule_<ruleId>} and instantiated through its
     * no-argument constructor.
     *
     * @param ruleId the ID for the rule to be created
     * @return a fresh, uninitialised rule instance
     * @throws InvalidRuleException if the class cannot be found or instantiated
     */
    public Rule createNewRule(String ruleId) throws InvalidRuleException {
        try {
            String className = "hepple.postag.rules.Rule_" + ruleId;
            Class<?> ruleClass = Class.forName(className);
            // Class.newInstance() is deprecated; go through the constructor explicitly.
            return (Rule) ruleClass.getDeclaredConstructor().newInstance();
        } catch (Exception e) {
            throw new InvalidRuleException("Could not create rule " + ruleId + "!\n" + e.toString());
        }
    }

    /**
     * Runs the tagger over a set of sentences.
     *
     * <p>The window buffers are instance fields that every step mutates
     * without synchronisation, so one instance must not be used by several
     * threads at the same time.
     *
     * @param sentences a {@link java.util.List} of {@link java.util.List}s
     * of words to be tagged. Each list is a sentence represented as a list of
     * words.
     * @return a {@link java.util.List} of {@link java.util.List}s of
     * {@link java.lang.String}[]. A list of tagged sentences, each sentence
     * being itself a list having pairs of strings as elements with
     * the word on the first position and the tag on the second.
     */
    public List<List<String[]>> runTagger(List<List<String>> sentences) {
        List<List<String[]>> output = new ArrayList<List<String[]>>();
        for (List<String> sentence : sentences) {
            List<String[]> taggedSentence = new ArrayList<String[]>();
            for (String word : sentence) {
                oneStep(word, taggedSentence);
            }
            // A word entering the last slot needs WINDOW_SIZE - 1 further
            // steps to reach slot 0, so push that many sentinels to flush
            // the whole sentence out of the window.
            for (int i = 0; i < WINDOW_SIZE - 1; i++) {
                oneStep(staart, taggedSentence);
            }
            output.add(taggedSentence);
        }
        return output;
    }

    /**
     * Adds a new word to the window of 7 words (on the last position) and
     * applies the rules to the word currently in the middle (i.e. on
     * position 3). The word that has reached the first position is appended,
     * with its tag, to {@code taggedSentence}, as it would be lost at the
     * next advance.
     *
     * @param word the new word
     * @param taggedSentence a List of pairs of strings representing the results
     * of tagging the current sentence so far.
     * @return {@code true} if the word just emitted is immediately followed
     * by the padding sentinel (i.e. it ended a sentence), otherwise
     * {@code false}.
     */
    protected boolean oneStep(String word, List<String[]> taggedSentence) {
        // Shift every buffer one slot to the left, dropping slot 0.
        System.arraycopy(wordBuff, 1, wordBuff, 0, NEWEST);
        System.arraycopy(tagBuff, 1, tagBuff, 0, NEWEST);
        System.arraycopy(lexBuff, 1, lexBuff, 0, NEWEST);

        // The new word enters at the end; its initial tag is its first category.
        wordBuff[NEWEST] = word;
        lexBuff[NEWEST] = classifyWord(word);
        tagBuff[NEWEST] = lexBuff[NEWEST][0];

        // Apply the rules registered for the first category of the middle
        // word: fire the first one that applies and stop. It may be the case
        // that no rule applies.
        List<Rule> rulesToApply = rules.get(lexBuff[CURRENT][0]);
        if (rulesToApply != null) {
            for (Rule rule : rulesToApply) {
                if (rule.apply(this)) {
                    break;
                }
            }
        }

        // Emit the word leaving the window. The comparisons with the sentinel
        // are by identity on purpose: only the padding object itself is
        // skipped, so an input token whose text is "STAART" is still emitted.
        String taggedWord = wordBuff[0];
        if (taggedWord == staart) {
            return false;
        }
        taggedSentence.add(new String[] { taggedWord, tagBuff[0] });
        return wordBuff[1] == staart;
    }

    /**
     * Reads the rules from the rules input file, one rule per line, and adds
     * them to {@link #rules}. Each line is split on whitespace and must have
     * at least three parts; the third part selects the rule class (see
     * {@link #createNewRule(String)}).
     *
     * @param rulesURL location of the rules file
     * @throws IOException if the file cannot be read
     * @throws InvalidRuleException if a line has fewer than three parts or
     * its rule cannot be created
     */
    @SuppressWarnings("resource")
    public void readRules(URL rulesURL) throws IOException, InvalidRuleException {
        BufferedReader rulesReader = null;

        try {
            if (encoding == null) {
                rulesReader = new BomStrippingInputStreamReader(rulesURL.openStream());
            } else {
                rulesReader = new BomStrippingInputStreamReader(rulesURL.openStream(), this.encoding);
            }

            for (String line = rulesReader.readLine(); line != null; line = rulesReader.readLine()) {
                List<String> ruleParts = tokenize(line);
                if (ruleParts.size() < 3) {
                    throw new InvalidRuleException(line);
                }

                Rule newRule = createNewRule(ruleParts.get(2));
                newRule.initialise(ruleParts);

                // Group rules by their "from" value, preserving file order.
                List<Rule> existingRules = rules.get(newRule.from);
                if (existingRules == null) {
                    existingRules = new ArrayList<Rule>();
                    rules.put(newRule.from, existingRules);
                }
                existingRules.add(newRule);
            }
        } finally {
            IOUtils.closeQuietly(rulesReader);
        }
    }

    /**
     * Prints the rules map to standard output.
     */
    public void showRules() {
        System.out.println(rules);
    }

    /**
     * Returns the lexical categories for a word. The padding sentinel maps to
     * its own entry; otherwise the lexicon is consulted, and if it has no
     * entry the category is guessed from the surface form, in this order:
     * initial capital A-Z gives NNP, an interior hyphen gives JJ, any digit
     * gives CD, an adjectival suffix gives JJ, a final "s" gives NNS, a final
     * "ly" gives RB, a final "ing" gives VBG, and anything else gives NN.
     *
     * @param wd the word to be classified
     * @return the categories of the word; never empty for guessed entries
     */
    protected String[] classifyWord(String wd) {
        if (staart.equals(wd)) {
            return staartLex;
        }

        List<String> categories = lexicon.get(wd);
        if (categories != null) {
            return categories.toArray(new String[categories.size()]);
        }

        // No lexical entry for the word. Try to guess.
        // Guard the first-character test: an empty token has no character 0
        // and simply falls through to the default guess.
        if (wd.length() > 0 && 'A' <= wd.charAt(0) && wd.charAt(0) <= 'Z') {
            return deflex_NNP;
        }

        // A hyphen counts only strictly inside the word.
        for (int i = 1; i < wd.length() - 1; i++) {
            if (wd.charAt(i) == '-') {
                return deflex_JJ;
            }
        }

        for (int i = 0; i < wd.length(); i++) {
            if ('0' <= wd.charAt(i) && wd.charAt(i) <= '9') {
                return deflex_CD;
            }
        }

        if (wd.endsWith("ed") || wd.endsWith("us") || wd.endsWith("ic") || wd.endsWith("ble") || wd.endsWith("ive")
                || wd.endsWith("ary") || wd.endsWith("ful") || wd.endsWith("ical") || wd.endsWith("less")) {
            return deflex_JJ;
        }

        if (wd.endsWith("s")) {
            return deflex_NNS;
        }

        if (wd.endsWith("ly")) {
            return deflex_RB;
        }

        if (wd.endsWith("ing")) {
            return deflex_VBG;
        }

        return deflex_NN;
    }

    /**
     * Main method. Runs the tagger using the arguments to find the resources
     * to be used for initialisation and the input files, and prints the
     * tagged text to standard output.
     *
     * @param args command line: {@code [options] file1 [file2 ...]}
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            // Nothing to do: show usage and stop instead of loading resources.
            help();
            return;
        }
        try {
            // --lexicon and --rules take a value, exactly like -l and -r.
            LongOpt[] options = new LongOpt[] { new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
                    new LongOpt("lexicon", LongOpt.REQUIRED_ARGUMENT, null, 'l'),
                    new LongOpt("rules", LongOpt.REQUIRED_ARGUMENT, null, 'r') };
            Getopt getopt = new Getopt("HepTag", args, "hl:r:", options);
            String lexiconUrlString = null;
            String rulesUrlString = null;
            int opt;
            while ((opt = getopt.getopt()) != -1) {
                switch (opt) {
                // -h
                case 'h': {
                    help();
                    System.exit(0);
                    break;
                }
                // -l new lexicon
                case 'l': {
                    lexiconUrlString = getopt.getOptarg();
                    break;
                }
                // -r new rules
                case 'r': {
                    rulesUrlString = getopt.getOptarg();
                    break;
                }
                default: {
                    System.err.println("Invalid option " + args[getopt.getOptind() - 1] + "!");
                    System.exit(1);
                }
                }
            }

            // Everything after the parsed options is an input file name.
            int firstFile = getopt.getOptind();
            String[] fileNames = new String[args.length - firstFile];
            System.arraycopy(args, firstFile, fileNames, 0, fileNames.length);

            URL lexiconURL = (lexiconUrlString == null)
                    ? POSTagger.class.getResource("/hepple/resources/sample_lexicon")
                    : new File(lexiconUrlString).toURI().toURL();

            URL rulesURL = (rulesUrlString == null)
                    ? POSTagger.class.getResource("/hepple/resources/sample_ruleset.big")
                    : new File(rulesUrlString).toURI().toURL();

            POSTagger tagger = new POSTagger(lexiconURL, rulesURL);

            for (String file : fileNames) {
                tagFile(tagger, file);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Tags one input file line by line, treating each line as one sentence,
     * and prints the result to standard output.
     */
    private static void tagFile(POSTagger tagger, String file) throws IOException {
        BufferedReader reader = null;

        try {
            reader = new BufferedReader(new FileReader(file));

            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                List<List<String>> sentences = new ArrayList<List<String>>();
                sentences.add(tokenize(line));
                for (List<String[]> taggedSentence : tagger.runTagger(sentences)) {
                    printTaggedSentence(taggedSentence);
                }
            }
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Prints a tagged sentence as space separated {@code word/tag} pairs
     * followed by a line break. An empty sentence prints nothing.
     */
    private static void printTaggedSentence(List<String[]> taggedSentence) {
        Iterator<String[]> pairs = taggedSentence.iterator();
        while (pairs.hasNext()) {
            String[] pair = pairs.next();
            System.out.print(pair[0] + "/" + pair[1]);
            if (pairs.hasNext()) {
                System.out.print(" ");
            } else {
                System.out.println();
            }
        }
    }

    /**
     * Splits a line into its whitespace separated tokens.
     */
    private static List<String> tokenize(String line) {
        List<String> parts = new ArrayList<String>();
        StringTokenizer tokens = new StringTokenizer(line);
        while (tokens.hasMoreTokens()) {
            parts.add(tokens.nextToken());
        }
        return parts;
    }

    /**
     * Prints the help message
     */
    private static void help() {
        System.out.println("NAME\n" + "HepTag - a Part-of-Speech tagger\n"
                + "see http://www.dcs.shef.ac.uk/~hepple/papers/acl00/abstract.html \n\n"
                + "SYNOPSIS\n\tjava hepple.postag.POSTagger [options] file1 [file2 ...]\n\n" + "OPTIONS:\n"
                + "-h, --help \n\tdisplays this message\n"
                + "-l, --lexicon <lexicon file>\n\tuses specified lexicon\n"
                + "-r, --rules <rules file>\n\tuses specified rules");
    }

    /**
     * Reads one input file and creates the structure needed by the tagger
     * for input: one list of tokens per line of the file.
     */
    @SuppressWarnings("unused")
    private static List<List<String>> readInput(String file) throws IOException {
        BufferedReader reader = null;

        try {
            reader = new BufferedReader(new FileReader(file));

            List<List<String>> result = new ArrayList<List<String>>();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                result.add(tokenize(line));
            }
            return result;
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

}