hepple.postag.Lexicon.java Source code

Java tutorial

Introduction

Here is the source code for hepple.postag.Lexicon.java

Source

/*
 *  Lexicon.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  HepTag was originally written by Mark Hepple, this version contains
 *  modifications by Valentin Tablan and Niraj Aswani.
 *
 *  $Id$
 */
package hepple.postag;

/**
 * Title:        HepTag
 * Description:  Mark Hepple's POS tagger
 * Copyright:    Copyright (c) 2001
 * Company:      University of Sheffield
 * @author Mark Hepple
 * @version 1.0
 */

import gate.util.BomStrippingInputStreamReader;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;

/**
 * A {@link java.util.HashMap} that maps from a lexical entry
 * ({@link java.lang.String}) to its possible POS categories
 * (a {@link java.util.List} of {@link java.lang.String}).
 */
class Lexicon extends HashMap<String, List<String>> {

    private static final long serialVersionUID = -2320126076517881896L;

    /**
     * Loads a lexicon from the given URL without naming a character
     * encoding; equivalent to {@code new Lexicon(lexiconURL, null)}.
     *
     * @param lexiconURL a URL for the file containing the lexicon.
     * @throws IOException if the lexicon cannot be opened or read.
     */
    public Lexicon(URL lexiconURL) throws IOException {
        this(lexiconURL, null);
    }

    /**
     * Loads a lexicon from the given URL and stores every entry in this map.
     * <p>
     * The file is read line by line. Each line is split on whitespace: the
     * first token is the lexical entry (the map key) and the remaining
     * tokens, in order, are its POS categories (the map value). A line with
     * a single token yields an empty category list. If the same entry occurs
     * on several lines, the last occurrence replaces the earlier ones. Lines
     * that are empty or contain only whitespace are skipped.
     *
     * @param lexiconURL a URL for the file containing the lexicon.
     * @param encoding the character encoding to use for reading the lexicon;
     *        if {@code null}, the reader is created without an explicit
     *        encoding.
     * @throws IOException if the lexicon cannot be opened or read.
     */
    public Lexicon(URL lexiconURL, String encoding) throws IOException {
        BufferedReader lexiconReader = null;
        InputStream lexiconStream = null;

        try {
            lexiconStream = lexiconURL.openStream();

            if (encoding == null) {
                lexiconReader = new BomStrippingInputStreamReader(lexiconStream);
            } else {
                lexiconReader = new BomStrippingInputStreamReader(lexiconStream, encoding);
            }

            String line = lexiconReader.readLine();
            while (line != null) {
                StringTokenizer tokens = new StringTokenizer(line);
                // A blank line has no tokens; calling nextToken() on it would
                // throw NoSuchElementException, so such lines are ignored.
                if (tokens.hasMoreTokens()) {
                    String entry = tokens.nextToken();
                    List<String> categories = new ArrayList<String>();
                    while (tokens.hasMoreTokens()) {
                        categories.add(tokens.nextToken());
                    }
                    put(entry, categories);
                }

                line = lexiconReader.readLine();
            } //while(line != null)
        } finally {
            // The stream is closed separately from the reader because the
            // reader may never have been created (e.g. its constructor threw),
            // in which case only the raw stream is open.
            IOUtils.closeQuietly(lexiconReader);
            IOUtils.closeQuietly(lexiconStream);
        }
    }//public Lexicon(URL lexiconURL, String encoding) throws IOException

}//class Lexicon