gpl.pierrick.brihaye.aramorph.InMemoryDictionaryHandler.java Source code

Introduction

Here is the source code for gpl.pierrick.brihaye.aramorph.InMemoryDictionaryHandler.java
Source

/*
Copyright (C) 2003  Pierrick Brihaye
pierrick.brihaye@wanadoo.fr
     
Original Perl code :
Portions (c) 2002 QAMUS LLC (www.qamus.org), 
(c) 2002 Trustees of the University of Pennsylvania 
     
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/

package gpl.pierrick.brihaye.aramorph;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.collections.MultiHashMap;

/** An in-memory dictionary of prefixes, stems, suffixes and combinations fed with
 * resources available in the classpath.
 * TODO : use a Lucene index ;-) or any other fast-access resources.
 *@author Pierrick Brihaye, 2003
 */
class InMemoryDictionaryHandler {

    /** Character encoding used to decode the dictionary and compatibility-table resources. */
    private static final String RESOURCE_ENCODING = "ISO8859_1";

    /** Classpath folder (relative to this class) that holds the resources. */
    private static final String RESOURCE_FOLDER = "dictionaries/";

    /** Matches a whole gloss field carrying an explicit <CODE>&lt;pos&gt;</CODE> tag; group 1 is the POS. */
    private static final Pattern EXPLICIT_POS = Pattern.compile(".*" + "<pos>(.+?)</pos>" + ".*");
    /** Finds the <CODE>&lt;pos&gt;</CODE> tag itself so that it can be stripped from the gloss. */
    private static final Pattern POS_TAG = Pattern.compile("<pos>.+?</pos>");
    /** Morphology of the null prefix or the null suffix. */
    private static final Pattern NULL_AFFIX = Pattern.compile("^(Pref-0|Suff-0)$");
    /** Morphology starting with <CODE>F</CODE> (tagged as FUNC_WORD). */
    private static final Pattern FUNC_WORD = Pattern.compile("^F" + ".*");
    /** Morphology starting with <CODE>IV</CODE> (tagged as VERB_IMPERFECT). */
    private static final Pattern VERB_IMPERFECT = Pattern.compile("^IV" + ".*");
    /** Morphology starting with <CODE>PV</CODE> (tagged as VERB_PERFECT). */
    private static final Pattern VERB_PERFECT = Pattern.compile("^PV" + ".*");
    /** Morphology starting with <CODE>CV</CODE> (tagged as VERB_IMPERATIVE). */
    private static final Pattern VERB_IMPERATIVE = Pattern.compile("^CV" + ".*");
    /** Morphology starting with <CODE>N</CODE> (tagged as NOUN or NOUN_PROP). */
    private static final Pattern NOUN = Pattern.compile("^N" + ".*");
    /** Gloss starting with an ASCII capital letter : used to guess proper names. */
    private static final Pattern CAPITALIZED = Pattern.compile("^[A-Z]" + ".*");
    /** A run of whitespace, collapsed to a single space in compatibility tables. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /** Accented Latin-1 letters that are folded to one plain ASCII letter in glosses.
     * Written as Unicode escapes so that the source file does not depend on its own encoding.
     * NOTE(review) : the original literals were lost (only the replacement letters survived);
     * this list was rebuilt from those replacements, following Latin-1 code point order.
     * Confirm against the upstream sources.
     */
    private static final String ACCENTED_LETTERS =
            "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5" // capital A variants
            + "\u00C7" // capital C cedilla
            + "\u00C8\u00C9\u00CA\u00CB" // capital E variants
            + "\u00CC\u00CD\u00CE\u00CF" // capital I variants
            + "\u00D1" // capital N tilde
            + "\u00D2\u00D3\u00D4\u00D5\u00D6" // capital O variants
            + "\u00D9\u00DA\u00DB\u00DC" // capital U variants
            + "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5" // small a variants
            + "\u00E7" // small c cedilla
            + "\u00E8\u00E9\u00EA\u00EB" // small e variants
            + "\u00EC\u00ED\u00EE\u00EF" // small i variants
            + "\u00F1" // small n tilde
            + "\u00F2\u00F3\u00F4\u00F5\u00F6" // small o variants
            + "\u00F9\u00FA\u00FB\u00FC"; // small u variants

    /** Plain replacements; the character at index <CODE>i</CODE> replaces <CODE>ACCENTED_LETTERS.charAt(i)</CODE>. */
    private static final String PLAIN_LETTERS =
            "AAAAAA" + "C" + "EEEE" + "IIII" + "N" + "OOOOO" + "UUUU"
            + "aaaaaa" + "c" + "eeee" + "iiii" + "n" + "ooooo" + "uuuu";

    /** The unique instance of this handler. */
    private static InMemoryDictionaryHandler handler = null;
    /** Dictionary of prefixes */
    private static final MultiHashMap prefixes = new MultiHashMap(78);
    /** Dictionary of stems */
    private static final MultiHashMap stems = new MultiHashMap(47261);
    /** Dictionary of suffixes */
    private static final MultiHashMap suffixes = new MultiHashMap(206);
    /** Compatibility table for prefixes-stems combinations.
     * TODO : definitely not the best container
     */
    private static final HashSet hash_AB = new HashSet(1648);
    /** Compatibility table for prefixes-suffixes combinations.
     * TODO : definitely not the best container
     */
    private static final HashSet hash_AC = new HashSet(598);
    /** Compatibility table for stems-suffixes combinations.
     * TODO : definitely not the best container
     */
    private static final HashSet hash_BC = new HashSet(1285);

    /** Private constructor to avoid multiple instanciations.
     * Loads the three lexicons, then the three compatibility tables, into the static containers
     * and reports progress on <CODE>System.out</CODE>.
     * @throws RuntimeException If a resource is missing, unreadable or malformed
     */
    private InMemoryDictionaryHandler() {
        System.out.println("Initializing in-memory dictionary handler...");
        // load 3 lexicons
        loadDictionary(prefixes, "dictPrefixes", openResource("dictPrefixes"));
        loadDictionary(stems, "dictStems", openResource("dictStems"));
        loadDictionary(suffixes, "dictSuffixes", openResource("dictSuffixes"));
        // load 3 compatibility tables
        loadCompatibilityTable(hash_AB, "tableAB", openResource("tableAB"));
        loadCompatibilityTable(hash_AC, "tableAC", openResource("tableAC"));
        loadCompatibilityTable(hash_BC, "tableBC", openResource("tableBC"));
        System.out.println("... done.");
    }

    /** Opens a resource of the dictionaries folder.
     * @param name The resource name
     * @return The stream, or <CODE>null</CODE> if the resource can not be found
     */
    private static InputStream openResource(String name) {
        return InMemoryDictionaryHandler.class.getResourceAsStream(RESOURCE_FOLDER + name);
    }

    /** Returns a unique instance of the handler. The first call loads every resource;
     * if loading fails the exception is propagated and no instance is retained.
     * @return The instance
     */
    protected static synchronized InMemoryDictionaryHandler getHandler() {
        if (handler == null) {
            // assigned here, once construction has completed, rather than from inside the constructor
            handler = new InMemoryDictionaryHandler();
        }
        return handler;
    }

    /** Whether or not the prefix is in the dictionary.
     * The dictionaries are filled when the handler is first created; before that, nothing is found.
     * @param translitered The prefix
     * @return The result
     */
    protected static boolean hasPrefix(String translitered) {
        return prefixes.containsKey(translitered);
    }

    /** Returns an iterator on the solutions for the given prefix.
     * @param translitered The prefix
     * @return The iterator, or <CODE>null</CODE> if the prefix is not in the dictionary
     */
    protected Iterator getPrefixIterator(String translitered) {
        return iteratorOrNull(prefixes, translitered);
    }

    /** Whether or not the stem is in the dictionary.
     * The dictionaries are filled when the handler is first created; before that, nothing is found.
     * @param translitered The stem
     * @return The result
     */
    protected static boolean hasStem(String translitered) {
        return stems.containsKey(translitered);
    }

    /** Returns an iterator on the solutions for the given stem.
     * @param translitered The stem
     * @return The iterator, or <CODE>null</CODE> if the stem is not in the dictionary
     */
    protected Iterator getStemIterator(String translitered) {
        return iteratorOrNull(stems, translitered);
    }

    /** Whether or not the suffix is in the dictionary.
     * The dictionaries are filled when the handler is first created; before that, nothing is found.
     * @param translitered The suffix
     * @return The result
     */
    protected static boolean hasSuffix(String translitered) {
        return suffixes.containsKey(translitered);
    }

    /** Returns an iterator on the solutions for the given suffix.
     * @param translitered The suffix
     * @return The iterator, or <CODE>null</CODE> if the suffix is not in the dictionary
     */
    protected Iterator getSuffixIterator(String translitered) {
        return iteratorOrNull(suffixes, translitered);
    }

    /** Returns an iterator on the values stored for a key.
     * @param dictionary The dictionary
     * @param key The key
     * @return The iterator, or <CODE>null</CODE> if nothing is stored for the key
     */
    private static Iterator iteratorOrNull(MultiHashMap dictionary, String key) {
        Collection solutions = (Collection) dictionary.get(key);
        return (solutions == null) ? null : solutions.iterator();
    }

    /** Whether or not the prefix/stem combination is possible.
     * The lookup key is <CODE>A</CODE>, one space, then <CODE>B</CODE>, i.e. a line of the
     * compatibility table once its whitespace has been normalized.
     * @param A The prefix part of the combination (presumably its morphological category ; confirm against callers)
     * @param B The stem part of the combination
     * @return The result
     */
    protected static boolean hasAB(String A, String B) {
        return hash_AB.contains(A + " " + B);
    }

    /** Whether or not the prefix/suffix combination is possible.
     * The lookup key is <CODE>A</CODE>, one space, then <CODE>C</CODE>.
     * @param A The prefix part of the combination
     * @param C The suffix part of the combination
     * @return The result
     */
    protected static boolean hasAC(String A, String C) {
        return hash_AC.contains(A + " " + C);
    }

    /** Whether or not the stem/suffix combination is possible.
     * The lookup key is <CODE>B</CODE>, one space, then <CODE>C</CODE>.
     * @param B The stem part of the combination
     * @param C The suffix part of the combination
     * @return The result
     */
    protected static boolean hasBC(String B, String C) {
        return hash_BC.contains(B + " " + C);
    }

    /** Loads a dictionary into a multi-valued map where the key is the entry and the values are
     * the <CODE>DictionaryEntry</CODE> solutions for this entry (each entry can have multiple values).
     * <p>Lines starting with <CODE>";; "</CODE> declare the current lemma, other lines starting with
     * <CODE>";"</CODE> are comments, and every remaining line must hold 4 tab-separated fields :
     * entry, vocalization, morphology and gloss (the gloss optionally embeds a <CODE>&lt;pos&gt;</CODE> tag).
     * @param set The map to fill
     * @param name A human-readable name
     * @param is The stream ; it is always closed before this method returns
     * @throws RuntimeException If the stream is missing or unreadable, if a lemma is declared twice,
     * if a line does not have 4 fields or if no POS can be deduced for a line
     */
    private static void loadDictionary(MultiHashMap set, String name, InputStream is) throws RuntimeException {
        Set lemmas = new HashSet();
        int forms = 0;
        String lemmaID = "";
        System.out.print("Loading dictionary : " + name + " ");
        if (is == null) {
            // getResourceAsStream() returns null for a missing resource
            throw new RuntimeException("Can not open : " + name);
        }
        LineNumberReader in = null;
        try {
            in = new LineNumberReader(new InputStreamReader(is, RESOURCE_ENCODING));
            String line;
            while ((line = in.readLine()) != null) {
                int lineNumber = in.getLineNumber();
                // progress indicator : one dot every 1000 lines
                if ((lineNumber % 1000) == 1) {
                    System.out.print(".");
                }
                if (line.startsWith(";; ")) {
                    // new lemma ; lemmaID's must be unique
                    lemmaID = line.substring(3);
                    if (!lemmas.add(lemmaID)) {
                        throw new RuntimeException("Lemma " + lemmaID + " in " + name + " (line "
                                + lineNumber + ") isn't unique");
                    }
                } else if (!line.startsWith(";")) { // anything else starting with ";" is a comment
                    String[] fields = line.split("\t", -1); // -1 to keep trailing empty fields
                    // a little error-checking won't hurt :
                    if (fields.length != 4) {
                        throw new RuntimeException("Entry in " + name + " (line " + lineNumber
                                + ") doesn't have 4 fields (3 tabs)");
                    }
                    String entry = fields[0]; // used as key
                    String vocalization = fields[1];
                    String morphology = fields[2];
                    String glossPOS = fields[3];

                    String POS = resolvePOS(morphology, vocalization, glossPOS, name, lineNumber);
                    String gloss = cleanGloss(glossPOS);

                    // note that although we read 4 fields from the dict we now save 5 fields in the hash table
                    // because the info in last field, glossPOS, was split into two: gloss and POS
                    // MultiHashMap.put() appends to the values already stored for the key
                    set.put(entry, new DictionaryEntry(entry, lemmaID, vocalization, morphology, gloss, POS));
                    forms++;
                }
            }
            System.out.println();
            if (!"".equals(lemmaID)) {
                System.out.print(lemmas.size() + " lemmas and ");
            }
            System.out.println(set.size() + " entries totalizing " + forms + " forms");
        } catch (IOException e) {
            throw new RuntimeException("Can not open : " + name, e);
        } finally {
            closeQuietly(in, is);
        }
    }

    /** Determines the POS of a dictionary line, either explicitly from the <CODE>&lt;pos&gt;</CODE> tag of its
     * gloss field or, when there is no such tag, by deduction from the morphology (and, for nouns, the gloss).
     * @param morphology The morphology field
     * @param vocalization The vocalization field
     * @param glossPOS The raw gloss field
     * @param name A human-readable name of the dictionary, for error reporting
     * @param lineNumber The current line, for error reporting
     * @return The POS ; empty for the null prefix and the null suffix
     * @throws RuntimeException If no POS can be deduced
     */
    private static String resolvePOS(String morphology, String vocalization, String glossPOS, String name,
            int lineNumber) throws RuntimeException {
        // (1) explicitly, by extracting it from the gloss field
        Matcher explicit = EXPLICIT_POS.matcher(glossPOS);
        if (explicit.matches()) {
            return explicit.group(1);
        }
        // (2) by deduction
        if (NULL_AFFIX.matcher(morphology).matches()) {
            return "";
        }
        if (FUNC_WORD.matcher(morphology).matches()) {
            return vocalization + "/FUNC_WORD";
        }
        if (VERB_IMPERFECT.matcher(morphology).matches()) {
            return vocalization + "/VERB_IMPERFECT";
        }
        if (VERB_PERFECT.matcher(morphology).matches()) {
            return vocalization + "/VERB_PERFECT";
        }
        if (VERB_IMPERATIVE.matcher(morphology).matches()) {
            return vocalization + "/VERB_IMPERATIVE";
        }
        if (NOUN.matcher(morphology).matches()) {
            // educated guess (99% correct) : a capitalized gloss denotes a proper name
            if (CAPITALIZED.matcher(glossPOS).matches()) {
                return vocalization + "/NOUN_PROP";
            }
            // vocalizations ending in "iy~" used to be tagged NOUN_ADJ (some of these are really ADJ's
            // and need to be tagged manually) ; they are now tagged like any other noun
            return vocalization + "/NOUN";
        }
        throw new RuntimeException("No POS can be deduced in " + name + " (line " + lineNumber + ")");
    }

    /** Cleans up a gloss : removes the first <CODE>&lt;pos&gt;</CODE> tag and the surrounding white space,
     * turns semicolons into slashes and converts upper-ASCII letters to plain ASCII (they do not convert
     * well to UTF-8).
     * @param glossPOS The raw gloss field
     * @return The cleaned gloss
     */
    private static String cleanGloss(String glossPOS) {
        String gloss = POS_TAG.matcher(glossPOS).replaceFirst("").trim();
        gloss = gloss.replace(';', '/'); //TODO : is it necessary ?
        StringBuffer folded = new StringBuffer(gloss.length());
        for (int i = 0; i < gloss.length(); i++) {
            char c = gloss.charAt(i);
            int index = ACCENTED_LETTERS.indexOf(c);
            if (index >= 0) {
                folded.append(PLAIN_LETTERS.charAt(index));
                continue;
            }
            // letters that expand to two characters
            // NOTE(review) : the resources are decoded as ISO-8859-1, which has no S/Z with caron ;
            // if the files are really Windows-1252 those four cases can never match. Confirm.
            switch (c) {
                case '\u00C6': // capital AE ligature
                    folded.append("AE");
                    break;
                case '\u0160': // capital S with caron
                    folded.append("Sh");
                    break;
                case '\u017D': // capital Z with caron
                    folded.append("Zh");
                    break;
                case '\u00DF': // sharp s
                    folded.append("ss");
                    break;
                case '\u00E6': // small ae ligature
                    folded.append("ae");
                    break;
                case '\u0161': // small s with caron
                    folded.append("sh");
                    break;
                case '\u017E': // small z with caron
                    folded.append("zh");
                    break;
                default:
                    folded.append(c);
                    break;
            }
        }
        return folded.toString();
    }

    /** Loads a compatibility table into a <CODE>Set</CODE>. Lines starting with <CODE>";"</CODE> are
     * comments ; every other line is trimmed, its runs of white space are collapsed to a single space
     * and the result is added to the set.
     * @param set The set
     * @param name A human-readable name
     * @param is The stream ; it is always closed before this method returns
     * @throws RuntimeException If the stream is missing or a problem occurs when reading the compatibility table
     */
    private static void loadCompatibilityTable(Set set, String name, InputStream is) throws RuntimeException {
        System.out.print("Loading compatibility table : " + name + " ");
        if (is == null) {
            // getResourceAsStream() returns null for a missing resource
            throw new RuntimeException("Can not open : " + name);
        }
        LineNumberReader in = null;
        try {
            in = new LineNumberReader(new InputStreamReader(is, RESOURCE_ENCODING));
            String line;
            while ((line = in.readLine()) != null) {
                // progress indicator : one dot every 1000 lines
                if ((in.getLineNumber() % 1000) == 1) {
                    System.out.print(".");
                }
                if (!line.startsWith(";")) { //Ignore comments
                    set.add(WHITESPACE_RUN.matcher(line.trim()).replaceAll(" "));
                }
            }
            System.out.println();
            System.out.println(set.size() + " entries");
        } catch (IOException e) {
            throw new RuntimeException("Can not open : " + name, e);
        } finally {
            closeQuietly(in, is);
        }
    }

    /** Closes the reader or, if the reader could not be created, the underlying stream.
     * A failure to close is ignored : the resource was only read from, so nothing can be lost.
     * @param reader The reader, possibly <CODE>null</CODE>
     * @param is The underlying stream, possibly <CODE>null</CODE>
     */
    private static void closeQuietly(LineNumberReader reader, InputStream is) {
        try {
            if (reader != null) {
                reader.close(); // also closes the underlying stream
            } else if (is != null) {
                is.close();
            }
        } catch (IOException ignored) {
            // deliberately ignored, see above
        }
    }

}