net.java.sen.tools.DictionaryMaker.java Source code

Introduction

Here is the source code for net.java.sen.tools.DictionaryMaker.java
Source

/*
 * DictionaryMaker.java - DictionaryMaker utility to make dictionary.
 * 
 * Copyright (C) 2001, 2002 Taku Kudoh, Takashi Okamoto Taku Kudoh
 * <taku-ku@is.aist-nara.ac.jp> Takashi Okamoto <tora@debian.org>
 * 
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *  
 */

package net.java.sen.tools;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.StringTokenizer;
import java.util.Vector;

import net.java.sen.util.CSVParser;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Assigns numeric ids to morpheme-type rules and looks those ids up again.
 *
 * <p>A rule is a comma-separated list of fields; a field starting with
 * <code>'*'</code> acts as a wildcard when rules are compared. Typical use:
 * register rules with {@link #add(String)}, call {@link #build()} once, then
 * query with {@link #getDicId(String)} / {@link #getById(int)}.
 *
 * <p>The class keeps mutable state in unsynchronised fields.
 */
public class DictionaryMaker {
    private static final Log log = LogFactory.getLog(DictionaryMaker.class);

    /** Debug switch; it is not read anywhere inside this class. */
    public static boolean debug = false;

    /** Rules registered via add(), in insertion order, until build() consumes them. */
    LinkedHashSet ruleSet = new LinkedHashSet();

    /** idList[id] = Vector of Integer ids of the rules selected by getIdList() for rule id. */
    public Vector idList = new Vector();

    /** ruleList[id] = the rule split into its comma-separated fields (String[]). */
    Vector ruleList = new Vector();

    /** Cache: rule text without its last field -> Integer(id + 1); the value 0 means "not found". */
    HashMap dic2IdHash = new HashMap();

    /** Rule text -> Integer id assigned by build(). */
    HashMap rule2IdHash = new HashMap();

    /** Keys are the last fields of all rules whose last field does not start with '*'. */
    HashMap isLexcalized = new HashMap();

    /**
     * Registers a rule for the next {@link #build()}. Duplicates are ignored
     * because the rules are collected in a set.
     *
     * @param rule comma-separated rule text
     */
    public void add(String rule) {
        ruleSet.add(rule);
    }

    /**
     * Assigns ids (0, 1, 2, ... in registration order) to all registered
     * rules, splits them into fields and fills {@link #idList}.
     *
     * <p>NOTE(review): ids always restart at 0, so calling build() again after
     * adding more rules overwrites the earlier entries of ruleList while the
     * lookup maps keep their old content. The class appears to be designed for
     * a single build() call - confirm before relying on repeated builds.
     *
     * @throws IllegalArgumentException if a registered rule has no fields
     *         (it is empty or consists only of commas)
     */
    public void build() {
        int size = 0;
        for (Iterator i = ruleSet.iterator(); i.hasNext();) {
            String str = (String) i.next();

            // Split the rule into its fields. StringTokenizer skips empty
            // fields, so every token has at least one character.
            StringTokenizer st = new StringTokenizer(str, ",");
            int len = st.countTokens();
            if (len == 0) {
                // Previously this surfaced as a bare ArrayIndexOutOfBoundsException
                // after the internal tables had already been modified.
                throw new IllegalArgumentException("rule has no fields: '" + str + "'");
            }
            String[] tokenList = new String[len];
            for (int j = 0; j < len; j++) {
                tokenList[j] = st.nextToken();
            }

            ruleList.setSize(size + 1);
            rule2IdHash.put(str, new Integer(size));
            ruleList.set(size, tokenList);

            // Remember last fields that are concrete values rather than wildcards.
            String lastField = tokenList[len - 1];
            if (lastField.charAt(0) != '*') {
                isLexcalized.put(lastField, "1");
            }
            size++;
        }

        ruleSet.clear();

        idList.setSize(ruleList.size());
        for (int i = 0; i < ruleList.size(); i++) {
            Vector v = new Vector();
            idList.set(i, v);
            getIdList((String[]) ruleList.get(i), v, 0);
        }
    }

    /**
     * Collects into <code>result</code> the ids of all rules that match
     * <code>csv</code> field by field.
     *
     * <p>A rule field matches when it equals the corresponding csv field, or
     * when the wildcard side starts with <code>'*'</code>: the csv field if
     * <code>parent == 0</code>, the rule field if <code>parent == 1</code>.
     *
     * <p>Every rule is expected to have at least <code>csv.length</code>
     * fields; a shorter rule causes an ArrayIndexOutOfBoundsException.
     *
     * @param csv    fields to match against the rules
     * @param result receives the matching ids as Integer; previous content is discarded
     * @param parent 0 = wildcards are taken from csv, 1 = wildcards are taken from the rules
     * @return number of matching rules
     */
    int getIdList(String csv[], Vector result, int parent) {
        // Start with every rule id as a candidate ...
        result.setSize(ruleList.size());
        for (int j = 0; j < ruleList.size(); j++) {
            result.set(j, new Integer(j));
        }

        // ... then narrow the candidates down one field at a time, compacting
        // the survivors to the front of the vector.
        for (int j = 0; j < csv.length; j++) {
            int kept = 0;
            for (int n = 0; n < result.size(); n++) {
                int id = ((Integer) result.get(n)).intValue();
                String ruleField = ((String[]) ruleList.get(id))[j];
                if ((parent == 0 && csv[j].charAt(0) == '*')
                        || (parent == 1 && ruleField.charAt(0) == '*')
                        || ruleField.equals(csv[j])) {
                    result.set(kept++, result.get(n));
                }
            }
            result.setSize(kept);
        }
        return result.size();
    }

    /**
     * Finds the id of the most specific rule matching <code>csv</code>, i.e.
     * the matching rule with the most non-wildcard fields (the first one wins
     * on ties).
     *
     * @param csv fields of the entry to classify
     * @return id of the best matching rule, or -1 (after logging an error) if none matches
     */
    private int getDicIdNoCache(String csv[]) {
        Vector result = new Vector();

        getIdList(csv, result, 1);

        if (result.size() == 0) {
            log.error("can't find morpheme type");
            log.error("input string is here:");
            log.error("ruleList size=" + ruleList.size());

            StringBuffer buf = new StringBuffer();
            for (int i = 0; i < csv.length; i++) {
                buf.append(csv[i]);
                buf.append(",");
            }
            log.error(buf);
            return -1;
        }

        // priority[i] = number of non-wildcard fields of candidate i.
        int priority[] = new int[result.size()];
        int max = 0;
        for (int i = 0; i < result.size(); i++) {
            String v[] = (String[]) ruleList.get(((Integer) result.get(i)).intValue());
            for (int j = 0; j < v.length; j++) {
                if (v[j].charAt(0) != '*') {
                    priority[i]++;
                }
            }
            if (priority[max] < priority[i]) {
                max = i;
            }
            // Guarded so the rule text is only assembled when it is really logged.
            if (log.isDebugEnabled()) {
                log.debug("detect==");
                log.debug(getById(((Integer) result.get(max)).intValue()));
            }
        }
        return ((Integer) result.get(max)).intValue();
    }

    /**
     * @return number of rules known after {@link #build()}
     */
    public int size() {
        return ruleList.size();
    }

    /**
     * Returns the id of the rule that best matches a dictionary entry.
     *
     * <p>If the last field of the entry is a known lexicalized value the rules
     * are searched directly. Otherwise the result is cached under the entry
     * text without its last field; failed lookups are not served from the cache.
     *
     * @param rule comma-separated entry text (parsed with CSVParser)
     * @return id of the best matching rule, or -1 if no rule matches
     * @throws RuntimeException wrapping the IOException if parsing fails
     */
    public int getDicId(String rule) {
        CSVParser parser = new CSVParser(rule);
        String csv[] = null;
        try {
            csv = parser.nextTokens();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        String lex = csv[csv.length - 1];
        if (isLexcalized.get(lex) != null) {
            return getDicIdNoCache(csv);
        }

        String pos = removeEndField(rule);

        // Cached values are stored as id + 1 so that 0 can mean "not found".
        Object cached = dic2IdHash.get(pos);
        if (cached != null && ((Integer) cached).intValue() != 0) {
            return ((Integer) cached).intValue() - 1;
        }

        int rg = getDicIdNoCache(csv);

        log.debug("" + ruleList.size() + ":dic2IdHash(" + pos + ")=" + (rg + 1));

        dic2IdHash.put(pos, new Integer(rg + 1));
        return rg;
    }

    /**
     * Returns the id list computed by {@link #build()} for a registered rule.
     *
     * @param rule rule text exactly as passed to {@link #add(String)}
     * @return Vector of Integer ids stored in idList for that rule
     * @throws NullPointerException if the rule has no id, i.e. it was never built
     */
    Vector getRuleIdList(String rule) {
        return (Vector) idList.get(((Integer) rule2IdHash.get(rule)).intValue());
    }

    /**
     * Cuts off the last comma-separated field.
     *
     * @param str comma-separated text
     * @return everything before the last comma, or the empty string if there is no comma
     */
    private static String removeEndField(String str) {
        int last = str.lastIndexOf(',');
        return last < 0 ? "" : str.substring(0, last);
    }

    /**
     * Renders a rule as text for diagnostics.
     *
     * @param id rule id as assigned by {@link #build()}
     * @return the fields of the rule, each followed by a comma, terminated by
     *         a newline; or the string "null" if no rule is stored under the id
     */
    public String getById(int id) {
        String[] r = (String[]) ruleList.get(id);
        StringBuffer buf = new StringBuffer();
        if (r != null) {
            for (int i = 0; i < r.length; i++) {
                buf.append(r[i]).append(",");
            }
            buf.append("\n");
        } else {
            buf.append("null");
        }
        // The original returned null here and threw the assembled text away.
        return buf.toString();
    }
}