net.java.sen.tools.MkSenDic.java Source code

Java tutorial

Introduction

Here is the source code for net.java.sen.tools.MkSenDic.java

Source

/*
 * MkChaDic.java - MkChaDic utility to make dictionary.
 * 
 * Copyright (C) 2001, 2002 Taku Kudoh, Takashi Okamoto Taku Kudoh
 * <taku-ku@is.aist-nara.ac.jp> Takashi Okamoto <tora@debian.org>
 * 
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *  
 */

package net.java.sen.tools;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.ResourceBundle;
import java.util.Vector;

import net.java.sen.CToken;
import net.java.sen.util.CSVData;
import net.java.sen.util.CSVParser;
import net.java.sen.util.DoubleArrayTrie;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class MkSenDic {
    private static Log log = LogFactory.getLog(MkSenDic.class);

    /**
     * Build sen dictionary.
     * 
     * @param args
     *            custom dictionary files. see dic/build.xml.
     */
    public static void main(String args[]) {
        ResourceBundle rb = ResourceBundle.getBundle("dictionary");
        DictionaryMaker dm1 = new DictionaryMaker();
        DictionaryMaker dm2 = new DictionaryMaker();
        DictionaryMaker dm3 = new DictionaryMaker();

        // 1st field information of connect file.
        Vector rule1 = new Vector();

        // 2nd field information of connect file.
        Vector rule2 = new Vector();

        // 3rd field information of connect file.
        Vector rule3 = new Vector();

        // 4th field information of connect file.
        // this field shows cost of morpheme connection
        // [size3*(x3*size2+x2)+x1]
        // [size3*(Attr1*size2+Attr2)+Attl]
        short score[] = new short[20131];

        long start = System.currentTimeMillis();

        // /////////////////////////////////////////
        //
        // Step1. Loading connetion file.
        //
        log.info("(1/7): reading connection matrix ... ");
        try {
            log.info("connection file = " + rb.getString("text_connection_file"));
            log.info("charset = " + rb.getString("dic.charset"));
            CSVParser csvparser = new CSVParser(new FileInputStream(rb.getString("text_connection_file")),
                    rb.getString("dic.charset"));
            String t[];
            int line = 0;
            while ((t = csvparser.nextTokens()) != null) {
                if (t.length < 4) {
                    log.warn("invalid line in " + rb.getString("text_connection_file") + ":" + line);
                    log.warn(rb.getString("text_connection_file") + "may be broken.");
                    break;
                }
                dm1.add(t[0]);
                rule1.add(t[0]);

                dm2.add(t[1]);
                rule2.add(t[1]);

                dm3.add(t[2]);
                rule3.add(t[2]);

                if (line == score.length) {
                    score = resize(score);
                }

                score[line++] = (short) Integer.parseInt(t[3]);
            }

            // /////////////////////////////////////////
            //
            // Step2. Building internal dictionary
            //
            log.info("(2/7): building type dictionary ... ");
            dm1.build();
            dm2.build();
            dm3.build();

            // if you want check specified morpheme, you uncomment and modify
            // following line:
            /*
             * System.out.print("22="); dm3.getById(22);
             * System.out.print("368="); dm3.getById(368);
             * 
             * System.out.println(dm3.getDicId("?????*,*,*,*,?"));
             * DictionaryMaker.debug = true;
             * System.out.println(dm3.getDicId("?????*,*,*,*,?"));
             * System.out.println(dm3.getDicIdNoCache("?????*,*,*,*,?"));
             */

        } catch (IOException e) {
            e.printStackTrace();
            System.exit(0);
        }

        // -------------------------------------------------

        int size1 = dm1.size();
        int size2 = dm2.size();
        int size3 = dm3.size();
        int ruleSize = rule1.size();
        short matrix[] = new short[size1 * size2 * size3];
        short default_cost = (short) Integer.parseInt(rb.getString("default_connection_cost"));

        // /////////////////////////////////////////
        //
        // Step3. Writing Connection Matrix
        //
        log.info("(3/7): writing conection matrix (" + size1 + " x " + size2 + " x " + size3 + " = "
                + size1 * size2 * size3 + ") ...");

        for (int i = 0; i < (int) (size1 * size2 * size3); i++)
            matrix[i] = default_cost;

        for (int i = 0; i < ruleSize; i++) {
            Vector r1 = dm1.getRuleIdList((String) rule1.get(i));
            Vector r2 = dm2.getRuleIdList((String) rule2.get(i));
            Vector r3 = dm3.getRuleIdList((String) rule3.get(i));

            for (Iterator i1 = r1.iterator(); i1.hasNext();) {
                int ii1 = ((Integer) i1.next()).intValue();
                for (Iterator i2 = r2.iterator(); i2.hasNext();) {
                    int ii2 = ((Integer) i2.next()).intValue();
                    for (Iterator i3 = r3.iterator(); i3.hasNext();) {
                        int ii3 = ((Integer) i3.next()).intValue();
                        int pos = size3 * (size2 * ii1 + ii2) + ii3;
                        matrix[pos] = score[i];
                    }
                }
            }
        }

        try {
            DataOutputStream out = new DataOutputStream(
                    new BufferedOutputStream(new FileOutputStream(rb.getString("matrix_file"))));
            out.writeShort(size1);
            out.writeShort(size2);
            out.writeShort(size3);
            for (int i1 = 0; i1 < size1; i1++)
                for (int i2 = 0; i2 < size2; i2++)
                    for (int i3 = 0; i3 < size3; i3++) {
                        out.writeShort(matrix[size3 * (size2 * i1 + i2) + i3]);
                        // if (matrix[size3 * (size2 * i1 + i2) + i3] !=
                        // default_cost) {
                        // }
                    }
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(0);
        }

        matrix = null;
        score = null;

        // -------------------------------------------------

        int pos_start = Integer.parseInt(rb.getString("pos_start"));
        int pos_size = Integer.parseInt(rb.getString("pos_size"));

        int di = 0;
        int offset = 0;
        ArrayList dicList = new ArrayList();

        // /////////////////////////////////////////
        //
        // Step4. Reading Morpheme Information
        //
        log.info("(4/7): reading morpheme information ... ");
        String t = null;
        String[] csv = null;
        try {
            // writer for feature file.
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(rb.getString("pos_file")), rb.getString("sen.charset")));

            log.info("load dic: " + rb.getString("text_dic_file"));
            BufferedReader dicStream = null;
            int custom_dic = -1;
            if (args.length == 0) {
                dicStream = new BufferedReader(new InputStreamReader(
                        new FileInputStream(rb.getString("text_dic_file")), rb.getString("dic.charset")));
            } else {
                custom_dic = 0;
                dicStream = new BufferedReader(
                        new InputStreamReader(new FileInputStream(args[custom_dic]), rb.getString("dic.charset")));
            }

            int line = 0;

            CSVData key_b = new CSVData();
            CSVData pos_b = new CSVData();

            while (true) {
                t = dicStream.readLine();
                if (t == null) {
                    dicStream.close();
                    custom_dic++;
                    if (args.length == custom_dic) {
                        break;
                    } else {
                        // read custum dictionary
                        log.info("load dic: " + "args[custum_dic]");
                        dicStream = new BufferedReader(new InputStreamReader(new FileInputStream(args[custom_dic]),
                                rb.getString("dic.charset")));
                    }
                    continue;
                }

                CSVParser parser = new CSVParser(t);
                csv = parser.nextTokens();
                if (csv.length < (pos_size + pos_start)) {
                    throw new RuntimeException("format error:" + t);
                }

                key_b.clear();
                pos_b.clear();
                for (int i = pos_start; i < (pos_start + pos_size - 1); i++) {
                    key_b.append(csv[i]);
                    pos_b.append(csv[i]);
                }

                key_b.append(csv[pos_start + pos_size - 1]);
                pos_b.append(csv[pos_start + pos_size - 1]);

                for (int i = pos_start + pos_size; i < (csv.length - 1); i++) {
                    pos_b.append(csv[i]);
                }
                pos_b.append(csv[csv.length - 1]);

                CToken token = new CToken();

                token.rcAttr2 = (short) dm1.getDicId(key_b.toString());
                token.rcAttr1 = (short) dm2.getDicId(key_b.toString());
                token.lcAttr = (short) dm3.getDicId(key_b.toString());
                token.posid = 0;
                token.posID = offset;
                token.length = (short) csv[0].length();
                token.cost = (short) Integer.parseInt(csv[1]);

                dicList.add(new PairObject(csv[0], token));

                byte b[] = pos_b.toString().getBytes(rb.getString("sen.charset"));
                offset += (b.length + 1);
                String pos_b_str = pos_b.toString();
                bw.write(pos_b_str, 0, pos_b_str.length());
                // bw.write(b, 0, b.length);
                bw.write(0);
                if (++di % 50000 == 0)
                    log.info("" + di + "... ");
            }
            bw.close();
            // ----end of writing feature.cha ----
        } catch (Exception e) {
            log.error("Error: " + t);
            e.printStackTrace();
            System.exit(1);
        }

        rule1 = null;
        rule2 = null;
        rule3 = null;

        // /////////////////////////////////////////
        //
        // Step5. Sort lexs and write to file
        //
        log.info("(5/7): sorting lex... ");

        int value[] = new int[dicList.size()];
        char key[][] = new char[dicList.size()][];
        int spos = 0;
        int dsize = 0;
        int bsize = 0;
        String prev = "";
        Collections.sort(dicList);

        // /////////////////////////////////////////
        //
        // Step6. Writing Token Information
        //
        log.info("(6/7): writing token... ");
        try {
            // writer for token file.
            DataOutputStream out = new DataOutputStream(
                    new BufferedOutputStream(new FileOutputStream(rb.getString("token_file"))));

            // writing 'bos' and 'eos' and 'unknown' token.
            CToken token = new CToken();
            token.rcAttr2 = (short) dm1.getDicId(rb.getString("bos_pos"));
            token.rcAttr1 = (short) dm2.getDicId(rb.getString("bos_pos"));
            token.lcAttr = (short) dm3.getDicId(rb.getString("bos_pos"));
            token.write(out);

            token.rcAttr2 = (short) dm1.getDicId(rb.getString("eos_pos"));
            token.rcAttr1 = (short) dm2.getDicId(rb.getString("eos_pos"));
            token.lcAttr = (short) dm3.getDicId(rb.getString("eos_pos"));
            token.write(out);

            token.rcAttr2 = (short) dm1.getDicId(rb.getString("unknown_pos"));
            token.rcAttr1 = (short) dm2.getDicId(rb.getString("unknown_pos"));
            token.lcAttr = (short) dm3.getDicId(rb.getString("unknown_pos"));
            token.posID = -1;
            token.write(out);
            log.info("key size = " + key.length);
            for (int i = 0; i < key.length; i++) {
                String k = (String) ((PairObject) dicList.get(i)).key;
                if (!prev.equals(k) && i != 0) {
                    key[dsize] = ((String) ((PairObject) dicList.get(spos)).key).toCharArray();
                    value[dsize] = bsize + (spos << 8);
                    dsize++;
                    bsize = 1;
                    spos = i;
                } else {
                    bsize++;
                }
                prev = (String) ((PairObject) dicList.get(i)).key;
                ((CToken) (((PairObject) dicList.get(i)).value)).write(out);
            }
            out.flush();
            out.close();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }

        key[dsize] = ((String) ((PairObject) dicList.get(spos)).key).toCharArray();

        value[dsize] = bsize + (spos << 8);
        dsize++;

        dm1 = null;
        dm2 = null;
        dm3 = null;
        dicList = null;

        // /////////////////////////////////////////
        //
        // Step7. Build Double Array
        //
        log.info("(7/7): building Double-Array (size = " + dsize + ") ...");

        DoubleArrayTrie da = new DoubleArrayTrie();

        da.build(key, null, value, dsize);
        try {
            da.save(rb.getString("double_array_file"));
        } catch (Exception e) {
            e.printStackTrace();
        }

        log.info("total time = " + (System.currentTimeMillis() - start) / 1000 + "[ms]");
    }

    private static short[] resize(short s[]) {
        short newbuf[] = new short[(int) (s.length * 1.5)];
        for (int i = 0; i < s.length; i++) {
            newbuf[i] = s[i];
        }
        return newbuf;
    }
}