org.wltea.analyzer.dic.Dictionary.java Source code

Java tutorial

Introduction

Here is the source code for org.wltea.analyzer.dic.Dictionary.java

Source

/**
 * IK Chinese word segmentation, version 5.0
 * IK Analyzer release 5.0
 * 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Source code provided by Linliangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 * 
 * 
 */
package org.wltea.analyzer.dic;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;
import org.wltea.analyzer.cfg.Configuration;

/**
 * Dictionary manager, implemented as a singleton.
 *
 * @author kevin
 */
public class Dictionary implements Watcher {

    /** Logger used for dictionary update and ZooKeeper event messages. */
    private static final Logger logger = Logger.getLogger(Dictionary.class);

    /** ZooKeeper node under which the dictionary nodes are created. */
    private static final String ZK_DIC_ROOT = "/dictionary";
    /** Path prefix of the extension-word nodes; ".add" / ".del" is appended to it. */
    private static final String ZK_DIC_EXT = ZK_DIC_ROOT + "/ext";
    /** Path prefix of the stop-word nodes; ".add" / ".del" is appended to it. */
    private static final String ZK_DIC_STOP = ZK_DIC_ROOT + "/stop";

    /*
     * The single shared Dictionary instance.
     * Declared volatile because initial() and getSingleton() read it without
     * holding the class lock; without volatile another thread could observe a
     * non-null reference to a not yet fully constructed instance.
     */
    private static volatile Dictionary singleton;

    /*
     * Main dictionary segment (filled by loadMainDict()).
     */
    private DictSegment _MainDict;

    /*
     * Stop-word dictionary segment.
     */
    private DictSegment _StopWordDict;
    /*
     * Quantifier dictionary segment (filled by loadQuantifierDict()).
     */
    private DictSegment _QuantifierDict;

    /**
     * Configuration that supplies the dictionary resource names.
     */
    private Configuration cfg;

    /**
     * ZooKeeper client used to publish and receive dictionary changes.
     */
    private ZooKeeper zk;

    /**
     * Creates the dictionary: loads the bundled dictionaries, then connects to ZooKeeper.
     *
     * @param cfg configuration that supplies the dictionary resource names
     */
    private Dictionary(Configuration cfg) {
        this.cfg = cfg;
        // Build every dictionary segment BEFORE registering this object as a
        // ZooKeeper watcher, so that process() never sees a missing segment.
        this.loadMainDict();
        this.loadQuantifierDict();
        // File-based stop words stay disabled (loadStopWordDict() is not called),
        // but the segment itself must exist: isStopWord() and the stop-word
        // branches of process() dereference it.
        this._StopWordDict = new DictSegment((char) 0);
        this.initZk();
    }

    /**
     * Initialises the dictionary singleton.
     * The instance is created on the first call only; later calls return the
     * already created instance and ignore {@code cfg}. The field is checked
     * once without the lock and once more while holding the class lock.
     *
     * @param cfg configuration used to build the dictionary on first use
     * @return Dictionary the shared instance
     */
    public static Dictionary initial(Configuration cfg) {
        // Fast path: already created.
        if (singleton != null) {
            return singleton;
        }
        synchronized (Dictionary.class) {
            // Re-check under the lock; another thread may have created it meanwhile.
            if (singleton == null) {
                singleton = new Dictionary(cfg);
            }
            return singleton;
        }
    }

    /**
     * Returns the dictionary singleton.
     *
     * @return Dictionary the shared instance
     * @throws IllegalStateException if {@link #initial(Configuration)} has not been called yet
     */
    public static Dictionary getSingleton() {
        Dictionary instance = singleton;
        if (instance == null) {
            // The previous message was unreadable; state clearly what the caller must do.
            throw new IllegalStateException(
                    "Dictionary has not been initialized; call initial(Configuration) first");
        }
        return instance;
    }

    /**
     * Connects to the ZooKeeper ensemble named by the {@code zkHost} system
     * property (5 second session timeout, this object as default watcher),
     * makes sure the dictionary nodes exist and registers a watch on each of
     * the four ".add" / ".del" nodes.
     * Failures are logged and otherwise ignored (best effort), as before.
     */
    private void initZk() {
        String zkhost = System.getProperty("zkHost");
        try {
            this.zk = new ZooKeeper(zkhost, 5 * 1000, this);

            this.ensureZkNode(ZK_DIC_ROOT, false);
            this.ensureZkNode(ZK_DIC_EXT + ".add", true);
            this.ensureZkNode(ZK_DIC_EXT + ".del", true);
            this.ensureZkNode(ZK_DIC_STOP + ".add", true);
            this.ensureZkNode(ZK_DIC_STOP + ".del", true);
        } catch (IOException e) {
            logger.error("Failed to connect to ZooKeeper at " + zkhost, e);
        } catch (KeeperException e) {
            logger.error("Failed to initialise dictionary nodes in ZooKeeper", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while initialising dictionary nodes in ZooKeeper", e);
        }
    }

    /**
     * Creates the persistent node {@code path} if it is missing and optionally
     * registers the default watcher on it.
     *
     * @param path  absolute ZooKeeper path
     * @param watch whether to leave a watch on the node
     */
    private void ensureZkNode(String path, boolean watch) throws KeeperException, InterruptedException {
        if (zk.exists(path, false) == null) {
            try {
                zk.create(path, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
            } catch (KeeperException.NodeExistsException e) {
                // Another client created the node between exists() and create(); that is fine.
                logger.debug("ZooKeeper node already exists: " + path);
            }
        }
        if (watch) {
            // Register the watch only after the node exists. A watch set on a
            // missing node would be used up by our own create() and no watch
            // would remain for later data changes.
            zk.exists(path, true);
        }
    }

    /**
     * Publishes words to be added to the main dictionary.
     * The non-null words are written, one per line, as the data of the
     * ZooKeeper node {@code ZK_DIC_EXT + ".add"}; the in-memory dictionary is
     * not touched here.
     * NOTE(review): bytes are produced with the platform default charset.
     *
     * @param words Collection<String> words to add; {@code null} does nothing
     * @throws InterruptedException if the ZooKeeper call is interrupted
     * @throws KeeperException if ZooKeeper rejects the update
     */
    public void addWordsToMainDict(Collection<String> words) throws KeeperException, InterruptedException {
        if (words == null) {
            return;
        }
        StringBuilder payload = new StringBuilder();
        for (String entry : words) {
            if (entry == null) {
                continue;
            }
            payload.append(entry).append("\n");
        }
        byte[] data = payload.toString().getBytes();
        this.zk.setData(ZK_DIC_EXT + ".add", data, -1);
    }

    /**
     * Publishes words to be disabled in the main dictionary.
     * The non-null words are written, one per line, as the data of the
     * ZooKeeper node {@code ZK_DIC_EXT + ".del"}, in a single update. This is
     * the same format addWordsToMainDict uses for the ".add" node.
     * NOTE(review): bytes are produced with the platform default charset.
     *
     * @param words words to disable; {@code null} does nothing
     * @throws InterruptedException if the ZooKeeper call is interrupted
     * @throws KeeperException if ZooKeeper rejects the update
     */
    public void disableWordsFromMainDict(Collection<String> words) throws KeeperException, InterruptedException {
        if (words != null) {
            StringBuilder buf = new StringBuilder();
            for (String word : words) {
                if (word != null) {
                    // Collect the word; previously nothing was appended, so the
                    // node only ever received empty data.
                    buf.append(word).append("\n");
                }
            }
            // One write for the whole batch instead of one (empty) write per word.
            this.zk.setData(ZK_DIC_EXT + ".del", buf.toString().getBytes(), -1);
        }
    }

    /**
     * Publishes words to be added to the stop-word dictionary.
     * The non-null words are written, one per line, as the data of the
     * ZooKeeper node {@code ZK_DIC_STOP + ".add"}; the in-memory dictionary is
     * not touched here.
     * NOTE(review): bytes are produced with the platform default charset.
     *
     * @param words Collection<String> words to add; {@code null} does nothing
     * @throws InterruptedException if the ZooKeeper call is interrupted
     * @throws KeeperException if ZooKeeper rejects the update
     */
    public void addWordsToStopDict(Collection<String> words) throws KeeperException, InterruptedException {
        if (words == null) {
            return;
        }
        StringBuilder payload = new StringBuilder();
        for (String entry : words) {
            if (entry == null) {
                continue;
            }
            payload.append(entry).append("\n");
        }
        byte[] data = payload.toString().getBytes();
        this.zk.setData(ZK_DIC_STOP + ".add", data, -1);
    }

    /**
     * Publishes words to be disabled in the stop-word dictionary.
     * The non-null words are written, one per line, as the data of the
     * ZooKeeper node {@code ZK_DIC_STOP + ".del"}, in a single update. This is
     * the same format addWordsToStopDict uses for the ".add" node.
     * NOTE(review): bytes are produced with the platform default charset.
     *
     * @param words words to disable; {@code null} does nothing
     * @throws InterruptedException if the ZooKeeper call is interrupted
     * @throws KeeperException if ZooKeeper rejects the update
     */
    public void disableWordsFromStopDict(Collection<String> words) throws KeeperException, InterruptedException {
        if (words != null) {
            StringBuilder buf = new StringBuilder();
            for (String word : words) {
                if (word != null) {
                    // Collect the word; previously nothing was appended, so the
                    // node only ever received empty data.
                    buf.append(word).append("\n");
                }
            }
            // One write for the whole batch instead of one (empty) write per word.
            this.zk.setData(ZK_DIC_STOP + ".del", buf.toString().getBytes(), -1);
        }
    }

    /**
     * Matches a whole character array against the main dictionary of the singleton.
     *
     * @param charArray characters to look up
     * @return Hit the match result
     */
    public Hit matchInMainDict(char[] charArray) {
        DictSegment mainDict = singleton._MainDict;
        return mainDict.match(charArray);
    }

    /**
     * Matches a slice of a character array against the main dictionary of the singleton.
     *
     * @param charArray characters to look up
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return Hit the match result
     */
    public Hit matchInMainDict(char[] charArray, int begin, int length) {
        DictSegment mainDict = singleton._MainDict;
        return mainDict.match(charArray, begin, length);
    }

    /**
     * Matches a slice of a character array against the quantifier dictionary of the singleton.
     *
     * @param charArray characters to look up
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return Hit the match result
     */
    public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
        DictSegment quantifierDict = singleton._QuantifierDict;
        return quantifierDict.match(charArray, begin, length);
    }

    /**
     * Continues a previous match: takes the DictSegment stored in
     * {@code matchedHit} and matches one more character, at
     * {@code currentIndex}, against it.
     *
     * @param charArray    characters being analysed
     * @param currentIndex index of the character to match next
     * @param matchedHit   result of the previous match; passed on to the segment
     * @return Hit the match result
     */
    public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
        return matchedHit.getMatchedDictSegment().match(charArray, currentIndex, 1, matchedHit);
    }

    /**
     * Tells whether a slice of a character array is a stop word.
     *
     * @param charArray characters to look up
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return boolean {@code true} if the slice matches an entry of the
     *         stop-word dictionary; {@code false} otherwise, including when no
     *         stop-word dictionary has been created
     */
    public boolean isStopWord(char[] charArray, int begin, int length) {
        DictSegment stopWords = singleton._StopWordDict;
        // The stop-word segment may never have been created (its loader is not
        // invoked by the constructor); treat that as "no stop words" instead of
        // failing with a NullPointerException.
        if (stopWords == null) {
            return false;
        }
        return stopWords.match(charArray, begin, length).isMatch();
    }

    /**
     * Loads the main dictionary from the classpath resource named by
     * {@code cfg.getMainDictionary()}. Each non-blank line is trimmed,
     * lower-cased and added to a fresh main dictionary segment.
     * Extension dictionaries are not loaded here (the call to loadExtDict() is disabled).
     *
     * @throws RuntimeException if the resource cannot be found
     */
    private void loadMainDict() {
        // Start from an empty root segment.
        _MainDict = new DictSegment((char) 0);

        ClassLoader loader = this.getClass().getClassLoader();
        InputStream is = loader.getResourceAsStream(cfg.getMainDictionary());
        if (is == null) {
            throw new RuntimeException("Main Dictionary not found!!!");
        }

        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
            String line;
            while ((line = br.readLine()) != null) {
                String word = line.trim();
                if (word.length() > 0) {
                    _MainDict.fillSegment(word.toLowerCase().toCharArray());
                }
            }
        } catch (IOException ioe) {
            System.err.println("Main Dictionary loading exception.");
            ioe.printStackTrace();
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Loads the extension dictionaries listed by {@code cfg.getExtDictionarys()}
     * into the main dictionary. Each entry is a classpath resource; resources
     * that cannot be found are skipped. Non-blank lines are trimmed,
     * lower-cased and added to the main dictionary segment.
     */
    private void loadExtDict() {
        List<String> extDictFiles = cfg.getExtDictionarys();
        if (extDictFiles == null) {
            return;
        }
        ClassLoader loader = this.getClass().getClassLoader();
        for (String extDictName : extDictFiles) {
            // Announce which extension dictionary is being read.
            System.out.println("?" + extDictName);
            InputStream is = loader.getResourceAsStream(extDictName);
            if (is == null) {
                // Resource not on the classpath: ignore it and go on.
                continue;
            }
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
                String line;
                while ((line = br.readLine()) != null) {
                    String word = line.trim();
                    if (word.length() > 0) {
                        _MainDict.fillSegment(word.toLowerCase().toCharArray());
                    }
                }
            } catch (IOException ioe) {
                System.err.println("Extension Dictionary loading exception.");
                ioe.printStackTrace();
            } finally {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Creates a fresh stop-word dictionary segment and fills it from the
     * extension stop-word dictionaries listed by
     * {@code cfg.getExtStopWordDictionarys()}. Each entry is a classpath
     * resource; resources that cannot be found are skipped. Non-blank lines
     * are trimmed, lower-cased and added to the segment.
     */
    private void loadStopWordDict() {
        // Start from an empty root segment.
        _StopWordDict = new DictSegment((char) 0);

        List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys();
        if (extStopWordDictFiles == null) {
            return;
        }
        ClassLoader loader = this.getClass().getClassLoader();
        for (String extStopWordDictName : extStopWordDictFiles) {
            // Announce which stop-word dictionary is being read.
            System.out.println("??" + extStopWordDictName);
            InputStream is = loader.getResourceAsStream(extStopWordDictName);
            if (is == null) {
                // Resource not on the classpath: ignore it and go on.
                continue;
            }
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
                String line;
                while ((line = br.readLine()) != null) {
                    String word = line.trim();
                    if (word.length() > 0) {
                        _StopWordDict.fillSegment(word.toLowerCase().toCharArray());
                    }
                }
            } catch (IOException ioe) {
                System.err.println("Extension Stop word Dictionary loading exception.");
                ioe.printStackTrace();
            } finally {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Loads the quantifier dictionary from the classpath resource named by
     * {@code cfg.getQuantifierDicionary()}. Each non-blank line is trimmed,
     * lower-cased and added to a fresh quantifier dictionary segment.
     *
     * @throws RuntimeException if the resource cannot be found
     */
    private void loadQuantifierDict() {
        // Start from an empty root segment.
        _QuantifierDict = new DictSegment((char) 0);

        ClassLoader loader = this.getClass().getClassLoader();
        InputStream is = loader.getResourceAsStream(cfg.getQuantifierDicionary());
        if (is == null) {
            throw new RuntimeException("Quantifier Dictionary not found!!!");
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                String word = line.trim();
                if (word.length() > 0) {
                    _QuantifierDict.fillSegment(word.toLowerCase().toCharArray());
                }
            }
        } catch (IOException ioe) {
            System.err.println("Quantifier Dictionary loading exception.");
            ioe.printStackTrace();
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Applies a batch of words to a dictionary segment.
     * Every word is trimmed and lower-cased first. Null and blank words are
     * skipped, as the file-based loaders do.
     *
     * @param words  words to apply; {@code null} does nothing
     * @param update true: add the words to the segment; false: disable them
     * @param dict   segment to modify; if {@code null} the batch is logged and dropped
     */
    private void updateWordsToDict(Collection<String> words, boolean update, DictSegment dict) {
        if (words == null) {
            return;
        }
        logger.info("Words : " + words.toString());
        if (dict == null) {
            // The target segment was never created; dropping the batch is
            // better than failing with a NullPointerException.
            logger.warn("Target dictionary is not initialised; words ignored");
            return;
        }
        for (String word : words) {
            if (word == null) {
                continue;
            }
            String normalized = word.trim().toLowerCase();
            if (normalized.length() == 0) {
                // Blank entries (e.g. from empty node data) are not words.
                continue;
            }
            if (update) {
                dict.fillSegment(normalized.toCharArray());
            } else {
                dict.disableSegment(normalized.toCharArray());
            }
        }
    }

    /**
     * Reads the data of a ZooKeeper node and returns the words it contains.
     * The data is split on line feeds; each entry is trimmed and blank entries
     * are dropped. The read is issued with {@code watch = true}.
     * Read failures are logged and yield an empty collection.
     * NOTE(review): bytes are decoded with the platform default charset.
     *
     * @param path absolute path of the node to read
     * @return the distinct words found in the node data; never {@code null}
     */
    private Collection<String> getDataFromZkFile(String path) {
        Collection<String> set = new HashSet<String>();
        byte[] buf = null;
        try {
            buf = this.zk.getData(path, true, null);
        } catch (KeeperException e) {
            logger.error("Failed to read dictionary data from ZooKeeper node " + path, e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while reading ZooKeeper node " + path, e);
        }
        if (buf != null) {
            String[] words = new String(buf).split("\n");
            for (int i = 0; i < words.length; i++) {
                String word = words[i].trim();
                // Empty node data splits into a single empty string; that is not a word.
                if (word.length() > 0) {
                    set.add(word);
                }
            }
        }
        return set;
    }

    /**
     * Handles ZooKeeper watch events. Only data changes on the four dictionary
     * nodes are acted upon:
     * ext ".add" / ".del" add to / disable in the main dictionary,
     * stop ".add" / ".del" add to / disable in the stop-word dictionary.
     * All other events are only logged at debug level.
     *
     * @see org.apache.zookeeper.Watcher#process(org.apache.zookeeper.WatchedEvent)
     */
    public void process(WatchedEvent event) {
        logger.debug("path: " + event.getPath() + "   EventType: " + event.getType());
        if (event.getType() != Watcher.Event.EventType.NodeDataChanged || event.getPath() == null) {
            return;
        }
        String path = event.getPath();
        logger.debug("path: " + path);

        // Use this instance's segments, not the static singleton: events can
        // arrive while the constructor is still running, when the static
        // reference is still null.
        DictSegment target;
        boolean add;
        if (path.equals(ZK_DIC_EXT + ".add")) {
            target = this._MainDict;
            add = true;
        } else if (path.equals(ZK_DIC_EXT + ".del")) {
            target = this._MainDict;
            add = false;
        } else if (path.equals(ZK_DIC_STOP + ".add")) {
            target = this._StopWordDict;
            add = true;
        } else if (path.equals(ZK_DIC_STOP + ".del")) {
            target = this._StopWordDict;
            // Deleting must disable the words; this branch used to add them.
            add = false;
        } else {
            return;
        }

        // Always read the node, even if the update is dropped below: the read
        // is issued with watch = true.
        Collection<String> words = this.getDataFromZkFile(path);
        if (target == null) {
            logger.warn("Dictionary for " + path + " is not initialised; update ignored");
            return;
        }
        this.updateWordsToDict(words, add, target);
    }

}