cc.pp.analyzer.paoding.knife.FileDictionaries.java Source code

Java tutorial

Introduction

Here is the source code for cc.pp.analyzer.paoding.knife.FileDictionaries.java

Source

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cc.pp.analyzer.paoding.knife;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import cc.pp.analyzer.paoding.dictionary.BinaryDictionary;
import cc.pp.analyzer.paoding.dictionary.Dictionary;
import cc.pp.analyzer.paoding.dictionary.HashBinaryDictionary;
import cc.pp.analyzer.paoding.dictionary.Hit;
import cc.pp.analyzer.paoding.dictionary.Word;
import cc.pp.analyzer.paoding.dictionary.support.detection.Detector;
import cc.pp.analyzer.paoding.dictionary.support.detection.DifferenceListener;
import cc.pp.analyzer.paoding.dictionary.support.detection.ExtensionFileFilter;
import cc.pp.analyzer.paoding.dictionary.support.filewords.FileWordsReader;
import cc.pp.analyzer.paoding.exception.PaodingAnalysisException;

/**
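 * File-backed dictionary cache used by {@link CJKKnife}.<br>
 * Supplies the dictionaries needed for analysis: the vocabulary, family names,
 * units of measure, combinatorics (latin followed by CJK), and noise words/characters.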
 * <p>
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @see CJKKnife
 *
 * @since 1.0
 */
public class FileDictionaries implements Dictionaries {

    // -------------------------------------------------

    /** Logger named after the runtime class of this instance. */
    protected Log log = LogFactory.getLog(this.getClass());

    // -------------------------------------------------
    // Lazily built dictionaries. Each is created on first access by its
    // getter and reset to null by refreshDicWords when its file changes.

    /**
     * Vocabulary dictionary, built from every loaded dictionary whose name is
     * not skipped by {@link #isSkipForVacabulary(String)}.
     */
    protected Dictionary vocabularyDictionary;

    /**
     * Combinatorics dictionary (latin followed by CJK), built from the file
     * named by {@link #combinatorics}.
     */
    protected Dictionary combinatoricsDictionary;

    /**
     * Family-name dictionary, built from the file named by
     * {@link #confucianFamilyName}.
     */
    protected Dictionary confucianFamilyNamesDictionary;

    /**
     * Noise-character dictionary, built from the file named by
     * {@link #noiseCharactor}.
     */
    protected Dictionary noiseCharactorsDictionary;

    /**
     * Noise-word dictionary, built from the file named by
     * {@link #noiseWord}.
     */
    protected Dictionary noiseWordsDictionary;

    /**
     * Units dictionary, built from the file named by {@link #unit}.
     */
    protected Dictionary unitsDictionary;

    // -------------------------------------------------

    /**
     * Words of every dictionary found under {@link #dicHome}, keyed by
     * dictionary name; null until loadAllWordsIfNecessary has run.
     */
    @SuppressWarnings("rawtypes")
    protected Map/* <String, Set<String>> */ allWords;

    /** Directory the dictionary files are read from. */
    protected String dicHome;
    /** Dictionaries whose name (or a path segment of it) starts with this prefix are kept out of the vocabulary. */
    protected String skipPrefix;
    /** Name, relative to dicHome and without ".dic", of the noise-character dictionary. */
    protected String noiseCharactor;
    /** Name, relative to dicHome and without ".dic", of the noise-word dictionary. */
    protected String noiseWord;
    /** Name, relative to dicHome and without ".dic", of the units dictionary. */
    protected String unit;
    /** Name, relative to dicHome and without ".dic", of the family-name dictionary. */
    protected String confucianFamilyName;
    /** Name, relative to dicHome and without ".dic", of the combinatorics dictionary. */
    protected String combinatorics;
    /** Charset name handed to FileWordsReader when reading dictionary files. */
    protected String charsetName;

    // ----------------------

    /**
     * Creates an unconfigured instance; every setting is left null and must be
     * supplied through the setters before the dictionaries are requested.
     */
    public FileDictionaries() {
        // nothing to initialise
    }

    /**
     * Creates a fully configured instance.
     *
     * @param dicHome             directory the dictionary files are read from
     * @param skipPrefix          name prefix of dictionaries excluded from the vocabulary
     * @param noiseCharactor      name of the noise-character dictionary
     * @param noiseWord           name of the noise-word dictionary
     * @param unit                name of the units dictionary
     * @param confucianFamilyName name of the family-name dictionary
     * @param combinatorics       name of the combinatorics dictionary
     * @param charsetName         charset used to read the dictionary files
     */
    public FileDictionaries(String dicHome, String skipPrefix, String noiseCharactor, String noiseWord, String unit,
            String confucianFamilyName, String combinatorics, String charsetName) {
        // where and how to read
        this.dicHome = dicHome;
        this.charsetName = charsetName;
        this.skipPrefix = skipPrefix;

        // names of the special-purpose dictionaries
        this.noiseWord = noiseWord;
        this.noiseCharactor = noiseCharactor;
        this.unit = unit;
        this.confucianFamilyName = confucianFamilyName;
        this.combinatorics = combinatorics;
    }

    /** @return directory the dictionary files are read from */
    public String getDicHome() {
        return this.dicHome;
    }

    /** @param dicHome directory the dictionary files are read from */
    public void setDicHome(String dicHome) {
        this.dicHome = dicHome;
    }

    /** @return name prefix of dictionaries excluded from the vocabulary */
    public String getSkipPrefix() {
        return this.skipPrefix;
    }

    /** @param skipPrefix name prefix of dictionaries excluded from the vocabulary */
    public void setSkipPrefix(String skipPrefix) {
        this.skipPrefix = skipPrefix;
    }

    /** @return name of the noise-character dictionary */
    public String getNoiseCharactor() {
        return this.noiseCharactor;
    }

    /** @param noiseCharactor name of the noise-character dictionary */
    public void setNoiseCharactor(String noiseCharactor) {
        this.noiseCharactor = noiseCharactor;
    }

    /** @return name of the noise-word dictionary */
    public String getNoiseWord() {
        return this.noiseWord;
    }

    /** @param noiseWord name of the noise-word dictionary */
    public void setNoiseWord(String noiseWord) {
        this.noiseWord = noiseWord;
    }

    /** @return name of the units dictionary */
    public String getUnit() {
        return this.unit;
    }

    /** @param unit name of the units dictionary */
    public void setUnit(String unit) {
        this.unit = unit;
    }

    /** @return name of the family-name dictionary */
    public String getConfucianFamilyName() {
        return this.confucianFamilyName;
    }

    /** @param confucianFamilyName name of the family-name dictionary */
    public void setConfucianFamilyName(String confucianFamilyName) {
        this.confucianFamilyName = confucianFamilyName;
    }

    /** @return charset name used to read the dictionary files */
    public String getCharsetName() {
        return this.charsetName;
    }

    /** @param charsetName charset name used to read the dictionary files */
    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    /**
     * Sets the name of the combinatorics dictionary (stored in the
     * {@code combinatorics} field).
     *
     * @param lantinFllowedByCjk name of the combinatorics dictionary
     */
    public void setLantinFllowedByCjk(String lantinFllowedByCjk) {
        this.combinatorics = lantinFllowedByCjk;
    }

    /** @return name of the combinatorics dictionary (the {@code combinatorics} field) */
    public String getLantinFllowedByCjk() {
        return this.combinatorics;
    }

    // -------------------------------------------------

    /**
     * Returns the vocabulary dictionary, building and caching it on first use.
     * <p>
     * The dictionary is built from {@link #getVocabularyWords()}; afterwards
     * every entry of the noise-word and noise-character dictionaries that is
     * also present in the vocabulary is flagged on the vocabulary's own
     * {@code Word} instance.
     * <p>
     * The new dictionary is published to the cache field only after all flags
     * have been applied. If loading either noise dictionary fails, nothing is
     * cached and the next call retries, rather than returning a vocabulary
     * that is missing its noise flags.
     *
     * @return the cached vocabulary dictionary
     */
    @Override
    public synchronized Dictionary getVocabularyDictionary() {
        if (vocabularyDictionary == null) {
            // Capacity 0x2fff with load factor 0.75: per the original author's note this
            // is sized for roughly 5639 distinct leading characters — TODO confirm.
            Dictionary vocabulary = new HashBinaryDictionary(getVocabularyWords(), 0x2fff, 0.75f);

            Dictionary noiseWordsDic = getNoiseWordsDictionary();
            for (int i = 0; i < noiseWordsDic.size(); i++) {
                Hit hit = vocabulary.search(noiseWordsDic.get(i), 0, noiseWordsDic.get(i).length());
                if (hit.isHit()) {
                    hit.getWord().setNoiseWord();
                }
            }

            Dictionary noiseCharactorsDic = getNoiseCharactorsDictionary();
            for (int i = 0; i < noiseCharactorsDic.size(); i++) {
                Hit hit = vocabulary.search(noiseCharactorsDic.get(i), 0,
                        noiseCharactorsDic.get(i).length());
                if (hit.isHit()) {
                    hit.getWord().setNoiseCharactor();
                }
            }

            // Publish only the fully flagged dictionary.
            vocabularyDictionary = vocabulary;
        }
        return vocabularyDictionary;
    }

    /**
     * Returns the family-name dictionary, creating a {@link BinaryDictionary}
     * from {@link #getConfucianFamilyNames()} on first use and caching it.
     *
     * @return the cached family-name dictionary
     */
    @Override
    public synchronized Dictionary getConfucianFamilyNamesDictionary() {
        if (confucianFamilyNamesDictionary != null) {
            return confucianFamilyNamesDictionary;
        }
        confucianFamilyNamesDictionary = new BinaryDictionary(getConfucianFamilyNames());
        return confucianFamilyNamesDictionary;
    }

    /**
     * Returns the noise-character dictionary, creating a
     * {@link HashBinaryDictionary} (capacity 256, load factor 0.75) from
     * {@link #getNoiseCharactors()} on first use and caching it.
     *
     * @return the cached noise-character dictionary
     */
    @Override
    public synchronized Dictionary getNoiseCharactorsDictionary() {
        if (noiseCharactorsDictionary != null) {
            return noiseCharactorsDictionary;
        }
        noiseCharactorsDictionary = new HashBinaryDictionary(getNoiseCharactors(), 256, 0.75f);
        return noiseCharactorsDictionary;
    }

    /**
     * Returns the noise-word dictionary, creating a {@link BinaryDictionary}
     * from {@link #getNoiseWords()} on first use and caching it.
     *
     * @return the cached noise-word dictionary
     */
    @Override
    public synchronized Dictionary getNoiseWordsDictionary() {
        if (noiseWordsDictionary != null) {
            return noiseWordsDictionary;
        }
        noiseWordsDictionary = new BinaryDictionary(getNoiseWords());
        return noiseWordsDictionary;
    }

    /**
     * Returns the units dictionary, creating a {@link HashBinaryDictionary}
     * (capacity 1024, load factor 0.75) from {@link #getUnits()} on first use
     * and caching it.
     *
     * @return the cached units dictionary
     */
    @Override
    public synchronized Dictionary getUnitsDictionary() {
        if (unitsDictionary != null) {
            return unitsDictionary;
        }
        unitsDictionary = new HashBinaryDictionary(getUnits(), 1024, 0.75f);
        return unitsDictionary;
    }

    /**
     * Returns the combinatorics dictionary, creating a {@link BinaryDictionary}
     * from {@link #getCombinatoricsWords()} on first use and caching it.
     *
     * @return the cached combinatorics dictionary
     */
    @Override
    public synchronized Dictionary getCombinatoricsDictionary() {
        if (combinatoricsDictionary != null) {
            return combinatoricsDictionary;
        }
        combinatoricsDictionary = new BinaryDictionary(getCombinatoricsWords());
        return combinatoricsDictionary;
    }

    /** The running change detector, or null while detection is stopped. */
    private Detector detector;

    /**
     * Starts watching {@code dicHome} for changes to ".dic" files.
     * <p>
     * The call is ignored when a detector is already running or when
     * {@code interval} is negative. The detector is stored in the field only
     * after it has been started.
     *
     * @param interval polling interval handed to the detector (unit defined by
     *                 {@code Detector}); negative values disable detection
     * @param l        listener handed to the detector
     */
    @Override
    public synchronized void startDetecting(int interval, DifferenceListener l) {
        boolean idle = this.detector == null;
        if (idle && interval >= 0) {
            Detector watcher = new Detector();
            watcher.setHome(dicHome);
            watcher.setFilter(new ExtensionFileFilter(".dic"));
            // take the initial snapshot using the home and filter configured above
            watcher.setLastSnapshot(watcher.flash());
            watcher.setListener(l);
            watcher.setInterval(interval);
            watcher.start(true); // NOTE(review): flag presumably means "daemon" — confirm in Detector
            this.detector = watcher;
        }
    }

    /**
     * Stops the running change detector, if any, and forgets it so that
     * {@link #startDetecting(int, DifferenceListener)} can start a new one.
     */
    @Override
    public synchronized void stopDetecting() {
        if (detector != null) {
            detector.setStop();
            detector = null;
        }
    }

    /**
     * Reloads one dictionary file and drops every cached dictionary that was
     * built from it, so that it is rebuilt on next access.
     * <p>
     * If the word map has already been loaded, the file's words replace the
     * entry for that dictionary. A missing file, or a read that yields no
     * dictionary at all, removes the entry instead. The cached dictionaries
     * are invalidated whether or not the word map was loaded, because the
     * special-purpose dictionaries are read straight from their files and may
     * be cached without it.
     * <p>
     * Paths that do not contain ".dic" are logged and ignored.
     *
     * @param dicPath path of the changed dictionary file, containing ".dic";
     *                it is appended to {@code dicHome} as-is
     * @throws RuntimeException wrapping any {@link IOException} other than
     *                          {@link FileNotFoundException}
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    protected synchronized void refreshDicWords(String dicPath) {
        int index = dicPath.lastIndexOf(".dic");
        if (index < 0) {
            // Previously substring(0, -1) threw StringIndexOutOfBoundsException here.
            log.warn("ignoring refresh of a file that is not a dictionary: " + dicPath);
            return;
        }
        String dicName = dicPath.substring(0, index);

        if (allWords != null) {
            try {
                // NOTE(review): no separator is inserted between dicHome and dicPath here,
                // unlike getDictionaryWords — presumably dicPath starts with '/'; confirm with the caller.
                Map/* <String, Set<String>> */ temp = FileWordsReader.readWords(dicHome + dicPath, charsetName);
                if (temp == null || temp.isEmpty()) {
                    // nothing was read: treat like a removed dictionary instead of
                    // failing with NoSuchElementException
                    allWords.remove(dicName);
                } else {
                    allWords.put(dicName, temp.values().iterator().next());
                }
            } catch (FileNotFoundException e) {
                // the dictionary file was deleted
                allWords.remove(dicName);
            } catch (IOException e) {
                throw toRuntimeException(e);
            }
        }

        if (!isSkipForVacabulary(dicName)) {
            this.vocabularyDictionary = null;
        }
        if (isNoiseWordDicFile(dicName)) {
            this.noiseWordsDictionary = null;
            // noise flags live on the vocabulary's words, so it must be rebuilt too
            this.vocabularyDictionary = null;
        } else if (isNoiseCharactorDicFile(dicName)) {
            this.noiseCharactorsDictionary = null;
            // noise flags live on the vocabulary's words, so it must be rebuilt too
            this.vocabularyDictionary = null;
        } else if (isUnitDicFile(dicName)) {
            this.unitsDictionary = null;
        } else if (isConfucianFamilyNameDicFile(dicName)) {
            this.confucianFamilyNamesDictionary = null;
        } else if (isLantinFollowedByCjkDicFile(dicName)) {
            this.combinatoricsDictionary = null;
        }
    }

    // ---------------------------------------------------------------
    // Word loading helpers - intended for package and subclass use only

    /**
     * Collects the words of every loaded dictionary that is not excluded by
     * {@link #isSkipForVacabulary(String)} into one sorted, duplicate-free
     * array.
     * <p>
     * When every dictionary is excluded the result is an empty array.
     *
     * @return the merged vocabulary words in their natural order; never null
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    protected Word[] getVocabularyWords() {
        // The map is declared with String values but actually holds Set<Word>;
        // go through a raw cast to view it with its real element type.
        Map<String, Set<Word>> dics = (Map) loadAllWordsIfNecessary();
        // Start from an empty set so that "nothing selected" yields an empty
        // array instead of a NullPointerException.
        Set<Word> merged = new HashSet<Word>();
        for (Map.Entry<String, Set<Word>> entry : dics.entrySet()) {
            if (isSkipForVacabulary(entry.getKey())) {
                continue;
            }
            merged.addAll(entry.getValue());
        }
        Word[] words = merged.toArray(new Word[merged.size()]);
        Arrays.sort(words);
        return words;
    }

    /** @return sorted words of the dictionary named by {@code confucianFamilyName} */
    protected Word[] getConfucianFamilyNames() {
        return getDictionaryWords(this.confucianFamilyName);
    }

    /** @return sorted words of the dictionary named by {@code noiseWord} */
    protected Word[] getNoiseWords() {
        return getDictionaryWords(this.noiseWord);
    }

    /** @return sorted words of the dictionary named by {@code noiseCharactor} */
    protected Word[] getNoiseCharactors() {
        return getDictionaryWords(this.noiseCharactor);
    }

    /** @return sorted words of the dictionary named by {@code unit} */
    protected Word[] getUnits() {
        return getDictionaryWords(this.unit);
    }

    /** @return sorted words of the dictionary named by {@code combinatorics} */
    protected Word[] getCombinatoricsWords() {
        return getDictionaryWords(this.combinatorics);
    }

    /**
     * Reads a single dictionary file and returns its words sorted in their
     * natural order.
     * <p>
     * The file read is {@code dicHome + "/" + dicNameRelativeDicHome + ".dic"},
     * and its words are looked up in the reader's result under
     * {@code dicNameRelativeDicHome}.
     *
     * @param dicNameRelativeDicHome dictionary name relative to {@code dicHome},
     *                               without the ".dic" extension
     * @return the sorted words of that dictionary
     * @throws RuntimeException wrapping an {@link IOException} raised while
     *                          reading, or reporting that the reader returned
     *                          no entry for the requested dictionary
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    protected Word[] getDictionaryWords(String dicNameRelativeDicHome) {
        String path = dicHome + "/" + dicNameRelativeDicHome + ".dic";
        Map dics;
        try {
            dics = FileWordsReader.readWords(path, charsetName);
        } catch (IOException e) {
            throw toRuntimeException(e);
        }
        Set<Word> set = dics == null ? null : (Set<Word>) dics.get(dicNameRelativeDicHome);
        if (set == null) {
            // Fail with a message naming the dictionary instead of an anonymous NullPointerException.
            throw new PaodingAnalysisException(
                    "No words were loaded for dictionary '" + dicNameRelativeDicHome + "' from " + path);
        }
        Word[] words = set.toArray(new Word[set.size()]);
        Arrays.sort(words);
        return words;
    }

    // -------------------------------------

    /**
     * Loads the words of every dictionary under {@code dicHome} on first use
     * and returns the cached map afterwards.
     * <p>
     * The map is keyed by dictionary name; per the original note, a file
     * {@code division/china.dic} under the dictionary home is keyed
     * {@code "division/china"}.
     * <p>
     * The result is cached only after it has been found non-empty. A failed
     * load therefore leaves nothing cached, and the next call tries again
     * instead of silently returning an empty map.
     *
     * @return the cached map of dictionary name to words
     * @throws PaodingAnalysisException if no dictionary was found under
     *                                  {@code dicHome}
     * @throws RuntimeException         wrapping any {@link IOException} raised
     *                                  while reading
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    protected synchronized Map<String, Set<String>> loadAllWordsIfNecessary() {
        if (allWords == null) {
            try {
                log.info("loading dictionaries from " + dicHome);
                Map loaded = FileWordsReader.readWords(dicHome, charsetName);
                if (loaded == null || loaded.isEmpty()) {
                    String message = "Not found any dictionary files, have you set the 'paoding.dic.home' right? ("
                            + this.dicHome + ")";
                    log.error(message);
                    throw new PaodingAnalysisException(message);
                }
                // publish only a validated result
                allWords = loaded;
                log.info("loaded success!");
            } catch (IOException e) {
                throw toRuntimeException(e);
            }
        }
        return allWords;
    }

    // ---------------------------------------

    /**
     * Tells whether a dictionary is kept out of the vocabulary: its name
     * starts with {@code skipPrefix}, or contains {@code "/" + skipPrefix}
     * (that is, a later path segment starts with the prefix).
     *
     * @param dicNameRelativeDicHome dictionary name relative to {@code dicHome}
     * @return true if the dictionary must not contribute to the vocabulary
     */
    protected final boolean isSkipForVacabulary(String dicNameRelativeDicHome) {
        if (dicNameRelativeDicHome.startsWith(skipPrefix)) {
            return true;
        }
        return dicNameRelativeDicHome.contains("/" + skipPrefix);
    }

    /** @return true if {@code dicName} equals the configured units dictionary name */
    protected boolean isUnitDicFile(String dicName) {
        return dicName.equals(unit);
    }

    /** @return true if {@code dicName} equals the configured noise-character dictionary name */
    protected boolean isNoiseCharactorDicFile(String dicName) {
        return dicName.equals(noiseCharactor);
    }

    /** @return true if {@code dicName} equals the configured noise-word dictionary name */
    protected boolean isNoiseWordDicFile(String dicName) {
        return dicName.equals(noiseWord);
    }

    /** @return true if {@code dicName} equals the configured family-name dictionary name */
    protected boolean isConfucianFamilyNameDicFile(String dicName) {
        return dicName.equals(confucianFamilyName);
    }

    /** @return true if {@code dicName} equals the configured combinatorics dictionary name */
    protected boolean isLantinFollowedByCjkDicFile(String dicName) {
        return dicName.equals(combinatorics);
    }

    // --------------------------------------

    /**
     * Wraps an I/O failure in the analyzer's unchecked exception type, keeping
     * the original exception as the constructor argument.
     *
     * @param e the I/O failure to wrap
     * @return a new {@link PaodingAnalysisException} created from {@code e}
     */
    protected RuntimeException toRuntimeException(IOException e) {
        RuntimeException wrapped = new PaodingAnalysisException(e);
        return wrapped;
    }
}