// Java tutorial
/** * IK ? 5.0 * IK Analyzer release 5.0 * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * ??(linliangyi2005@gmail.com)?? * ? 2012 * provided by Linliangyi and copyright 2012 by Oolong studio * * */ package org.wltea.analyzer.dic; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.file.Path; import java.util.*; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.elasticsearch.common.io.PathUtils; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin; import org.wltea.analyzer.cfg.Configuration; /** * ??,??? */ public class Dictionary { /* * ??? 
*/ private static Dictionary singleton; private DictSegment _MainDict; private DictSegment _SurnameDict; private DictSegment _QuantifierDict; private DictSegment _SuffixDict; private DictSegment _PrepDict; private DictSegment _StopWords; /** * ? */ private Configuration configuration; public static ESLogger logger = Loggers.getLogger("ik-analyzer"); private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1); public static final String PATH_DIC_MAIN = "main.dic"; public static final String PATH_DIC_SURNAME = "surname.dic"; public static final String PATH_DIC_QUANTIFIER = "quantifier.dic"; public static final String PATH_DIC_SUFFIX = "suffix.dic"; public static final String PATH_DIC_PREP = "preposition.dic"; public static final String PATH_DIC_STOP = "stopword.dic"; private final static String FILE_NAME = "IKAnalyzer.cfg.xml"; private final static String EXT_DICT = "ext_dict"; private final static String REMOTE_EXT_DICT = "remote_ext_dict"; private final static String EXT_STOP = "ext_stopwords"; private final static String REMOTE_EXT_STOP = "remote_ext_stopwords"; private Path conf_dir; private Properties props; private Dictionary(Configuration cfg) { this.configuration = cfg; this.props = new Properties(); this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME); Path configFile = conf_dir.resolve(FILE_NAME); InputStream input = null; try { logger.info("try load config from {}", configFile); input = new FileInputStream(configFile.toFile()); } catch (FileNotFoundException e) { conf_dir = cfg.getConfigInPluginDir(); configFile = conf_dir.resolve(FILE_NAME); try { logger.info("try load config from {}", configFile); input = new FileInputStream(configFile.toFile()); } catch (FileNotFoundException ex) { // We should report origin exception logger.error("ik-analyzer", e); } } if (input != null) { try { props.loadFromXML(input); } catch (InvalidPropertiesFormatException e) { logger.error("ik-analyzer", e); } catch 
(IOException e) { logger.error("ik-analyzer", e); } } } public String getProperty(String key) { if (props != null) { return props.getProperty(key); } return null; } /** * ?? IK Analyzer?Dictionary???? * ?Dictionary?? ?? ??? * * @return Dictionary */ public static synchronized Dictionary initial(Configuration cfg) { if (singleton == null) { synchronized (Dictionary.class) { if (singleton == null) { singleton = new Dictionary(cfg); singleton.loadMainDict(); singleton.loadSurnameDict(); singleton.loadQuantifierDict(); singleton.loadSuffixDict(); singleton.loadPrepDict(); singleton.loadStopWordDict(); if (cfg.isEnableRemoteDict()) { // for (String location : singleton.getRemoteExtDictionarys()) { // 10 ?? 60 ?? pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS); } for (String location : singleton.getRemoteExtStopWordDictionarys()) { pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS); } } return singleton; } } } return singleton; } public List<String> getExtDictionarys() { List<String> extDictFiles = new ArrayList<String>(2); String extDictCfg = getProperty(EXT_DICT); if (extDictCfg != null) { String[] filePaths = extDictCfg.split(";"); for (String filePath : filePaths) { if (filePath != null && !"".equals(filePath.trim())) { Path file = PathUtils.get(filePath.trim()); extDictFiles.add(file.toString()); } } } return extDictFiles; } public List<String> getRemoteExtDictionarys() { List<String> remoteExtDictFiles = new ArrayList<String>(2); String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT); if (remoteExtDictCfg != null) { String[] filePaths = remoteExtDictCfg.split(";"); for (String filePath : filePaths) { if (filePath != null && !"".equals(filePath.trim())) { remoteExtDictFiles.add(filePath); } } } return remoteExtDictFiles; } public List<String> getExtStopWordDictionarys() { List<String> extStopWordDictFiles = new ArrayList<String>(2); String extStopWordDictCfg = getProperty(EXT_STOP); if (extStopWordDictCfg != null) { 
String[] filePaths = extStopWordDictCfg.split(";"); for (String filePath : filePaths) { if (filePath != null && !"".equals(filePath.trim())) { Path file = PathUtils.get(filePath.trim()); extStopWordDictFiles.add(file.toString()); } } } return extStopWordDictFiles; } public List<String> getRemoteExtStopWordDictionarys() { List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2); String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP); if (remoteExtStopWordDictCfg != null) { String[] filePaths = remoteExtStopWordDictCfg.split(";"); for (String filePath : filePaths) { if (filePath != null && !"".equals(filePath.trim())) { remoteExtStopWordDictFiles.add(filePath); } } } return remoteExtStopWordDictFiles; } public String getDictRoot() { return conf_dir.toAbsolutePath().toString(); } /** * ???? * * @return Dictionary ? */ public static Dictionary getSingleton() { if (singleton == null) { throw new IllegalStateException("??initial"); } return singleton; } /** * ??? * * @param words * Collection<String>?? */ public void addWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // ???? singleton._MainDict.fillSegment(word.trim().toCharArray()); } } } } /** * ???? */ public void disableWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // ???? singleton._MainDict.disableSegment(word.trim().toCharArray()); } } } } /** * ?? * * @return Hit ??? */ public Hit matchInMainDict(char[] charArray) { return singleton._MainDict.match(charArray); } /** * ?? * * @return Hit ??? */ public Hit matchInMainDict(char[] charArray, int begin, int length) { return singleton._MainDict.match(charArray, begin, length); } /** * ???? * * @return Hit ??? */ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) { return singleton._QuantifierDict.match(charArray, begin, length); } /** * ?Hit?DictSegment?? 
* * @return Hit */ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { DictSegment ds = matchedHit.getMatchedDictSegment(); return ds.match(charArray, currentIndex, 1, matchedHit); } /** * ??? * * @return boolean */ public boolean isStopWord(char[] charArray, int begin, int length) { return singleton._StopWords.match(charArray, begin, length).isMatch(); } /** * ??? */ private void loadMainDict() { // ? _MainDict = new DictSegment((char) 0); // ?? Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN); InputStream is = null; try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error(e.getMessage(), e); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _MainDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } // ? this.loadExtDict(); // ? this.loadRemoteExtDict(); } /** * ??? */ private void loadExtDict() { // ?? List<String> extDictFiles = getExtDictionarys(); if (extDictFiles != null) { InputStream is = null; for (String extDictName : extDictFiles) { // ?? logger.info("[Dict Loading] " + extDictName); Path file = PathUtils.get(getDictRoot(), extDictName); try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } // ? if (is == null) { continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { // ??? 
_MainDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } } } } /** * ?? */ private void loadRemoteExtDict() { List<String> remoteExtDictFiles = getRemoteExtDictionarys(); for (String location : remoteExtDictFiles) { logger.info("[Dict Loading] " + location); List<String> lists = getRemoteWords(location); // ? if (lists == null) { logger.error("[Dict Loading] " + location + ""); continue; } for (String theWord : lists) { if (theWord != null && !"".equals(theWord.trim())) { // ??? logger.info(theWord); _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } } } /** * ??? */ private static List<String> getRemoteWords(String location) { List<String> buffer = new ArrayList<String>(); RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000) .setConnectTimeout(10 * 1000).setSocketTimeout(60 * 1000).build(); CloseableHttpClient httpclient = HttpClients.createDefault(); CloseableHttpResponse response; BufferedReader in; HttpGet get = new HttpGet(location); get.setConfig(rc); try { response = httpclient.execute(get); if (response.getStatusLine().getStatusCode() == 200) { String charset = "UTF-8"; // ??utf-8 if (response.getEntity().getContentType().getValue().contains("charset=")) { String contentType = response.getEntity().getContentType().getValue(); charset = contentType.substring(contentType.lastIndexOf("=") + 1); } in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), charset)); String line; while ((line = in.readLine()) != null) { buffer.add(line); } in.close(); response.close(); return buffer; } response.close(); } catch (ClientProtocolException e) { logger.error("getRemoteWords {} error", e, location); } catch (IllegalStateException e) { logger.error("getRemoteWords {} error", e, location); } catch 
(IOException e) { logger.error("getRemoteWords {} error", e, location); } return buffer; } /** * ??? */ private void loadStopWordDict() { // ? _StopWords = new DictSegment((char) 0); // ?? Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP); InputStream is = null; try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error(e.getMessage(), e); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _StopWords.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } // ?? List<String> extStopWordDictFiles = getExtStopWordDictionarys(); if (extStopWordDictFiles != null) { is = null; for (String extStopWordDictName : extStopWordDictFiles) { logger.info("[Dict Loading] " + extStopWordDictName); // ?? file = PathUtils.get(getDictRoot(), extStopWordDictName); try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } // ? if (is == null) { continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { // ??? _StopWords.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } } } // ?? List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys(); for (String location : remoteExtStopWordDictFiles) { logger.info("[Dict Loading] " + location); List<String> lists = getRemoteWords(location); // ? 
if (lists == null) { logger.error("[Dict Loading] " + location + ""); continue; } for (String theWord : lists) { if (theWord != null && !"".equals(theWord.trim())) { // ?? logger.info(theWord); _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } } } /** * ??? */ private void loadQuantifierDict() { // ?? _QuantifierDict = new DictSegment((char) 0); // ???? Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER); InputStream is = null; try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _QuantifierDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { logger.error("Quantifier Dictionary loading exception."); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } } private void loadSurnameDict() { _SurnameDict = new DictSegment((char) 0); Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME); InputStream is = null; try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } if (is == null) { throw new RuntimeException("Surname Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _SurnameDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } } private void loadSuffixDict() { _SuffixDict = new DictSegment((char) 0); Path file = 
PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX); InputStream is = null; try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } if (is == null) { throw new RuntimeException("Suffix Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _SuffixDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { is.close(); is = null; } catch (IOException e) { logger.error("ik-analyzer", e); } } } private void loadPrepDict() { _PrepDict = new DictSegment((char) 0); Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP); InputStream is = null; try { is = new FileInputStream(file.toFile()); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } if (is == null) { throw new RuntimeException("Preposition Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _PrepDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { is.close(); is = null; } catch (IOException e) { logger.error("ik-analyzer", e); } } } public void reLoadMainDict() { logger.info("??..."); // ????? Dictionary tmpDict = new Dictionary(configuration); tmpDict.configuration = getSingleton().configuration; tmpDict.loadMainDict(); tmpDict.loadStopWordDict(); _MainDict = tmpDict._MainDict; _StopWords = tmpDict._StopWords; logger.info("??..."); } }