Java tutorial
/*
 * Copyright 2011-2016 ZXC.com All right reserved. This software is the confidential and proprietary information of
 * ZXC.com ("Confidential Information"). You shall not disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into with ZXC.com.
 */
package com.ms.commons.fasttext;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.ms.commons.fasttext.codec.HtmlFastEntities;
import com.ms.commons.fasttext.decorator.DecoratorCallback;
import com.ms.commons.fasttext.extract.CharNormalization;
import com.ms.commons.fasttext.psoriasis.HTMLParserExtractText;
import com.ms.commons.fasttext.psoriasis.MappedDecoratorText;
import com.ms.commons.fasttext.psoriasis.PsoriasisUtil;
import com.ms.commons.fasttext.psoriasis.SkipTermExtraInfo;
import com.ms.commons.fasttext.psoriasis.WordTransformer;
import com.ms.commons.fasttext.text.PinyinUtil;

/**
 * Static facade over the fasttext utilities: banned-word detection and masking, pinyin lookup, HTML text
 * extraction, text normalization and HTML escaping.
 * <p>
 * The banned-word dictionaries are loaded once when the class is initialized and are re-checked every
 * {@link #DELAY_TIME} seconds by a background daemon thread. When the system property {@link #BANNED_DIR} points to
 * a directory that contains all three dictionary files, they are read from there; otherwise the copies bundled on
 * the classpath (under this package's <code>data</code> folder) are used.
 *
 * @author zxc Apr 12, 2013 3:25:32 PM
 */
public class FasttextService {

    private static final Log                   logger              = LogFactory.getLog(FasttextService.class);

    /** Name of the system property holding the directory of externally maintained dictionary files. */
    public static final String                 BANNED_DIR          = "banneddir";
    /** File name of the banned-word dictionary. */
    public static final String                 BANNEDWORD          = "BANNEDWORD_U8.TXT";
    /** File name of the homophone dictionary. */
    public static final String                 HOMOPHONE           = "HOMOPHONE_U8.TXT";
    /** File name of the radical ("fork") dictionary. */
    public static final String                 RADICALDIC          = "RADICALDIC_U8.TXT";
    /** Replacement text used by {@link #decorator(String)}. */
    public static final String                 DEFAULT_REPLACE_STR = "***";

    // Current matcher. Replaced wholesale by the reload thread, hence volatile so that caller threads see the
    // fully constructed new instance.
    private static volatile MappedDecoratorText      mdt;
    // Callback that masks every hit with DEFAULT_REPLACE_STR; volatile for the same reason as mdt.
    private static volatile DefaultDecoratorCallback defaultDecoratorCallback;
    // Shared HTML text extractor.
    private static final HTMLParserExtractText pet                 = new HTMLParserExtractText();

    /** Initial delay and interval, in seconds, between two dictionary reload checks. */
    private static final int                   DELAY_TIME          = 10 * 60;

    // Last-modified timestamps of the external dictionary files as seen by the most recent load.
    private static long                        weijinLastModified;
    private static long                        forkLastModified;
    private static long                        homoLastModified;

    static {
        // System.setProperty(BANNED_DIR, "/Users/zxc/msun/");
        loadText();
        createThreadPool();
    }

    /**
     * Starts the single background thread that periodically calls {@link #reload()}.
     * <p>
     * The thread is a daemon so that it never keeps the JVM alive on its own, and every run is guarded: an exception
     * escaping a task scheduled with <code>scheduleWithFixedDelay</code> would otherwise cancel all later runs.
     */
    private static void createThreadPool() {
        ThreadFactory daemonFactory = new ThreadFactory() {

            public Thread newThread(Runnable task) {
                Thread worker = new Thread(task, "fasttext-dictionary-reloader");
                worker.setDaemon(true);
                return worker;
            }
        };
        Executors.newSingleThreadScheduledExecutor(daemonFactory).scheduleWithFixedDelay(new Runnable() {

            public void run() {
                try {
                    reload();
                } catch (RuntimeException e) {
                    // Keep the previously loaded dictionaries and try again on the next tick.
                    logger.error("reload banned word dictionaries fail, keep using the previous ones", e);
                }
            }
        }, DELAY_TIME, DELAY_TIME, TimeUnit.SECONDS);
    }

    /**
     * Reloads the dictionaries when any of the three external files has a last-modified time different from the one
     * recorded by the previous load. Does nothing when the external files are not all present.
     */
    private static void reload() {
        logger.info("????...");
        if (!checkFile(false)) {
            logger.info("????,?");
            return;
        }
        String path = System.getProperty(BANNED_DIR);
        File wf = dictionaryFile(path, BANNEDWORD);
        File ff = dictionaryFile(path, RADICALDIC);
        File hf = dictionaryFile(path, HOMOPHONE);
        boolean changed = weijinLastModified != wf.lastModified() || forkLastModified != ff.lastModified()
                          || homoLastModified != hf.lastModified();
        if (changed) {
            logger.info("??,????....?...");
            loadText();
            logger.info("????....?...");
        } else {
            logger.info("?,?...");
        }
    }

    /**
     * Loads all dictionaries, expands every banned word into its pinyin / radical / homophone variants and publishes
     * a new matcher built from the expanded list.
     */
    private static void loadText() {
        String path = System.getProperty(BANNED_DIR);
        List<String> pinying;
        List<String> weijin;
        List<String> fork;
        List<String> homo;
        // path = "/Users/hanjie/Documents/bannedword";
        pinying = PinyinUtil.loadListFromFile("data", "PINYING_U8.TXT", "utf8");
        if (!checkFile(true)) {
            // External files unavailable: fall back to the dictionaries bundled on the classpath.
            weijin = loadListFromFile("data", BANNEDWORD, "utf8");
            fork = loadListFromFile("data", RADICALDIC, "utf8");
            homo = loadListFromFile("data", HOMOPHONE, "utf8");
        } else {
            weijin = readList(path + File.separator + BANNEDWORD, "utf8");
            fork = readList(path + File.separator + RADICALDIC, "utf8");
            homo = readList(path + File.separator + HOMOPHONE, "utf8");
        }
        WordTransformer transform = new WordTransformer(pinying, fork, null, homo);
        List<SkipTermExtraInfo> radList = new ArrayList<SkipTermExtraInfo>();
        List<SkipTermExtraInfo> weijinList = PsoriasisUtil.loadDic(weijin);
        for (SkipTermExtraInfo term : weijinList) {
            // The banned word itself.
            radList.add(term);
            // Pinyin variants of the word.
            radList.addAll(transform.transformPinyinWords(term.getWord()));
            // Radical ("fork") variants of the word.
            radList.addAll(transform.transformForkWords(term.getWord()));
            // Homophone variants of the word.
            List<SkipTermExtraInfo> homophones = transform.transformHomophoneWords(term.getWord());
            radList.addAll(homophones);
            // Pinyin variants of every homophone variant.
            for (SkipTermExtraInfo homophone : homophones) {
                radList.addAll(transform.transformPinyinWords(homophone.getWord()));
            }
        }
        Properties props = new Properties();
        mdt = new MappedDecoratorText(radList, props);
        defaultDecoratorCallback = new DefaultDecoratorCallback();
    }

    /**
     * Checks whether the directory named by the {@link #BANNED_DIR} system property contains all three dictionary
     * files.
     *
     * @param isInit when <code>true</code> and all files exist, their last-modified times are recorded as the
     * baseline that {@link #reload()} compares against
     * @return <code>true</code> if the property is set to a non-blank value and all three files exist
     */
    private static boolean checkFile(boolean isInit) {
        String path = System.getProperty(BANNED_DIR);
        if (path == null || path.trim().length() == 0) {
            return false;
        }
        File wf = dictionaryFile(path, BANNEDWORD);
        File ff = dictionaryFile(path, RADICALDIC);
        File hf = dictionaryFile(path, HOMOPHONE);
        boolean allExist = wf.exists() && ff.exists() && hf.exists();
        if (allExist && isInit) {
            weijinLastModified = wf.lastModified();
            forkLastModified = ff.lastModified();
            homoLastModified = hf.lastModified();
        }
        return allExist;
    }

    /** Builds the {@link File} for dictionary <code>name</code> inside directory <code>dir</code>. */
    private static File dictionaryFile(String dir, String name) {
        return new File(dir + File.separator + name);
    }

    /**
     * Reads a text file from the file system into a list with one trimmed entry per line (blank lines included).
     *
     * @param resource path of the file to read
     * @param encoding charset name; <code>null</code> or empty means UTF-8
     * @return the lines of the file, or <code>null</code> when <code>resource</code> is <code>null</code>
     * @throws RuntimeException wrapping the I/O error when the file cannot be read
     */
    private static List<String> readList(String resource, String encoding) {
        if (resource == null) {
            return null;
        }
        if (encoding == null || encoding.equals("")) {
            encoding = "UTF-8";
        }
        List<String> list = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(resource), encoding));
            String line = null;
            while ((line = br.readLine()) != null) {
                list.add(line.trim());
            }
        } catch (UnsupportedEncodingException e) {
            logger.error("read " + resource + " fail...", e);
            throw new RuntimeException(e);
        } catch (IOException e) {
            logger.error("read " + resource + " fail...", e);
            throw new RuntimeException(e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    // Best effort: the content has already been read (or the read failure is already propagating).
                    logger.warn("close " + resource + " fail...", e);
                }
            }
        }
        return list;
    }

    /**
     * Reads a classpath resource located at <code>/&lt;this package&gt;/&lt;subfold&gt;/&lt;file&gt;</code> into a
     * list of its non-blank lines. Lines are kept untrimmed.
     *
     * @param subfold sub folder below this class's package
     * @param file resource file name
     * @param encoding charset name used to decode the resource
     * @return the non-blank lines of the resource
     * @throws RuntimeException when the resource is missing, cannot be read, or cannot be closed
     */
    private static final List<String> loadListFromFile(String subfold, String file, String encoding) {
        String pckName = FasttextService.class.getPackage().getName();
        file = "/" + pckName.replace('.', '/') + "/" + subfold + "/" + file;
        InputStream istream = FasttextService.class.getResourceAsStream(file);
        if (istream == null) {
            throw new RuntimeException("Could not find file: " + file);
        }
        List<String> ret = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(istream, encoding));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.trim().length() > 0) {
                    ret.add(line);
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            try {
                // Closing the reader also closes the wrapped stream; close the raw stream only when the reader
                // could not be created.
                if (reader != null) {
                    reader.close();
                } else {
                    istream.close();
                }
            } catch (IOException e2) {
                throw new RuntimeException("close stream failed", e2);
            }
        }
        return ret;
    }

    /**
     * Decorates <code>content</code> with the current matcher, using {@link #DEFAULT_REPLACE_STR} as the
     * replacement produced for each decorated term.
     *
     * @param content text to process
     * @return the decorated text
     */
    public static String decorator(String content) {
        return mdt.decorator(content, false, defaultDecoratorCallback);
    }

    /**
     * Decorates <code>content</code> with the current matcher, using <code>replaceStr</code> as the replacement
     * produced for each decorated term.
     *
     * @param content text to process
     * @param replaceStr replacement text; when <code>null</code>, {@link #DEFAULT_REPLACE_STR} is used instead of
     * failing with a <code>NullPointerException</code> on the first hit
     * @return the decorated text
     */
    public static String decorator(String content, final String replaceStr) {
        final String replacement = replaceStr == null ? DEFAULT_REPLACE_STR : replaceStr;
        return mdt.decorator(content, false, new DecoratorCallback() {

            public StringBuilder decorator(String src) {
                return new StringBuilder(replacement);
            }
        });
    }

    /**
     * Tells whether <code>content</code> contains a term known to the current matcher.
     *
     * @param content text to check
     * @return the result of <code>MappedDecoratorText.containTerm(content, false)</code>
     */
    public static boolean containTerm(String content) {
        return mdt.containTerm(content, false);
    }

    /**
     * Looks up the pinyin of <code>han</code> by delegating to {@link PinyinUtil#getPinyingOfHan(String)}.
     *
     * @param han text to look up
     * @return whatever <code>PinyinUtil</code> returns; NOTE(review): the original documentation mentioned
     * <code>null</code> here, so callers should presumably be prepared for a <code>null</code> result - confirm
     */
    public static String[] getPinyingOfHan(String han) {
        return PinyinUtil.getPinyingOfHan(han);
    }

    /**
     * Extracts text from HTML by delegating to <code>HTMLParserExtractText.getText(html)</code>.
     *
     * @param html HTML source
     * @return the extracted text
     */
    public static String parserExtractText(String html) {
        return pet.getText(html);
    }

    /**
     * Extracts text from HTML by delegating to <code>HTMLParserExtractText.getText(html, ignoreCase)</code>.
     *
     * @param html HTML source
     * @param ignoreCase passed through to the extractor
     * @return the extracted text
     */
    public static String parserExtractText(String html, boolean ignoreCase) {
        return pet.getText(html, ignoreCase);
    }

    /**
     * Tells whether <code>ch</code> lies in the range
     * <code>[PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_START, PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_END)</code>.
     *
     * @param ch character to test
     * @return <code>true</code> if the character is inside that range
     */
    public static boolean isHanLetter(char ch) {
        return ch >= PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_START && ch < PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_END;
    }

    /**
     * Normalizes text by delegating to <code>CharNormalization.compositeTextConvert</code> with the same arguments
     * in the same order.
     * <p>
     * NOTE(review): the original description of the individual normalization steps was lost to an encoding problem;
     * the parameter notes below are partly inferred from the parameter names and should be confirmed against
     * <code>CharNormalization</code>.
     *
     * @param src text to convert
     * @param needT2S presumably enables traditional-to-simplified Chinese conversion - confirm
     * @param needDBC presumably enables full-width to half-width conversion - confirm
     * @param ignoreCase presumably enables case folding - confirm
     * @param filterNoneHanLetter presumably filters characters that are not Han letters - confirm
     * @param convertSynonymy presumably enables synonym conversion - confirm
     * @param filterSymbol whether to filter symbols; the original documentation mentions "\n", "\r", " ", "\t" and
     * a <code>Symbol_CT.txt</code> file
     * @param keepLastSymbol presumably whether a trailing symbol is kept - confirm
     * @return the converted text
     */
    public static String compositeTextConvert(String src, boolean needT2S, boolean needDBC, boolean ignoreCase,
                                              boolean filterNoneHanLetter, boolean convertSynonymy,
                                              boolean filterSymbol, boolean keepLastSymbol) {
        return CharNormalization.compositeTextConvert(src, needT2S, needDBC, ignoreCase, filterNoneHanLetter,
                                                      convertSynonymy, filterSymbol, keepLastSymbol);
    }

    /**
     * Default callback: answers {@link FasttextService#DEFAULT_REPLACE_STR} for every term it is given.
     *
     * @author hanjie 2011-7-21 01:57:30
     */
    private static class DefaultDecoratorCallback implements DecoratorCallback {

        public StringBuilder decorator(String src) {
            return new StringBuilder(DEFAULT_REPLACE_STR);
        }
    }

    /**
     * Escapes <code>str</code> with <code>HtmlFastEntities.HTML40</code>; the original comment indicates this is
     * intended as XSS protection.
     *
     * @param str text to escape
     * @return the escaped text
     */
    public static String escape(String str) {
        return HtmlFastEntities.HTML40.escape(str);
    }

    /**
     * Builds every combination obtained by picking one character from each row of <code>charArgs</code>, in row
     * order. The first row varies slowest and the last row fastest, e.g. <code>{{'a','b'},{'c','d'}}</code> yields
     * <code>"ac", "ad", "bc", "bd"</code>.
     *
     * @param charArgs candidate characters per position; must not be <code>null</code>
     * @return all combinations; a single empty string when <code>charArgs</code> has no rows, and an empty array
     * when any row is empty (there is nothing to pick from, so no combination exists)
     */
    public static String[] getCombination(char[][] charArgs) {
        // Length of every combination.
        int col = charArgs.length;
        // Number of combinations: the product of all row lengths.
        int row = 1;
        for (int i = 0; i < col; i++) {
            row = row * charArgs[i].length;
        }
        if (row == 0) {
            // An empty row would otherwise cause a division by zero below.
            return new String[0];
        }

        char[][] out = new char[row][col];
        // Product of the lengths of the rows already processed, i.e. how often the current row's pattern repeats.
        int repeats = 1;
        for (int i = 0; i < col; i++) {
            char[] candidates = charArgs[i];
            int length = candidates.length;
            // Number of consecutive output rows that share the same character in this column.
            int runLength = row / length / repeats;
            int start = 0;
            for (int j = 0; j < repeats; j++) {
                for (int k = 0; k < length; k++) {
                    char c = candidates[k];
                    for (int l = 0; l < runLength; l++) {
                        out[start][i] = c;
                        start++;
                    }
                }
            }
            repeats = repeats * length;
        }

        String[] combinations = new String[row];
        for (int i = 0; i < row; i++) {
            combinations[i] = new String(out[i]);
        }
        return combinations;
    }
}