com.ms.commons.fasttext.FasttextService.java Source code

Java tutorial

Introduction

Here is the source code for com.ms.commons.fasttext.FasttextService.java

Source

/*
 * Copyright 2011-2016 ZXC.com All right reserved. This software is the confidential and proprietary information of
 * ZXC.com ("Confidential Information"). You shall not disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into with ZXC.com.
 */
package com.ms.commons.fasttext;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.ms.commons.fasttext.codec.HtmlFastEntities;
import com.ms.commons.fasttext.decorator.DecoratorCallback;
import com.ms.commons.fasttext.extract.CharNormalization;
import com.ms.commons.fasttext.psoriasis.HTMLParserExtractText;
import com.ms.commons.fasttext.psoriasis.MappedDecoratorText;
import com.ms.commons.fasttext.psoriasis.PsoriasisUtil;
import com.ms.commons.fasttext.psoriasis.SkipTermExtraInfo;
import com.ms.commons.fasttext.psoriasis.WordTransformer;
import com.ms.commons.fasttext.text.PinyinUtil;

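/**
 * Static facade for banned-word filtering and related text utilities.<br>
 * <br>
 * On class load it reads the pinyin, banned-word, radical and homophone dictionaries, builds the matcher used by
 * {@link #decorator(String)} and {@link #containTerm(String)}, and schedules a periodic check that reloads the
 * dictionaries when the external files change.<br>
 * It also exposes helpers for extracting text from html, HTML escaping and character normalization.<br>
 * 
 * @author zxc Apr 12, 2013 3:25:32 PM
 */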
public class FasttextService {

    private static Log logger = LogFactory.getLog(FasttextService.class);

    // Name of the system property that holds the directory containing the external dictionary files.
    public static final String BANNED_DIR = "banneddir";
    // File name of the banned-word dictionary.
    public static final String BANNEDWORD = "BANNEDWORD_U8.TXT";
    // File name of the homophone dictionary.
    public static final String HOMOPHONE = "HOMOPHONE_U8.TXT";
    // File name of the radical dictionary (passed to WordTransformer as the "fork" list).
    public static final String RADICALDIC = "RADICALDIC_U8.TXT";

    // Replacement text produced by the default decorator callback.
    public static final String DEFAULT_REPLACE_STR = "***";

    // Matcher built from the expanded banned-word list. It is replaced wholesale by loadText(), which also runs
    // on the background reload thread, so it is volatile to make the new instance visible to caller threads.
    private static volatile MappedDecoratorText mdt;
    // Callback used by decorator(String); re-created together with mdt, hence volatile for the same reason.
    private static volatile DefaultDecoratorCallback defaultDecoratorCallback;
    // Shared extractor used by the parserExtractText methods.
    private static HTMLParserExtractText pet = new HTMLParserExtractText();

    // Initial delay and interval, in seconds, of the dictionary reload check (10 minutes).
    private static final int DELAY_TIME = 10 * 60;

    // lastModified timestamps of the external dictionary files, recorded by checkFile(true).
    private static long weijinLastModified;
    private static long forkLastModified;
    private static long homoLastModified;

    static {
        // The dictionary directory is taken from the BANNED_DIR system property, so it has to be set before this
        // class is initialized, e.g. System.setProperty(BANNED_DIR, "/path/to/dictionaries").
        // Build the matcher once up front, then start the periodic reload check.
        loadText();
        createThreadPool();
    }

    /**
     * Starts the background task that calls {@link #reload()} every {@link #DELAY_TIME} seconds, with the same
     * initial delay.
     * <p>
     * The worker is a daemon thread: the executor is never shut down, so a non-daemon worker would keep the JVM
     * alive after the application has finished.
     */
    private static void createThreadPool() {
        java.util.concurrent.ThreadFactory daemonFactory = new java.util.concurrent.ThreadFactory() {

            public Thread newThread(Runnable task) {
                Thread worker = new Thread(task, "fasttext-dictionary-reload");
                worker.setDaemon(true);
                return worker;
            }
        };
        Executors.newSingleThreadScheduledExecutor(daemonFactory).scheduleWithFixedDelay(new Runnable() {

            public void run() {
                try {
                    reload();
                } catch (RuntimeException e) {
                    // scheduleWithFixedDelay cancels all further executions once a run throws, so one failed
                    // reload (for example a dictionary file being rewritten) must not escape from here.
                    logger.error("reload banned word dictionaries fail...", e);
                }
            }
        }, DELAY_TIME, DELAY_TIME, TimeUnit.SECONDS);
    }

    /**
     * Periodic reload check. Does nothing unless all three external dictionary files exist; when any of their
     * lastModified timestamps differs from the recorded one, the dictionaries are loaded again via
     * {@link #loadText()}.
     */
    private static void reload() {
        logger.info("????...");
        boolean externalFilesPresent = checkFile(false);
        if (!externalFilesPresent) {
            logger.info("????,?");
            return;
        }

        String dictionaryDir = System.getProperty(BANNED_DIR);
        File bannedWordFile = new File(dictionaryDir + File.separator + BANNEDWORD);
        File radicalFile = new File(dictionaryDir + File.separator + RADICALDIC);
        File homophoneFile = new File(dictionaryDir + File.separator + HOMOPHONE);

        boolean modified = weijinLastModified != bannedWordFile.lastModified()
                           || forkLastModified != radicalFile.lastModified()
                           || homoLastModified != homophoneFile.lastModified();
        if (!modified) {
            // Timestamps match the recorded ones: keep the current matcher.
            logger.info("?,?...");
            return;
        }

        // At least one file changed since the last load: rebuild the matcher.
        logger.info("??,????....?...");
        loadText();
        logger.info("????....?...");
    }

    /**
     * Loads the dictionaries and rebuilds {@link #mdt} and {@link #defaultDecoratorCallback}.
     * <p>
     * The pinyin table always comes from the bundled "data" resources. The banned-word, radical and homophone
     * lists are read from the directory named by the {@link #BANNED_DIR} system property when all three files
     * exist there, and from the bundled "data" resources otherwise.
     */
    private static void loadText() {
        final String encoding = "utf8";
        final List<String> pinyinLines = PinyinUtil.loadListFromFile("data", "PINYING_U8.TXT", encoding);
        final List<String> bannedLines;
        final List<String> radicalLines;
        final List<String> homophoneLines;
        if (checkFile(true)) {
            // External files are present (checkFile has just recorded their timestamps): read them from disk.
            final String prefix = System.getProperty(BANNED_DIR) + File.separator;
            bannedLines = readList(prefix + BANNEDWORD, encoding);
            radicalLines = readList(prefix + RADICALDIC, encoding);
            homophoneLines = readList(prefix + HOMOPHONE, encoding);
        } else {
            // No usable external directory: fall back to the dictionaries bundled with this package.
            bannedLines = loadListFromFile("data", BANNEDWORD, encoding);
            radicalLines = loadListFromFile("data", RADICALDIC, encoding);
            homophoneLines = loadListFromFile("data", HOMOPHONE, encoding);
        }

        final WordTransformer transformer = new WordTransformer(pinyinLines, radicalLines, null, homophoneLines);
        final List<SkipTermExtraInfo> expandedTerms = new ArrayList<SkipTermExtraInfo>();
        final List<SkipTermExtraInfo> bannedTerms = PsoriasisUtil.loadDic(bannedLines);
        for (SkipTermExtraInfo term : bannedTerms) {
            // The banned term itself.
            expandedTerms.add(term);
            // Variants produced by the pinyin transformation.
            expandedTerms.addAll(transformer.transformPinyinWords(term.getWord()));
            // Variants produced by the "fork" (radical dictionary) transformation.
            expandedTerms.addAll(transformer.transformForkWords(term.getWord()));
            // Variants produced by the homophone transformation.
            List<SkipTermExtraInfo> homophoneVariants = transformer.transformHomophoneWords(term.getWord());
            expandedTerms.addAll(homophoneVariants);

            // Pinyin variants of every homophone variant.
            for (SkipTermExtraInfo homophoneVariant : homophoneVariants) {
                expandedTerms.addAll(transformer.transformPinyinWords(homophoneVariant.getWord()));
            }
        }
        mdt = new MappedDecoratorText(expandedTerms, new Properties());
        defaultDecoratorCallback = new DefaultDecoratorCallback();
    }

    /**
     * Checks whether the external banned-word, radical and homophone files all exist in the directory named by
     * the {@link #BANNED_DIR} system property.
     * 
     * @param isInit when true and all files exist, their lastModified timestamps are recorded for later
     * comparison in {@link #reload()}
     * @return true only if the property is set to a non-blank value and all three files exist
     */
    private static boolean checkFile(boolean isInit) {
        String dictionaryDir = System.getProperty(BANNED_DIR);
        if (dictionaryDir == null || dictionaryDir.trim().length() == 0) {
            return false;
        }

        File bannedWordFile = new File(dictionaryDir + File.separator + BANNEDWORD);
        File radicalFile = new File(dictionaryDir + File.separator + RADICALDIC);
        File homophoneFile = new File(dictionaryDir + File.separator + HOMOPHONE);
        if (!(bannedWordFile.exists() && radicalFile.exists() && homophoneFile.exists())) {
            return false;
        }

        if (isInit) {
            weijinLastModified = bannedWordFile.lastModified();
            forkLastModified = radicalFile.lastModified();
            homoLastModified = homophoneFile.lastModified();
        }
        return true;
    }

    /**
     * Reads a text file from the file system into a list of trimmed, non-blank lines.
     * <p>
     * Blank lines are skipped so that external dictionaries are treated like the bundled ones read by
     * {@link #loadListFromFile(String, String, String)}, which also drops blank lines.
     * 
     * @param resource path of the file to read; when null, null is returned
     * @param encoding charset name; null or empty means "UTF-8"
     * @return the trimmed non-blank lines in file order, or null if {@code resource} is null
     * @throws RuntimeException wrapping the IOException if the file cannot be opened or read, or the encoding is
     * not supported
     */
    private static List<String> readList(String resource, String encoding) {
        if (resource == null) {
            return null;
        }
        String charsetName = (encoding == null || encoding.equals("")) ? "UTF-8" : encoding;
        List<String> list = new ArrayList<String>();
        // The raw stream is tracked separately so it can still be closed when creating the reader fails
        // (for example on an unsupported encoding).
        FileInputStream in = null;
        BufferedReader br = null;
        try {
            in = new FileInputStream(resource);
            br = new BufferedReader(new InputStreamReader(in, charsetName));
            String line;
            while ((line = br.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.length() > 0) {
                    list.add(trimmed);
                }
            }
        } catch (IOException e) {
            // UnsupportedEncodingException is an IOException, so this single handler covers both cases.
            logger.error("read " + resource + " fail...", e);
            throw new RuntimeException(e);
        } finally {
            try {
                if (br != null) {
                    // Closing the reader also closes the underlying stream.
                    br.close();
                } else if (in != null) {
                    in.close();
                }
            } catch (IOException closeFailure) {
                // Closing is best effort: report it, but do not mask the result or an earlier exception.
                logger.warn("close " + resource + " fail...", closeFailure);
            }
        }
        return list;
    }

    /**
     * Reads a classpath resource located under this class's package into a list of its non-blank lines. Lines
     * are added as read (not trimmed).
     * 
     * @param subfold sub-folder below the package directory
     * @param file resource file name inside {@code subfold}
     * @param encoding charset name used to decode the resource
     * @return the non-blank lines in file order
     * @throws RuntimeException if the resource is missing, reading fails, or closing the stream fails
     */
    private static final List<String> loadListFromFile(String subfold, String file, String encoding) {
        String packagePath = FasttextService.class.getPackage().getName().replace('.', '/');
        String resourceName = "/" + packagePath + "/" + subfold + "/" + file;
        InputStream in = FasttextService.class.getResourceAsStream(resourceName);
        if (in == null) {
            throw new RuntimeException("Could not find file: " + resourceName);
        }

        List<String> lines = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(in, encoding));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.trim().length() == 0) {
                    continue;
                }
                lines.add(line);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            try {
                if (reader == null) {
                    // The reader was never created; 'in' is known to be non-null at this point.
                    in.close();
                } else {
                    reader.close();
                }
            } catch (Exception e2) {
                throw new RuntimeException("close stream failed", e2);
            }
        }
        return lines;
    }

    /**
     * Runs the content through the banned-word matcher using the default callback, which supplies
     * {@link #DEFAULT_REPLACE_STR} ("***") as the replacement text.
     * 
     * @param content text to process
     * @return the result of {@code MappedDecoratorText.decorator}; presumably the content with every matched term
     * replaced by "***" (confirm against MappedDecoratorText)
     */
    public static String decorator(String content) {
        return mdt.decorator(content, false, defaultDecoratorCallback);
    }

    /**
     * Runs the content through the banned-word matcher using a callback that supplies {@code replaceStr} as the
     * replacement text.
     * 
     * @param content text to process
     * @param replaceStr replacement text; when null, {@link #DEFAULT_REPLACE_STR} is used instead (previously a
     * null value caused a NullPointerException as soon as the callback was invoked)
     * @return the result of {@code MappedDecoratorText.decorator}; presumably the content with every matched term
     * replaced by the replacement text (confirm against MappedDecoratorText)
     */
    public static String decorator(String content, final String replaceStr) {
        // new StringBuilder((String) null) throws, so resolve the fallback once, outside the callback.
        final String replacement = (replaceStr == null) ? DEFAULT_REPLACE_STR : replaceStr;
        return mdt.decorator(content, false, new DecoratorCallback() {

            public StringBuilder decorator(String src) {
                // A fresh builder per call, so whoever receives it is free to modify it.
                return new StringBuilder(replacement);
            }
        });
    }

    /**
     * Asks the banned-word matcher whether the content contains a term.
     * 
     * @param content text to check
     * @return the result of {@code MappedDecoratorText.containTerm(content, false)}; presumably true when at least
     * one banned term (or generated variant) occurs in the content
     */
    public static boolean containTerm(String content) {
        return mdt.containTerm(content, false);
    }

    /**
     * Looks up the pinyin of a Chinese character string by delegating to {@code PinyinUtil.getPinyingOfHan}.
     * 
     * @param han text to look up
     * @return the array returned by PinyinUtil; the original documentation mentions null, presumably for input
     * that has no pinyin (TODO confirm against PinyinUtil)
     */
    public static String[] getPinyingOfHan(String han) {
        return PinyinUtil.getPinyingOfHan(han);
    }

    /**
     * Extracts text from html using the shared {@code HTMLParserExtractText} instance; presumably the html tags
     * are stripped (confirm against HTMLParserExtractText).
     * 
     * @param html html source
     * @return the text returned by {@code HTMLParserExtractText.getText(html)}
     */
    public static String parserExtractText(String html) {
        return pet.getText(html);
    }

    /**
     * Extracts text from html using the shared {@code HTMLParserExtractText} instance; presumably the html tags
     * are stripped (confirm against HTMLParserExtractText).
     * 
     * @param html html source
     * @param ignoreCase passed through to the extractor; presumably controls case-insensitive handling (TODO
     * confirm the exact effect)
     * @return the text returned by {@code HTMLParserExtractText.getText(html, ignoreCase)}
     */
    public static String parserExtractText(String html, boolean ignoreCase) {
        return pet.getText(html, ignoreCase);
    }

    /**
     * Tells whether a character lies in the range from {@code PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_START}
     * (inclusive) to {@code PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_END} (exclusive).
     * 
     * @param ch character to test
     * @return true if {@code ch} is inside that range
     */
    public static boolean isHanLetter(char ch) {
        return ch >= PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_START && ch < PinyinUtil.CJK_UNIFIED_IDEOGRAPHS_END;
    }

    /**
     * Applies a combination of text normalizations by delegating to
     * {@code CharNormalization.compositeTextConvert} with the arguments in the same order.
     * <p>
     * NOTE(review): the original description of the individual steps was lost to an encoding problem. The
     * parameter notes below are inferred from the parameter names and the surviving fragments; confirm them
     * against CharNormalization.
     * 
     * @param src text to convert
     * @param needT2S presumably: convert traditional Chinese to simplified
     * @param needDBC presumably: convert full-width characters to half-width
     * @param ignoreCase presumably: normalize letter case
     * @param filterNoneHanLetter presumably: drop characters that are not Han letters
     * @param convertSynonymy presumably: convert synonymous characters (not documented originally)
     * @param filterSymbol presumably: drop symbols such as "\n", "\r", " " and "\t"; the original text refers to
     * Symbol_CT.txt for the symbol list
     * @param keepLastSymbol presumably: keep the last symbol when symbols are filtered
     * @return the converted text as returned by CharNormalization
     */
    public static String compositeTextConvert(String src, boolean needT2S, boolean needDBC, boolean ignoreCase,
            boolean filterNoneHanLetter, boolean convertSynonymy, boolean filterSymbol, boolean keepLastSymbol) {
        return CharNormalization.compositeTextConvert(src, needT2S, needDBC, ignoreCase, filterNoneHanLetter,
                convertSynonymy, filterSymbol, keepLastSymbol);
    }

    /**
     * Default decorator callback: ignores the source text it is given and always supplies
     * {@link FasttextService#DEFAULT_REPLACE_STR}.
     * 
     * @author hanjie 2011-7-21
     */
    private static class DefaultDecoratorCallback implements DecoratorCallback {

        // Returns a new builder on every call so callers never share a mutable instance.
        public StringBuilder decorator(String src) {
            return new StringBuilder(DEFAULT_REPLACE_STR);
        }
    }

    /**
     * Escapes a string with the {@code HtmlFastEntities.HTML40} entity table. The original comment mentions XSS,
     * so this is presumably meant for output escaping of untrusted text.
     * 
     * @param str text to escape
     * @return the escaped text as returned by {@code HtmlFastEntities.HTML40.escape}
     */
    public static String escape(String str) {
        return HtmlFastEntities.HTML40.escape(str);
    }

    /**
     * Builds every combination that takes exactly one character from each row of {@code charArgs}, in row order
     * (the cartesian product of the rows).
     * <p>
     * Results are ordered with the last row varying fastest, e.g. {{a,b},{c,d}} gives "ac", "ad", "bc", "bd".
     * An empty outer array yields a single empty string. If any row is empty there is no valid combination and
     * an empty array is returned (this case used to fail with a division by zero).
     * 
     * @param charArgs candidate characters per position; must not be null and must not contain null rows
     * @return all combinations, one string per combination
     * @throws IllegalArgumentException if the number of combinations exceeds {@code Integer.MAX_VALUE}
     */
    public static String[] getCombination(char[][] charArgs) {
        int col = charArgs.length;

        // Count the combinations in a long so that an oversized product is detected instead of overflowing.
        long total = 1L;
        for (int i = 0; i < col; i++) {
            total *= charArgs[i].length;
            if (total == 0) {
                return new String[0];
            }
            if (total > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("too many combinations: more than " + Integer.MAX_VALUE);
            }
        }

        int row = (int) total;
        String[] combinations = new String[row];
        char[] buffer = new char[col];
        for (int r = 0; r < row; r++) {
            // Treat r as a mixed-radix number whose digit for position c selects the character of row c;
            // decoding from the last position makes the last row vary fastest.
            int rest = r;
            for (int c = col - 1; c >= 0; c--) {
                char[] choices = charArgs[c];
                buffer[c] = choices[rest % choices.length];
                rest /= choices.length;
            }
            combinations[r] = new String(buffer);
        }
        return combinations;
    }

}