com.zb.jcseg.util.WordUnionUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.zb.jcseg.util.WordUnionUtils.java

Source

/*
 * Copyright 2011-2016 ZuoBian.com All right reserved. This software is the confidential and proprietary information of
 * ZuoBian.com ("Confidential Information"). You shall not disclose such Confidential Information and shall use it only
 * in accordance with the terms of the license agreement you entered into with ZuoBian.com.
 */
package com.zb.jcseg.util;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

import org.apache.commons.lang.CharUtils;
import org.apache.commons.lang.StringUtils;

/**
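 * Helpers that post-process a list of segmented tokens, mainly deciding how one-character tokens are
 * emitted or combined with the tokens next to them.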
 * 
 * @author zxc Sep 3, 2014 2:11:34 PM
 */
public class WordUnionUtils {

    // Strings treated as suffixes; consulted by isSuffix().
    // NOTE(review): every literal here reads as "" or "?", which suggests the original non-ASCII
    // characters were lost in an encoding conversion -- restore from a correctly encoded copy.
    static String[] SUFFIX_LETTERS = new String[] { "", "", "", "", "", "", "", "?", "",
            "?", "", "", "", "", "?", "", "?", "", "", "", "", "", "", "",
            "", "", "", "", "", "?" };

    // Strings treated as prefixes; consulted by isPrefixLetter(). Entries inside /* ... */ were
    // disabled by the original author.
    // NOTE(review): every literal here reads as "" or "?", which suggests the original non-ASCII
    // characters were lost in an encoding conversion -- restore from a correctly encoded copy.
    static String[] PREFIX_LETTERS = new String[] { "", "?", "", "", /* "", "", */"", "", /*
                                                                                                                 * "",
                                                                                                                 * "?"
                                                                                                                 * ,
                                                                                                                 */
            "?", "", "", "", "", "", /* "", */"", "", "", "", "", "", "?",
            "?", "?", "", "?", "" };

    // Earlier version of the table, left commented out by the original author:
    // static String[] Single_Words = new String[] { "", "", "", "?", "", "", "", "", "",
    // "", "", "", "?", "", "", "", "", "", "?", "", "", "", "", "", "", "", "", "", "", "",
    // "", "", "", "", "", "", "", "?", "", "", "", "", "", "?"
    //
    // Strings that are emitted as stand-alone words; consulted by isSingleWord() and
    // isContainSingleWord().
    // NOTE(review): every literal here reads as "" or "?", which suggests the original non-ASCII
    // characters were lost in an encoding conversion -- restore from a correctly encoded copy.
    static String[] Single_Words = new String[] { "", "", "", "", "", "", "", "", "",
            "?", "", "", "", "", "", "", "?", "", "", "", "", "", "", "",
            "", "?", "", "", "", "", "", "", "", "?", "", "", "", "?", "",
            "", "", "", "", "", "", "", "", "", "", "", "?", "?", "", "",
            "", "", "", "", "", "?", "", "", "", "", "", "", "?", "", "",
            "", "", "", "", "", "", "", "?", "", "", "", "", "", "", "",
            "", "", ""
            // The entries below were disabled by the original author.
            /* "", "", "", "", "?", "?", "", "", "", "", "?" */

    };

    // Rules consulted by isNeedAddDualWord(). Each rule is built as
    // AbbreviationWord(dualWord, prefixes, suffixes); exactly one rule is registered here.
    // NOTE(review): the rule's literals read as "", which suggests the original non-ASCII
    // characters were lost in an encoding conversion -- restore from a correctly encoded copy.
    static List<AbbreviationWord> abreviationWordList = new ArrayList<AbbreviationWord>();
    static {
        abreviationWordList.add(new AbbreviationWord("", new String[] { "", "" }, new String[] { "" }));
    }

    // Set of unit strings, filled once by the static initialiser below; consulted by
    // wiselyCombineWords() and isEndWithDigital(). Entries inside /* ... */ were disabled by the
    // original author.
    // NOTE(review): every literal here reads as "" or "?", which suggests the original non-ASCII
    // characters were lost in an encoding conversion; as written the set collapses to at most two
    // distinct members -- restore from a correctly encoded copy.
    static HashSet<String> unitSet = new HashSet<String>();
    static {
        String[] units = { "", "", "", "", "", "", "", "", "", "", "", "",
                "?", "", "", "", "", "", "?", "", "", "", "", "", "", "",
                "", "", "", "", "", "", "", "", "", "?", "", "?", "", "",
                "", "", "", "", "", "", "?", "?", "", "", "?", "?", "?", "?",
                "", "?", "", "", "", "", "", "", "", "", "", "", "", "",
                "", /* "","", */"", /* "", */"", "", "", "", "?", "", /*
                                                                                                      * "",
                                                                                                      */
                "", "", "", "", "", "", "?", "", "", "", "?", "", "", "",
                "", "", "", "?", "?", "?", "", "", "", "", "", "", "", "",
                "", "", "", "", "", "", "?", "", "?", "", "", "", "", "",
                "", "", "", "", "", "", "", "", "", "", "", "", "", "",
                "?", "", "?", "", "", "", "", "", "?", "", "", "", "?" };
        // Copy the literal table into the lookup set.
        for (String c : units) {
            unitSet.add(c);
        }
    }

    /**
     * Regroups a list of segmented tokens, deciding for each one-character token whether it is emitted
     * on its own, joined with neighbouring one-character tokens, glued onto the following token, or
     * appended to the preceding multi-character token.
     * <p>
     * Two pieces of state are carried through the loop: {@code candidate}, the most recent
     * multi-character token that has not been emitted yet, and {@code sb}, a buffer of one-character
     * tokens that have not been emitted yet. Output goes through {@link #addStr2List}, which drops blank
     * strings; a single-space token is the exception and is added to the result directly.
     *
     * @param segWords the segmented tokens; must not be {@code null} because {@code isEmpty()} is
     *            called on it
     * @return {@code segWords} itself when it is empty, otherwise a newly created list holding the
     *         regrouped tokens
     */
    @SuppressWarnings("unused")
    public static List<String> wiselyCombineWords(List<String> segWords) {
        if (segWords.isEmpty()) {
            return segWords;
        }
        List<String> splitWordsList = new ArrayList<String>(segWords.size());
        // Pending one-character tokens that have not been emitted yet.
        StringBuilder sb = new StringBuilder();
        // Most recent multi-character token (possibly extended) that has not been emitted yet.
        String candidate = "";
        String space = " ";
        // An index loop is used because some branches below advance i themselves when they consume a
        // whole run of tokens.
        for (int i = 0, size = segWords.size(); i < size; i++) {
            String word = segWords.get(i);
            // Case 1: the current token is longer than one character.
            if (StringUtils.length(word) > 1) {
                // Pending one-character tokens are resolved before this token becomes the candidate.
                if (sb.length() > 0) {
                    String tmpStr = sb.toString();
                    if (sb.length() == 1) {
                        // Listed in Single_Words: emit the candidate and the character separately.
                        if (isSingleWord(tmpStr)) {
                            addStr2List(splitWordsList, candidate);
                            addStr2List(splitWordsList, tmpStr);
                        }
                        // Listed in PREFIX_LETTERS: emit the candidate, then glue the character onto
                        // the front of the current token.
                        else if (isPrefixLetter(tmpStr)) {
                            if (StringUtils.isNotBlank(candidate)) {
                                addStr2List(splitWordsList, candidate);
                            }
                            word = tmpStr + word;
                        } else {
                            if (isEndWithDigit(candidate)) {
                                // NOTE(review): this point is only reached inside the enclosing
                                // "StringUtils.length(word) > 1" branch and word has not been changed
                                // since, so the second operand is always true: the character is always
                                // appended to the candidate and the else branch below cannot run.
                                // Confirm the intended condition.
                                if (unitSet.contains(tmpStr) || StringUtils.length(word) > 1) {
                                    candidate = candidate + tmpStr;
                                    addStr2List(splitWordsList, candidate);
                                } else {
                                    if (StringUtils.isNotBlank(candidate)) {
                                        addStr2List(splitWordsList, candidate);
                                    }
                                    // Glue the character onto the front of the current token.
                                    word = tmpStr + word;
                                }
                            } else {
                                // Let the abbreviation rules decide what follows the candidate.
                                handleSingleWord(splitWordsList, candidate, tmpStr);
                            }
                        }
                    } else {
                        // Several buffered characters: emit the candidate, then the buffer as one string.
                        addStr2List(splitWordsList, candidate);
                        addStr2List(splitWordsList, tmpStr);
                    }
                    sb.setLength(0);
                } else {
                    // Nothing buffered: just emit the previous candidate, if any.
                    if (StringUtils.isNotBlank(candidate)) {
                        addStr2List(splitWordsList, candidate);
                    }
                }
                // The current (possibly prefixed) token becomes the new pending candidate.
                candidate = word;
            } else {
                // Case 2: the current token is at most one character long.
                {
                    int k = i + 1;
                    // Count the run of tokens starting at i: the current token is counted
                    // unconditionally, each following token only while its length is exactly 1.
                    int singleWordCount = 1;
                    for (int j = k; j < size; j++) {
                        if (StringUtils.length(segWords.get(j)) != 1) {
                            break;
                        }
                        singleWordCount++;
                    }
                    if (singleWordCount > 1) {
                        // NOTE(review): exIndex is assigned but never read in this method; presumably
                        // this is what @SuppressWarnings("unused") silences.
                        int exIndex = i;
                        boolean canAllBeTreatedAsSingle = true;
                        // Check whether every token of the run is listed in Single_Words.
                        for (int j = i, stop = i + singleWordCount; j < stop; j++) {
                            if (!isSingleWord(segWords.get(j))) {
                                exIndex = j;
                                canAllBeTreatedAsSingle = false;
                                break;
                            }
                        }
                        if (canAllBeTreatedAsSingle) {
                            // Emit the candidate, then every token of the run on its own. The inner
                            // loop advances the outer index i; i-- compensates for the outer i++.
                            addStr2List(splitWordsList, candidate);
                            candidate = "";
                            for (int stop = i + singleWordCount; i < stop; i++) {
                                addStr2List(splitWordsList, segWords.get(i));
                            }
                            i--;
                        } else {
                            // TODO zxc (carried over from the original; its text was lost to encoding
                            // damage): concatenate the whole run and emit it as one string after the
                            // candidate. As above, the loop advances i and i-- compensates.
                            for (int stop = i + singleWordCount; i < stop; i++) {
                                sb.append(segWords.get(i));
                            }
                            i--;
                            addStr2List(splitWordsList, candidate);
                            addStr2List(splitWordsList, sb.toString());
                            sb.setLength(0);
                            candidate = "";
                        }
                        continue;
                    }
                }

                if (StringUtils.equals(word, space)) {
                    // A single space that is not part of a run: flush the pending state first.
                    String tmpStr = sb.toString();
                    if (sb.length() == 1) {
                        // One buffered character is appended to the candidate and emitted with it.
                        candidate = candidate + tmpStr;
                        addStr2List(splitWordsList, candidate);
                    } else {
                        if (StringUtils.isNotBlank(candidate)) {
                            addStr2List(splitWordsList, candidate);
                        }
                        if (sb.length() > 1) {
                            addStr2List(splitWordsList, tmpStr);
                        }
                    }
                    // Added directly because addStr2List would discard a blank string.
                    splitWordsList.add(space);
                    sb.setLength(0);
                    candidate = "";
                } else {
                    // Disabled look-ahead left in place by the original author:
                    // if (index + 1 < size) {
                    // nextWordIsSingle = StringUtils.length(segWords.get(index + 1)) == 1;
                    // } else {
                    // nextWordIsSingle = false;
                    // }
                    if (sb.length() > 0) {
                        String header = sb.substring(0, 1);
                        // If the oldest buffered character is listed in Single_Words, emit the
                        // candidate and that character before buffering the current token.
                        if (isSingleWord(header)) {
                            addStr2List(splitWordsList, candidate);
                            candidate = "";
                            addStr2List(splitWordsList, header);
                            sb.deleteCharAt(0);
                        }
                    }
                    sb.append(word);
                }
            }
        }
        // Flush whatever is still pending after the last token.
        boolean isCandidateNotNull = StringUtils.length(candidate) > 0;
        boolean isSbNotNull = sb.length() > 0;
        if (isCandidateNotNull) {
            if (isSbNotNull) {
                String w = sb.toString();
                if (isSingleWord(w)) {
                    addStr2List(splitWordsList, candidate);
                    addStr2List(splitWordsList, w);
                } else {
                    handleSingleWord(splitWordsList, candidate, w);
                }
            } else {
                addStr2List(splitWordsList, candidate);
            }
        } else if (isSbNotNull) {
            addStr2List(splitWordsList, sb.toString());
        }
        return splitWordsList;
    }

    /**
     * Emits {@code candidate} followed by one more word: the word produced by the abbreviation rules
     * when one of them accepts the pair, otherwise {@code singleWord} unchanged.
     * <p>
     * TODO zxc (carried over from the original): an alternative that merges a one-character
     * {@code singleWord} into {@code candidate} instead of emitting it separately was left disabled
     * by the original author.
     *
     * @param splitWordsList output list that receives the words
     * @param candidate the pending multi-character token
     * @param singleWord the pending text that follows {@code candidate}
     */
    private static void handleSingleWord(List<String> splitWordsList, String candidate, String singleWord) {
        MagicWordResult ruleOutcome = isNeedAddDualWord(candidate, singleWord);
        // The candidate is emitted first on every path.
        addStr2List(splitWordsList, candidate);
        String follower = ruleOutcome.isSuccess() ? ruleOutcome.getWord() : singleWord;
        addStr2List(splitWordsList, follower);
    }

    /**
     * Appends {@code candidate} to {@code splitWordsList} unless it is blank according to
     * {@code StringUtils.isBlank}. No de-duplication is performed.
     *
     * @param splitWordsList list that receives the word
     * @param candidate word to append; blank values are ignored
     */
    public static void addStr2List(List<String> splitWordsList, String candidate) {
        if (StringUtils.isNotBlank(candidate)) {
            splitWordsList.add(candidate);
        }
    }

    /**
     * Tells whether {@code c} is one of the entries of {@code PREFIX_LETTERS}.
     *
     * @param c the string to look up
     * @return {@code true} when an entry equal to {@code c} exists
     */
    private static boolean isPrefixLetter(String c) {
        // List.contains uses the same null-safe equality as StringUtils.equals.
        return java.util.Arrays.asList(PREFIX_LETTERS).contains(c);
    }

    /**
     * Tells whether the last character of {@code str} is an ASCII letter or digit, as in "M7".
     *
     * @param str the string to inspect; {@code null} and empty strings yield {@code false}
     * @return {@code true} when the final character passes {@code CharUtils.isAsciiAlphanumeric}
     */
    public static boolean isEndWithAsciiAlphanumeric(String str) {
        return StringUtils.isNotEmpty(str) && CharUtils.isAsciiAlphanumeric(str.charAt(str.length() - 1));
    }

    /**
     * Tells whether the last character of {@code str} is a digit according to
     * {@link Character#isDigit(char)}.
     *
     * @param str the string to inspect; {@code null} and empty strings yield {@code false}
     * @return {@code true} when the final character is a digit
     */
    private static boolean isEndWithDigit(String str) {
        int len = StringUtils.length(str);
        return len > 0 && Character.isDigit(str.charAt(len - 1));
    }

    /**
     * Tells whether the first character of {@code str} is a digit according to
     * {@link Character#isDigit(char)}.
     *
     * @param str the string to inspect; {@code null} and empty strings yield {@code false}
     * @return {@code true} when the first character is a digit
     */
    public static boolean isStartWithDigit(String str) {
        return !StringUtils.isEmpty(str) && Character.isDigit(str.charAt(0));
    }

    /**
     * Tells whether {@code str} ends with an ASCII digit and {@code c} is a member of {@code unitSet}.
     *
     * @param str the string whose last character is inspected; {@code null} and empty strings yield
     *            {@code false}
     * @param c the string looked up in {@code unitSet}
     * @return {@code true} only when both conditions hold
     */
    public static boolean isEndWithDigital(String str, String c) {
        if (StringUtils.isNotEmpty(str)) {
            char last = str.charAt(str.length() - 1);
            return CharUtils.isAsciiNumeric(last) && unitSet.contains(c);
        }
        return false;
    }

    /**
     * Tells whether {@code c} is one of the entries of {@code SUFFIX_LETTERS}.
     *
     * @param c the string to look up
     * @return {@code true} when an entry equal to {@code c} exists
     */
    public static boolean isSuffix(String c) {
        // List.contains uses the same null-safe equality as StringUtils.equals.
        return java.util.Arrays.asList(SUFFIX_LETTERS).contains(c);
    }

    /**
     * Tells whether {@code c} is one of the entries of {@code Single_Words}.
     *
     * @param c the string to look up
     * @return {@code true} when an entry equal to {@code c} exists
     */
    private static boolean isSingleWord(String c) {
        boolean listed = false;
        for (int idx = 0; !listed && idx < Single_Words.length; idx++) {
            listed = StringUtils.equals(Single_Words[idx], c);
        }
        return listed;
    }

    /**
     * Tells whether {@code w} contains at least one non-empty entry of {@code Single_Words} as a
     * substring.
     * <p>
     * Empty entries of the table are skipped: {@code StringUtils.contains(w, "")} is true for every
     * non-null {@code w}, so a single empty entry would otherwise make this method return
     * {@code true} for any non-empty input.
     *
     * @param w the string to inspect; {@code null} and empty strings yield {@code false}
     * @return {@code true} when a non-empty table entry occurs inside {@code w}
     */
    public static boolean isContainSingleWord(String w) {
        if (StringUtils.isEmpty(w)) {
            return false;
        }
        for (String s : Single_Words) {
            // An empty entry matches everything and carries no information; ignore it.
            if (StringUtils.isNotEmpty(s) && StringUtils.contains(w, s)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Runs the registered abbreviation rules against the pair in registration order and returns the
     * result of the first rule that accepts it.
     *
     * @param candidate the pending multi-character token
     * @param lastWord the text that follows {@code candidate}
     * @return the first successful rule result, or {@code MagicWordResult.failResult} when no rule
     *         accepts the pair
     */
    public static MagicWordResult isNeedAddDualWord(String candidate, String lastWord) {
        java.util.Iterator<AbbreviationWord> rules = abreviationWordList.iterator();
        while (rules.hasNext()) {
            MagicWordResult outcome = rules.next().isAccept(candidate, lastWord);
            if (!outcome.isSuccess()) {
                continue;
            }
            return outcome;
        }
        return MagicWordResult.failResult;
    }

    /**
     * One rule for {@code isNeedAddDualWord}: when a candidate ends with {@code dualWord}, the text
     * before that ending ends with one of {@code prefixes}, and the following word ends with one of
     * {@code suffixes}, the rule yields the word {@code dualWord + matchedSuffix}.
     */
    static class AbbreviationWord {

        private final String dualWord;
        private final String[] prefixes;
        private final String[] suffixes;

        /**
         * @param dualWord the ending the candidate must have
         * @param prefixes accepted endings of the candidate text that precedes {@code dualWord}
         * @param suffixes accepted endings of the word that follows the candidate
         */
        public AbbreviationWord(String dualWord, String[] prefixes, String[] suffixes) {
            this.dualWord = dualWord;
            this.prefixes = prefixes;
            this.suffixes = suffixes;
        }

        /**
         * Applies this rule to a candidate and the word that follows it.
         *
         * @param candidate the pending token; {@code null} never matches
         * @param lastWord the text that follows {@code candidate}
         * @return a successful result carrying {@code dualWord + matchedSuffix}, or
         *         {@code MagicWordResult.failResult} when the rule does not apply
         */
        public MagicWordResult isAccept(String candidate, String lastWord) {
            // Null inputs cannot match; rejecting them here also keeps the substring call below safe.
            if (candidate == null || dualWord == null || !StringUtils.endsWith(candidate, dualWord)) {
                return MagicWordResult.failResult;
            }
            String endWord = null;
            boolean isFind = false;
            for (String s : suffixes) {
                if (StringUtils.endsWith(lastWord, s)) {
                    isFind = true;
                    endWord = s;
                    break;
                }
            }
            if (!isFind) {
                return MagicWordResult.failResult;
            }
            // Strip exactly the dualWord ending. The candidate is known to end with dualWord, so the
            // end index is never negative; for a one-character dualWord this equals the previous
            // "length() - 1", while an empty dualWord no longer throws on an empty candidate.
            String tmp = candidate.substring(0, candidate.length() - dualWord.length());
            for (String s : prefixes) {
                if (StringUtils.endsWith(tmp, s)) {
                    return new MagicWordResult(dualWord + endWord, true);
                }
            }

            return MagicWordResult.failResult;
        }
    }

    /**
     * Outcome of applying an abbreviation rule: a success flag plus the word produced by the rule.
     */
    static class MagicWordResult {

        /**
         * Shared "no match" instance (no word, not successful). The instance is mutable through the
         * setters below, so callers should treat it as read-only.
         */
        public static final MagicWordResult failResult = new MagicWordResult(null, false);

        private String word;
        private boolean success;

        /**
         * @param word the word produced by the rule, or {@code null} when there is none
         * @param success whether the rule accepted its input
         */
        MagicWordResult(String word, boolean success) {
            this.success = success;
            this.word = word;
        }

        /** @return the word produced by the rule, possibly {@code null} */
        public String getWord() {
            return this.word;
        }

        /** @param word the word to store */
        public void setWord(String word) {
            this.word = word;
        }

        /** @return whether the rule accepted its input */
        public boolean isSuccess() {
            return this.success;
        }

        /** @param success the success flag to store */
        public void setSuccess(boolean success) {
            this.success = success;
        }
    }
}