Java tutorial
/* * Copyright 2011-2016 ZuoBian.com All right reserved. This software is the confidential and proprietary information of * ZuoBian.com ("Confidential Information"). You shall not disclose such Confidential Information and shall use it only * in accordance with the terms of the license agreement you entered into with ZuoBian.com. */ package com.zb.jcseg.util; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import org.apache.commons.lang.CharUtils; import org.apache.commons.lang.StringUtils; /** * ?? * * @author zxc Sep 3, 2014 2:11:34 PM */ public class WordUnionUtils { // ?? static String[] SUFFIX_LETTERS = new String[] { "", "", "", "", "", "", "", "?", "", "?", "", "", "", "", "?", "", "?", "", "", "", "", "", "", "", "", "", "", "", "", "?" }; // ?? static String[] PREFIX_LETTERS = new String[] { "", "?", "", "", /* "", "", */"", "", /* * "", * "?" * , */ "?", "", "", "", "", "", /* "", */"", "", "", "", "", "", "?", "?", "?", "", "?", "" }; // static String[] Single_Words = new String[] { "", "", "", "?", "", "", "", "", "", // "", "", "", "?", "", "", "", "", "", "?", "", "", "", "", "", "", "", "", "", "", "", // "", "", "", "", "", "", "", "?", "", "", "", "", "", "?" // static String[] Single_Words = new String[] { "", "", "", "", "", "", "", "", "", "?", "", "", "", "", "", "", "?", "", "", "", "", "", "", "", "", "?", "", "", "", "", "", "", "", "?", "", "", "", "?", "", "", "", "", "", "", "", "", "", "", "", "", "?", "?", "", "", "", "", "", "", "", "?", "", "", "", "", "", "", "?", "", "", "", "", "", "", "", "", "", "?", "", "", "", "", "", "", "", "", "", "" // /* "", "", "", "", "?", "?", "", "", "", "", "?" */ // ??? }; // ??? static List<AbbreviationWord> abreviationWordList = new ArrayList<AbbreviationWord>(); static { abreviationWordList.add(new AbbreviationWord("", new String[] { "", "" }, new String[] { "" })); } // ?? static HashSet<String> unitSet = new HashSet<String>(); static { String[] units = { "", "", "", "", "", "", "", "", "", "", "", "", "?", "", "", "", "", "", "?", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "?", "", "?", "", "", "", "", "", "", "", "", "?", "?", "", "", "?", "?", "?", "?", "", "?", "", "", "", "", "", "", "", "", "", "", "", "", "", /* "","", */"", /* "", */"", "", "", "", "?", "", /* * "", */ "", "", "", "", "", "", "?", "", "", "", "?", "", "", "", "", "", "", "?", "?", "?", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "?", "", "?", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "?", "", "?", "", "", "", "", "", "?", "", "", "", "?" }; for (String c : units) { unitSet.add(c); } } /** * ???? * * @param segWords * @return */ @SuppressWarnings("unused") public static List<String> wiselyCombineWords(List<String> segWords) { if (segWords.isEmpty()) { return segWords; } List<String> splitWordsList = new ArrayList<String>(segWords.size()); StringBuilder sb = new StringBuilder(); String candidate = ""; String space = " "; // ?--???? for (int i = 0, size = segWords.size(); i < size; i++) { String word = segWords.get(i); // } // for (String word : segWords) { // ??1 if (StringUtils.length(word) > 1) { // if (sb.length() > 0) { // ??? String tmpStr = sb.toString(); if (sb.length() == 1) { // ??? if (isSingleWord(tmpStr)) { addStr2List(splitWordsList, candidate); addStr2List(splitWordsList, tmpStr); } // ???? else if (isPrefixLetter(tmpStr)) { if (StringUtils.isNotBlank(candidate)) { addStr2List(splitWordsList, candidate); } // ? word = tmpStr + word; } else { if (isEndWithDigit(candidate)) { // ?? if (unitSet.contains(tmpStr) || StringUtils.length(word) > 1) { candidate = candidate + tmpStr; addStr2List(splitWordsList, candidate); } else { if (StringUtils.isNotBlank(candidate)) { addStr2List(splitWordsList, candidate); } // ?? word = tmpStr + word; } } else { handleSingleWord(splitWordsList, candidate, tmpStr); } } } else { addStr2List(splitWordsList, candidate); addStr2List(splitWordsList, tmpStr); } sb.setLength(0); } else { if (StringUtils.isNotBlank(candidate)) { addStr2List(splitWordsList, candidate); } } candidate = word; } else { { int k = i + 1; // ??? int singleWordCount = 1; for (int j = k; j < size; j++) { if (StringUtils.length(segWords.get(j)) != 1) { break; } singleWordCount++; } if (singleWordCount > 1) { int exIndex = i; boolean canAllBeTreatedAsSingle = true; for (int j = i, stop = i + singleWordCount; j < stop; j++) { if (!isSingleWord(segWords.get(j))) { exIndex = j; canAllBeTreatedAsSingle = false; break; } } // ??? if (canAllBeTreatedAsSingle) { addStr2List(splitWordsList, candidate); candidate = ""; for (int stop = i + singleWordCount; i < stop; i++) { addStr2List(splitWordsList, segWords.get(i)); } i--; } else { // TODO zxc:? for (int stop = i + singleWordCount; i < stop; i++) { sb.append(segWords.get(i)); } i--; addStr2List(splitWordsList, candidate); addStr2List(splitWordsList, sb.toString()); sb.setLength(0); candidate = ""; } continue; } } if (StringUtils.equals(word, space)) { String tmpStr = sb.toString(); if (sb.length() == 1) { candidate = candidate + tmpStr; addStr2List(splitWordsList, candidate); } else { if (StringUtils.isNotBlank(candidate)) { addStr2List(splitWordsList, candidate); } if (sb.length() > 1) { addStr2List(splitWordsList, tmpStr); } } splitWordsList.add(space); sb.setLength(0); candidate = ""; } else { // 2013 // if (index + 1 < size) { // nextWordIsSingle = StringUtils.length(segWords.get(index + 1)) == 1; // } else { // nextWordIsSingle = false; // } if (sb.length() > 0) { String header = sb.substring(0, 1); // ? if (isSingleWord(header)) { addStr2List(splitWordsList, candidate); candidate = ""; addStr2List(splitWordsList, header); sb.deleteCharAt(0); } } sb.append(word); } } } boolean isCandidateNotNull = StringUtils.length(candidate) > 0; boolean isSbNotNull = sb.length() > 0; if (isCandidateNotNull) { if (isSbNotNull) { String w = sb.toString(); if (isSingleWord(w)) { addStr2List(splitWordsList, candidate); addStr2List(splitWordsList, w); } else { handleSingleWord(splitWordsList, candidate, w); } } else { addStr2List(splitWordsList, candidate); } } else if (isSbNotNull) { addStr2List(splitWordsList, sb.toString()); } return splitWordsList; } private static void handleSingleWord(List<String> splitWordsList, String candidate, String singleWord) { MagicWordResult result = isNeedAddDualWord(candidate, singleWord); if (result.isSuccess()) { addStr2List(splitWordsList, candidate); addStr2List(splitWordsList, result.getWord()); } else { if (StringUtils.length(singleWord) <= 1) { // TODO zxc???? addStr2List(splitWordsList, candidate); addStr2List(splitWordsList, singleWord); // candidate = candidate + singleWord; // addStr2List(splitWordsList, candidate); } else { addStr2List(splitWordsList, candidate); addStr2List(splitWordsList, singleWord); } } } /** * ? * * @param splitWordsList * @param candidate */ public static void addStr2List(List<String> splitWordsList, String candidate) { if (StringUtils.isBlank(candidate)) { return; } // ???? // for (String word : splitWordsList) { // if (word.contains(candidate)) { // return; // } // } splitWordsList.add(candidate); } /** * ?????? * * @param c * @return */ private static boolean isPrefixLetter(String c) { for (String word : PREFIX_LETTERS) { if (StringUtils.equals(word, c)) { return true; } } return false; } /** * ?? -- M7 * * @param c * @return */ public static boolean isEndWithAsciiAlphanumeric(String str) { if (StringUtils.isEmpty(str)) { return false; } return CharUtils.isAsciiAlphanumeric(str.charAt(str.length() - 1)); } private static boolean isEndWithDigit(String str) { if (StringUtils.isEmpty(str)) { return false; } return Character.isDigit(str.charAt(str.length() - 1)); } public static boolean isStartWithDigit(String str) { if (StringUtils.isEmpty(str)) { return false; } return Character.isDigit(str.charAt(0)); } public static boolean isEndWithDigital(String str, String c) { if (StringUtils.isEmpty(str)) { return false; } return CharUtils.isAsciiNumeric(str.charAt(str.length() - 1)) && unitSet.contains(c); } public static boolean isSuffix(String c) { for (String word : SUFFIX_LETTERS) { if (StringUtils.equals(word, c)) { return true; } } return false; } /** * ???? * * @param c * @return */ private static boolean isSingleWord(String c) { for (String word : Single_Words) { if (StringUtils.equals(word, c)) { return true; } } return false; } // ?? public static boolean isContainSingleWord(String w) { if (StringUtils.length(w) <= 0) { return false; } for (String s : Single_Words) { if (StringUtils.contains(w, s)) { return true; } } return false; } // ??? public static MagicWordResult isNeedAddDualWord(String candidate, String lastWord) { for (AbbreviationWord word : abreviationWordList) { MagicWordResult result = word.isAccept(candidate, lastWord); if (result.isSuccess()) { return result; } } return MagicWordResult.failResult; } static class AbbreviationWord { private String dualWord; private String[] prefixes; private String[] suffixes; public AbbreviationWord(String dualWord, String[] prefixes, String[] suffixes) { this.dualWord = dualWord; this.prefixes = prefixes; this.suffixes = suffixes; } public MagicWordResult isAccept(String candidate, String lastWord) { if (!StringUtils.endsWith(candidate, dualWord)) { return MagicWordResult.failResult; } String endWord = null; boolean isFind = false; for (String s : suffixes) { if (StringUtils.endsWith(lastWord, s)) { isFind = true; endWord = s; break; } } if (!isFind) { return MagicWordResult.failResult; } String tmp = candidate.substring(0, candidate.length() - 1); for (String s : prefixes) { if (StringUtils.endsWith(tmp, s)) { return new MagicWordResult(dualWord + endWord, true); } } return MagicWordResult.failResult; } } static class MagicWordResult { public static final MagicWordResult failResult = new MagicWordResult(null, false); MagicWordResult(String word, boolean success) { this.word = word; this.success = success; } private boolean success; private String word; public boolean isSuccess() { return success; } public void setSuccess(boolean success) { this.success = success; } public String getWord() { return word; } public void setWord(String word) { this.word = word; } } }