// Java tutorial
/** * This is free and unencumbered software released into the public domain. * * Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either * in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and * by any means. * * In jurisdictions that recognize copyright laws, the author or authors of this software dedicate * any and all copyright interest in the software to the public domain. We make this dedication for * the benefit of the public at large and to the detriment of our heirs and successors. We intend * this dedication to be an overt act of relinquishment in perpetuity of all present and future * rights to this software under copyright law. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * For more information, please refer to <http://unlicense.org/> */ package com.github.pffy.chinese.freq; import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import org.json.JSONException; import org.json.JSONObject; import org.json.JSONTokener; /** * ChineseFrequency.java - Builds Chinese character frequency list. 
* * @author The Pffy Authors * @version 2.2 * */ public final class ChineseFrequency { private final String CRLF = System.lineSeparator(); private final int PADSIZE_SUMMARY = 25; private final int PADSIZE_FREQ = 5; private final String MSG_FILE_NOT_LOADED = "File not loaded."; private final String MSG_TOTAL_COUNT = "Total Characters"; private final String MSG_REMOVED_COUNT = "Characters Removed"; private final String MSG_HANZI_COUNT = "Hanzi Characters"; private final String MSG_UNIQUE_COUNT = "Unique Hanzi Characters"; private final String MSG_PROCESSED_COUNT = "Characters Processed"; private final String HEADER_ROW_CSV = "hz,py,freq"; private final String HEADER_ROW_TSV = "hz\tpy\tfreq"; private final String HEADER_ROW_TXT = padSummary("hz [py]", this.PADSIZE_SUMMARY) + "freq"; private final String hpjson = "/json/IdxHanyuPinyin.json"; private final String xpjson = "/json/IdxExtraPinyin.json"; // Objects private JSONObject hpdx = new JSONObject(); private JSONObject xpdx = new JSONObject(); // Debug Info private Set<String> unmappedCharacters = new HashSet<String>(); // Counts private int inputCount = 0; private int removedCount = 0; private int hanziCount = 0; private int uniqueHanziCount = 0; private int processedCount = 0; // outputs private String summary = ""; private String csvOutput = ""; private String tsvOutput = ""; private String txtOutput = ""; // inputs private String input = ""; /** * Builds this object. */ public ChineseFrequency() { init(); } /** * Builds this object with an input String. * * @param text input */ public ChineseFrequency(String str) { init(); setInput(str); } /** * Returns the string representation of this object. */ @Override public String toString() { return getSummary(); } /** * Returns the summary of counts based on input text. * * @return a summary of character counts */ public String getSummary() { return this.summary; } /** * Returns the CSV-formatted output of the Chinese character frequency counts. 
First row displays * the headers "hz,py,freq", followed by more rows of data. * * @return text output */ public String getCsvOutput() { return this.csvOutput; } /** * Returns the TSV-formatted output of the Chinese character frequency counts. First row displays * the headers "hz\tpy\tfreq", followed by more rows of data. * * @return text output */ public String getTsvOutput() { return this.tsvOutput; } /** * Returns padded text output of the Chinese character frequency counts. Rows padded to match * summary. * * @return text output */ public String getTxtOutput() { return this.txtOutput; } /** * Sets the Hanzi text to be analyzed. Then, runs Chinese character frequency analysis. * * @param String text input * @return this object * @throws NullPointerException */ public final ChineseFrequency setInput(String str) throws NullPointerException { if (str == null) { throw new NullPointerException(); } this.input = str; analyze(); return this; } /** * Returns total number of characters in input text. * * @return number of input characters */ public int getInputCount() { return this.inputCount; } /** * Returns total number of characters removed by input text. The characters removed comprise * alphanumeric characters, punctuation, and other symbols that should not be counted. * * @return number of characters removed from input text */ public int getRemovedCount() { return this.removedCount; } /** * Returns total number of Hanzi (Chinese characters) ready for counting. * * @return number of Chinese characters remaining after pre-processing */ public int getHanziCount() { return this.hanziCount; } /** * Returns total number of unique Hanzi that will be counted. * * @return number of unique Chinese characters */ public int getUniqueHanziCount() { return this.uniqueHanziCount; } /** * Returns total number of actual Hanzi processed. Functions as a checksum. 
<i>Should match number * of unique Hanzi.</i> Otherwise, the character (or other extra data) has not been mapped by the * idx. * * @return number of Chinese characters processed */ public int getProcessedCount() { return this.processedCount; } /** * Returns list of unmapped characters. * * @return a String list of characters that were not processed */ public Set<String> getUnmappedCharacters() { return this.unmappedCharacters; } private void analyze() { int inputCount = 0; int removedCount = 0; int hanziCount = 0; int uniqueHanziCount = 0; int processedCount = 0; int freq = 0; String csvOutput = this.HEADER_ROW_CSV; String tsvOutput = this.HEADER_ROW_TSV; String txtOutput = this.HEADER_ROW_TXT; String csv, tsv, txt; String str, input, pinyin, hanzi; Scanner sc; List<String> hanziList; Map<String, Integer> freqMap; JSONObject hpdx; String[] arr; Set<String> unmappedCharacters; hpdx = this.hpdx; input = this.input; inputCount = input.length(); input = retainHanzi(input); removedCount = inputCount - input.length(); hanziCount = input.length(); sc = new Scanner(input); sc.useDelimiter(""); hanziList = new ArrayList<String>(); freqMap = new HashMap<String, Integer>(); // counts occurrences while (sc.hasNext()) { str = sc.next(); hanziList.add(str); if (freqMap.containsKey(str)) { freqMap.put(str, (Integer) freqMap.get(str).intValue() + 1); } else { freqMap.put(str, 1); } } // done with Scanner sc.close(); uniqueHanziCount = freqMap.keySet().size(); SortedMap<String, String> freqTreeMap = new TreeMap<String, String>(Collections.reverseOrder()); unmappedCharacters = new HashSet<String>(); for (Entry<String, Integer> counts : freqMap.entrySet()) { try { hanzi = counts.getKey(); pinyin = hpdx.getString(hanzi); } catch (JSONException je) { // add this unmapped character to the list unmappedCharacters.add(counts.getKey()); // not idx mapped yet. that's ok. move on. continue; } if (pinyin.isEmpty()) { // if character is unmapped in idx, do not process. 
continue; } freq = counts.getValue(); freqTreeMap.put(String.format("%" + this.PADSIZE_FREQ + "s", freq).replace(' ', '0') + "-" + hanzi + "-" + pinyin, hanzi + "," + pinyin + "," + freq); processedCount++; } // outputs for (Entry<String, String> outputs : freqTreeMap.entrySet()) { csv = this.CRLF + outputs.getValue(); csvOutput += csv; tsv = csv.replaceAll(",", "\t"); tsvOutput += tsv; arr = csv.split(","); // arr[0] is hanzi. arr[1] is pinyin. arr[2] is freq. txt = padSummary(arr[0] + " [" + arr[1] + "]", this.PADSIZE_SUMMARY + 1) + arr[2]; txtOutput += txt; } // cleanup csvOutput = csvOutput.trim(); tsvOutput = tsvOutput.trim(); txtOutput = txtOutput.trim(); // post-process this.csvOutput = csvOutput; this.tsvOutput = tsvOutput; this.txtOutput = txtOutput; // counts this.inputCount = inputCount; this.removedCount = removedCount; this.hanziCount = hanziCount; this.uniqueHanziCount = uniqueHanziCount; this.processedCount = processedCount; this.unmappedCharacters = unmappedCharacters; // summary String summaryString = ""; summaryString += padSummary(this.MSG_TOTAL_COUNT, this.PADSIZE_SUMMARY) + inputCount; summaryString += this.CRLF + padSummary(this.MSG_REMOVED_COUNT, this.PADSIZE_SUMMARY) + removedCount; summaryString += this.CRLF + padSummary(this.MSG_HANZI_COUNT, this.PADSIZE_SUMMARY) + hanziCount; summaryString += this.CRLF + padSummary(this.MSG_UNIQUE_COUNT, this.PADSIZE_SUMMARY) + uniqueHanziCount; summaryString += this.CRLF + padSummary(this.MSG_PROCESSED_COUNT, this.PADSIZE_SUMMARY) + processedCount; this.summary = summaryString; } // pads text right private String padSummary(String str, int size) { return String.format("%-" + size + "s", str).replace(' ', ' ') + " : "; } // removals non-Hanzi characters. 
private String retainHanzi(String str) { Iterator<?> keys; String extra; // more elegant punctuation removal solution found here: // http://stackoverflow.com/a/9881012 // removes ASCII letters, numbers, and punctuation str = str.replaceAll("[a-zA-Z0-9]|[\\p{Punct}]", ""); keys = this.xpdx.keys(); // remove all extra characters while (keys.hasNext()) { extra = (String) keys.next(); str = str.replace(extra, ""); } // remove all spaces return str.replaceAll("\\s{1,}", ""); } // startup method private void init() { try { // load idx files this.xpdx = loadIdx(this.xpjson); this.hpdx = loadIdx(this.hpjson); } catch (Exception ex) { System.err.println(this.MSG_FILE_NOT_LOADED + ex.getMessage()); } } // loads JSON idx files into JSONObjects private JSONObject loadIdx(String str) { JSONObject jo; InputStream is; is = getClass().getResourceAsStream(str); jo = new JSONObject(new JSONTokener(is)); return jo; } }