com.github.pffy.chinese.freq.ChineseFrequency.java Source code

Introduction

Here is the source code for com.github.pffy.chinese.freq.ChineseFrequency.java.
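
A minimal usage sketch follows (it is not part of the original source). It assumes the compiled class and its /json/IdxHanyuPinyin.json and /json/IdxExtraPinyin.json resources are on the classpath; the wrapper class name Demo and the sample input string are illustrative only.

import com.github.pffy.chinese.freq.ChineseFrequency;

public class Demo {

    public static void main(String[] args) {

        // sample input containing Hanzi, ASCII text, and punctuation
        ChineseFrequency cf = new ChineseFrequency("你好，世界！Hello, 世界！");

        // summary of character counts
        System.out.println(cf.getSummary());

        // frequency list as CSV ("hz,py,freq"), most frequent hanzi first
        System.out.println(cf.getCsvOutput());
    }
}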

Source

/**
 * This is free and unencumbered software released into the public domain.
 * 
 * Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either
 * in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and
 * by any means.
 * 
 * In jurisdictions that recognize copyright laws, the author or authors of this software dedicate
 * any and all copyright interest in the software to the public domain. We make this dedication for
 * the benefit of the public at large and to the detriment of our heirs and successors. We intend
 * this dedication to be an overt act of relinquishment in perpetuity of all present and future
 * rights to this software under copyright law.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
 * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 * For more information, please refer to <http://unlicense.org/>
 */

package com.github.pffy.chinese.freq;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;

/**
 * ChineseFrequency.java - Builds a Chinese character frequency list.
 * 
 * @author The Pffy Authors
 * @version 2.2
 * 
 */
public final class ChineseFrequency {

    private final String CRLF = System.lineSeparator();

    private final int PADSIZE_SUMMARY = 25;
    private final int PADSIZE_FREQ = 5;

    private final String MSG_FILE_NOT_LOADED = "File not loaded.";

    private final String MSG_TOTAL_COUNT = "Total Characters";
    private final String MSG_REMOVED_COUNT = "Characters Removed";
    private final String MSG_HANZI_COUNT = "Hanzi Characters";
    private final String MSG_UNIQUE_COUNT = "Unique Hanzi Characters";
    private final String MSG_PROCESSED_COUNT = "Characters Processed";

    private final String HEADER_ROW_CSV = "hz,py,freq";
    private final String HEADER_ROW_TSV = "hz\tpy\tfreq";
    private final String HEADER_ROW_TXT = padSummary("hz [py]", this.PADSIZE_SUMMARY) + "freq";

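    // classpath locations of the JSON pinyin index files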
    private final String hpjson = "/json/IdxHanyuPinyin.json";
    private final String xpjson = "/json/IdxExtraPinyin.json";

    // Objects
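    // hpdx maps each hanzi to its pinyin; xpdx lists extra characters to strip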
    private JSONObject hpdx = new JSONObject();
    private JSONObject xpdx = new JSONObject();

    // Debug Info
    private Set<String> unmappedCharacters = new HashSet<String>();

    // Counts
    private int inputCount = 0;
    private int removedCount = 0;
    private int hanziCount = 0;
    private int uniqueHanziCount = 0;
    private int processedCount = 0;

    // outputs
    private String summary = "";
    private String csvOutput = "";
    private String tsvOutput = "";
    private String txtOutput = "";

    // inputs
    private String input = "";

    /**
     * Builds this object.
     */
    public ChineseFrequency() {
        init();
    }

    /**
     * Builds this object with an input String.
     * 
     * @param str text input
     */
    public ChineseFrequency(String str) {
        init();
        setInput(str);
    }

    /**
     * Returns the string representation of this object.
     */
    @Override
    public String toString() {
        return getSummary();
    }

    /**
     * Returns the summary of counts based on input text.
     * 
     * @return a summary of character counts
     */
    public String getSummary() {
        return this.summary;
    }

    /**
     * Returns the CSV-formatted output of the Chinese character frequency counts. The first row
     * contains the header "hz,py,freq", followed by the data rows.
     * 
     * @return text output
     */
    public String getCsvOutput() {
        return this.csvOutput;
    }

    /**
     * Returns the TSV-formatted output of the Chinese character frequency counts. The first row
     * contains the header "hz\tpy\tfreq", followed by the data rows.
     * 
     * @return text output
     */
    public String getTsvOutput() {
        return this.tsvOutput;
    }

    /**
     * Returns the padded plain-text output of the Chinese character frequency counts. Rows are
     * padded to align with the summary.
     * 
     * @return text output
     */
    public String getTxtOutput() {
        return this.txtOutput;
    }

    /**
     * Sets the Hanzi text to be analyzed, then runs the Chinese character frequency analysis.
     * 
     * @param str text input
     * @return this object
     * @throws NullPointerException if str is null
     */
    public final ChineseFrequency setInput(String str) throws NullPointerException {

        if (str == null) {
            throw new NullPointerException();
        }

        this.input = str;

        analyze();
        return this;
    }

    /**
     * Returns total number of characters in input text.
     * 
     * @return number of input characters
     */
    public int getInputCount() {
        return this.inputCount;
    }

    /**
     * Returns total number of characters removed by input text. The characters removed comprise
     * alphanumeric characters, punctuation, and other symbols that should not be counted.
     * 
     * @return number of characters removed from input text
     */
    public int getRemovedCount() {
        return this.removedCount;
    }

    /**
     * Returns total number of Hanzi (Chinese characters) ready for counting.
     * 
     * @return number of Chinese characters remaining after pre-processing
     */
    public int getHanziCount() {
        return this.hanziCount;
    }

    /**
     * Returns total number of unique Hanzi that will be counted.
     * 
     * @return number of unique Chinese characters
     */
    public int getUniqueHanziCount() {
        return this.uniqueHanziCount;
    }

    /**
     * Returns the total number of Hanzi actually processed. Functions as a checksum: it <i>should
     * match the number of unique Hanzi</i>. Otherwise, some characters (or other extra data) have
     * not been mapped by the idx.
     * 
     * @return number of Chinese characters processed
     */
    public int getProcessedCount() {
        return this.processedCount;
    }

    /**
     * Returns the set of unmapped characters.
     * 
     * @return a Set of characters that were not processed
     */
    public Set<String> getUnmappedCharacters() {
        return this.unmappedCharacters;
    }

    private void analyze() {

        int inputCount = 0;
        int removedCount = 0;
        int hanziCount = 0;
        int uniqueHanziCount = 0;
        int processedCount = 0;

        int freq = 0;

        String csvOutput = this.HEADER_ROW_CSV;
        String tsvOutput = this.HEADER_ROW_TSV;
        String txtOutput = this.HEADER_ROW_TXT;

        String csv, tsv, txt;
        String str, input, pinyin, hanzi;
        Scanner sc;
        List<String> hanziList;
        Map<String, Integer> freqMap;
        JSONObject hpdx;
        String[] arr;

        Set<String> unmappedCharacters;

        hpdx = this.hpdx;

        input = this.input;
        inputCount = input.length();

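        // strip non-Hanzi characters; the removed count is the difference in length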
        input = retainHanzi(input);
        removedCount = inputCount - input.length();

        hanziCount = input.length();

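        // an empty delimiter makes the Scanner return the input one character at a time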
        sc = new Scanner(input);
        sc.useDelimiter("");

        hanziList = new ArrayList<String>();
        freqMap = new HashMap<String, Integer>();

        // counts occurrences
        while (sc.hasNext()) {

            str = sc.next();
            hanziList.add(str);

            if (freqMap.containsKey(str)) {
                freqMap.put(str, freqMap.get(str) + 1);
            } else {
                freqMap.put(str, 1);
            }
        }

        // done with Scanner
        sc.close();

        uniqueHanziCount = freqMap.keySet().size();

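        // sorted map in descending key order, so the most frequent hanzi come first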
        SortedMap<String, String> freqTreeMap = new TreeMap<String, String>(Collections.reverseOrder());

        unmappedCharacters = new HashSet<String>();
        for (Entry<String, Integer> counts : freqMap.entrySet()) {

            try {

                hanzi = counts.getKey();
                pinyin = hpdx.getString(hanzi);

            } catch (JSONException je) {

                // add this unmapped character to the list
                unmappedCharacters.add(counts.getKey());

                // not idx mapped yet. that's ok. move on.
                continue;
            }

            if (pinyin.isEmpty()) {
                // if character is unmapped in idx, do not process.
                continue;
            }

            freq = counts.getValue();

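            // key: zero-padded freq, hanzi, and pinyin joined with "-"; value: the CSV row "hanzi,pinyin,freq"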
            freqTreeMap.put(String.format("%" + this.PADSIZE_FREQ + "s", freq).replace(' ', '0') + "-" + hanzi + "-"
                    + pinyin, hanzi + "," + pinyin + "," + freq);
            processedCount++;
        }

        // outputs
        for (Entry<String, String> outputs : freqTreeMap.entrySet()) {

            csv = this.CRLF + outputs.getValue();
            csvOutput += csv;

            tsv = csv.replaceAll(",", "\t");
            tsvOutput += tsv;

            arr = outputs.getValue().split(",");

            // arr[0] is hanzi. arr[1] is pinyin. arr[2] is freq.
            txt = this.CRLF + padSummary(arr[0] + " [" + arr[1] + "]", this.PADSIZE_SUMMARY) + arr[2];
            txtOutput += txt;
        }

        // cleanup
        csvOutput = csvOutput.trim();
        tsvOutput = tsvOutput.trim();
        txtOutput = txtOutput.trim();

        // post-process
        this.csvOutput = csvOutput;
        this.tsvOutput = tsvOutput;
        this.txtOutput = txtOutput;

        // counts
        this.inputCount = inputCount;
        this.removedCount = removedCount;
        this.hanziCount = hanziCount;
        this.uniqueHanziCount = uniqueHanziCount;
        this.processedCount = processedCount;

        this.unmappedCharacters = unmappedCharacters;

        // summary
        String summaryString = "";

        summaryString += padSummary(this.MSG_TOTAL_COUNT, this.PADSIZE_SUMMARY) + inputCount;
        summaryString += this.CRLF + padSummary(this.MSG_REMOVED_COUNT, this.PADSIZE_SUMMARY) + removedCount;
        summaryString += this.CRLF + padSummary(this.MSG_HANZI_COUNT, this.PADSIZE_SUMMARY) + hanziCount;
        summaryString += this.CRLF + padSummary(this.MSG_UNIQUE_COUNT, this.PADSIZE_SUMMARY) + uniqueHanziCount;
        summaryString += this.CRLF + padSummary(this.MSG_PROCESSED_COUNT, this.PADSIZE_SUMMARY) + processedCount;

        this.summary = summaryString;
    }

    // left-aligns text, padding with spaces to the given width, then appends " : "
    private String padSummary(String str, int size) {
        return String.format("%-" + size + "s", str) + " : ";
    }

    // removes non-Hanzi characters.
    private String retainHanzi(String str) {

        Iterator<?> keys;
        String extra;

        // more elegant punctuation removal solution found here:
        // http://stackoverflow.com/a/9881012

        // removes ASCII letters, numbers, and punctuation
        str = str.replaceAll("[a-zA-Z0-9]|[\\p{Punct}]", "");
        keys = this.xpdx.keys();

        // remove all extra characters
        while (keys.hasNext()) {
            extra = (String) keys.next();
            str = str.replace(extra, "");
        }

        // remove all remaining whitespace
        return str.replaceAll("\\s+", "");
    }

    // startup method
    private void init() {

        try {

            // load idx files
            this.xpdx = loadIdx(this.xpjson);
            this.hpdx = loadIdx(this.hpjson);

        } catch (Exception ex) {
            System.err.println(this.MSG_FILE_NOT_LOADED + " " + ex.getMessage());
        }
    }

    // loads JSON idx files into JSONObjects
    private JSONObject loadIdx(String str) throws IOException {

        // try-with-resources ensures the classpath stream is closed after parsing
        try (InputStream is = getClass().getResourceAsStream(str)) {
            return new JSONObject(new JSONTokener(is));
        }
    }
}