Back to project page droidling.
The source code is released under:
Copyright (c) 2012 Keith Trnka. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software... (license text truncated in this listing)
If you think the Android project droidling listed on this page is inappropriate, such as containing malicious code/tools or violating the copyright, please email info at java2s dot com, thanks.
package com.github.ktrnka.droidling; /* www . java2 s . co m*/ import static com.github.ktrnka.droidling.Tokenizer.tokenize; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.EOFException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Locale; public class WordDistribution { private int total; private HashMap<String, int[]> counts; public WordDistribution() { counts = new HashMap<String, int[]>(); total = 0; } public WordDistribution(InputStream fin, boolean isBinary) throws IOException { this(); if (!isBinary) { BufferedReader in = new BufferedReader(new InputStreamReader(fin, Charset.forName("UTF-8")), 8192); String line = in.readLine(); total = Integer.parseInt(line); while ((line = in.readLine()) != null) { int tab = line.indexOf('\t'); if (tab != -1) { String word = line.substring(0, tab); String numeric = line.substring(tab + 1); counts.put(word, new int[] { Integer.parseInt(numeric) }); } // note: I originally used String.split for this but the // performance was AWFUL (like 0.7s vs 2.4s) } in.close(); } else { DataInputStream in = new DataInputStream(new BufferedInputStream(fin)); try { total = in.readInt(); String word; while ((word = in.readUTF()) != null) { counts.put(word, new int[] { in.readInt() }); } in.close(); } catch (EOFException exc) { // this is normal for some awful reason } finally { in.close(); } } } public void train(String filename) throws IOException { BufferedReader in = new BufferedReader(new FileReader(filename), 8192); // compute counts String line; while ((line = in.readLine()) 
!= null) { // hack for Enron texts line = line.replaceFirst(".*\t", ""); ArrayList<String> tokens = tokenize(line); for (String token : tokens) { token = token.toLowerCase(Locale.getDefault()); if (counts.containsKey(token)) counts.get(token)[0]++; else counts.put(token, new int[] { 1 }); } } // compute total for (int[] value : counts.values()) total += value[0]; in.close(); } /** * Save to the specified file as a UTF-8 list of one word then frequency per * line, with a total at the top * * @param filename * @throws IOException */ public void save(String filename) throws IOException { ArrayList<String> words = getSortedWords(); PrintWriter out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(filename), Charset.forName("UTF-8"))); out.println(total); for (String word : words) { out.println(word + "\t" + counts.get(word)[0]); } out.close(); } public void saveBinary(String filename) throws IOException { ArrayList<String> words = getSortedWords(); DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream( filename))); out.writeInt(total); for (String word : words) { out.writeUTF(word); out.writeInt(counts.get(word)[0]); } out.close(); } /** * Compute the list of words, ordered descening by frequency. 
* * @return */ public ArrayList<String> getSortedWords() { ArrayList<String> words = new ArrayList<String>(counts.keySet()); Collections.sort(words, new Comparator<String>() { public int compare(String a, String b) { return counts.get(b)[0] - counts.get(a)[0]; } }); return words; } public double expectedFrequency(String word1, String word2, double localTotal) { return getSmoothProb(word1) * getSmoothProb(word2) * localTotal; } public double expectedFrequency(String word, double localTotal) { return getSmoothProb(word) * localTotal; } public double expectedFrequency(String w1, String w2, String w3, double localTotal) { return getSmoothProb(w1) * getSmoothProb(w2) * getSmoothProb(w3) * localTotal; } public double getSmoothProb(String word) { if (counts.containsKey(word)) return counts.get(word)[0] / (double) total; else return 0.5 / total; } /** * Train a unigram model from text. Sadly, I can't run this from the Eclispe * Android project or I don't know how to. * * @param args */ public static void main(String[] args) throws Exception { WordDistribution dist = new WordDistribution(); dist.train("C:/Users/keith.trnka/Documents/corpora/enronmobile/enronmobile/mobile_orig_simple.txt"); dist.save("C:/Users/keith.trnka/workspace/PersonalLinguistics/assets/unigrams.utf8.txt"); dist.saveBinary("C:/Users/keith.trnka/workspace/PersonalLinguistics/assets/unigrams.bin"); } }