Calculating Word Frequencies with Regular Expressions
import java.io.FileInputStream;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WordCount {
public static void main(String args[]) throws Exception {
String filename = "WordCount.java";
// Map File from filename to byte buffer
FileInputStream input = new FileInputStream(filename);
FileChannel channel = input.getChannel();
int fileLength = (int) channel.size();
MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0,
fileLength);
// Convert to character buffer
Charset charset = Charset.forName("ISO-8859-1");
CharsetDecoder decoder = charset.newDecoder();
CharBuffer charBuffer = decoder.decode(buffer);
// Create line pattern
Pattern linePattern = Pattern.compile(".*$", Pattern.MULTILINE);
// Create word pattern
Pattern wordBreakPattern = Pattern.compile("[\\p{Punct}\\s}]");
// Match line pattern to buffer
Matcher lineMatcher = linePattern.matcher(charBuffer);
Map map = new TreeMap();
Integer ONE = new Integer(1);
// For each line
while (lineMatcher.find()) {
// Get line
CharSequence line = lineMatcher.group();
// Get array of words on line
String words[] = wordBreakPattern.split(line);
// For each word
for (int i = 0, n = words.length; i < n; i++) {
if (words[i].length() > 0) {
Integer frequency = (Integer) map.get(words[i]);
if (frequency == null) {
frequency = ONE;
} else {
int value = frequency.intValue();
frequency = new Integer(value + 1);
}
map.put(words[i], frequency);
}
}
}
System.out.println(map);
}
}
Related examples in the same category