Java tutorial
package com.zbt.trie.linklist; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.List; import java.util.concurrent.locks.ReentrantLock; import java.util.regex.Pattern; import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.builder.ToStringStyle; /* Copyright 2010 Michael Clerx work@michaelclerx.com This file is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this file. If not, see <http://www.gnu.org/licenses/>. */ /** * Represents an ordered list of words used for no other task than to look up if * a word is on the list. The words are sorted by Unicode value. * * Note: The structure uses a delimiter to signal the end of words internally, * this means the delimiter may never occur in the words passed to the list. * This is the user's own responsibility. * * This data structure is not thread safe in any way. */ public class SimpleTrie extends ReentrantLock implements Serializable { /** * */ private static final long serialVersionUID = 1296217675133313418L; /** * The delimiter used in this word to tell where words end. Without a proper * delimiter either A. a lookup for 'win' would return false if the list * also contained 'windows', or B. a lookup for 'mag' would return true if * the only word in the list was 'magnolia' * * The delimiter should never occur in a word added to the trie. */ public final static char DELIMITER = '\u0001'; /** * Creates a new case sensitive Trie * * @param ignoreCase */ public SimpleTrie() { this(false); } /** * Creates a new Trie. * * @param ignoreCase * Set this to true to make the trie ignore case (warning: this * slows down performance considerably!) */ public SimpleTrie(boolean ignoreCase) { root = new Node('r'); size = 0; this.ignoreCase = ignoreCase; } public boolean addWord(char[] word, int start, int end) { if (word == null || start == end) throw new IllegalArgumentException("Word can't be empty"); end++; if (end > word.length) { char[] dest = new char[end]; System.arraycopy(word, start, dest, 0, end - start - 1); word = dest; } word[end - 1] = DELIMITER; if (add(root, word, start, end)) { size++; int n = end - start; if (n > maxDepth) maxDepth = n; return true; } return false; } private boolean add(Node root, char[] word, int start, int end) { if (start == end) { root.num++; return false; } int c = word[start]; // Search for node to add to Node last = null, next = root.firstChild; while (next != null) { if (next.value < c) { // Not found yet, continue searching last = next; next = next.nextSibling; } else if (next.value == c) { // Match found, add remaining word to this node return add(next, word, start + 1, end); } // Because of the ordering of the list getting here means we won't // find a match else break; } // No match found, create a new node and insert Node node = new Node(c); if (last == null) { // Insert node at the beginning of the list (Works for next == null // too) root.firstChild = node; node.nextSibling = next; } else { // Insert between last and next last.nextSibling = node; node.nextSibling = next; } // Add remaining letters for (int i = start + 1; i < end; i++) { node.firstChild = new Node(word[i]); node = node.firstChild; } return true; } /** * Adds a word to the list. * * @param word * The word to add. * @return True if the word wasn't in the list yet */ public boolean add(String word) { // lock(); try { if (word.length() == 0) throw new IllegalArgumentException("Word can't be empty"); String w = (ignoreCase) ? word.toLowerCase() : word; if (add(root, w + DELIMITER, 0)) { size++; int n = word.length(); if (n > maxDepth) maxDepth = n; return true; } } finally { // unlock(); } return false; } /** * Adds an array of words to the list. * * @param word * The words to add. */ public void addAll(String[] words) { for (String word : words) add(word); } /* * Does the real work of adding a word to the trie */ private boolean add(Node root, String word, int offset) { if (offset == word.length()) { root.num++; return false; } int c = word.charAt(offset); // Search for node to add to Node last = null, next = root.firstChild; while (next != null) { if (next.value < c) { // Not found yet, continue searching last = next; next = next.nextSibling; } else if (next.value == c) { // Match found, add remaining word to this node return add(next, word, offset + 1); } // Because of the ordering of the list getting here means we won't // find a match else break; } // No match found, create a new node and insert Node node = new Node(c); if (last == null) { // Insert node at the beginning of the list (Works for next == null // too) root.firstChild = node; node.nextSibling = next; } else { // Insert between last and next last.nextSibling = node; node.nextSibling = next; } // Add remaining letters for (int i = offset + 1; i < word.length(); i++) { node.firstChild = new Node(word.charAt(i)); node = node.firstChild; } return true; } /** * Removes a word from the list. * * @param word * The word to remove. * @return True if the word was found and removed. */ public boolean remove(String word) { if (word.length() == 0) throw new IllegalArgumentException("Word can't be empty"); String w = (ignoreCase) ? word.toLowerCase() : word; if (remove(root, w + DELIMITER, 0, null, null, null)) { size--; return true; } return false; } /* * Removes a word from the list: searches for the word while retaining * information about the last time it encountered a tree branch, and which * branch of the tree it followed... */ private boolean remove(Node root, String word, int offset, Node branch, Node branchLast, Node branchNext) { if (offset == word.length()) { // Word found, delete entry at last branch if (branch == null) { // No branches found in the tree, only one word! this.root.firstChild = null; } else { if (branchLast == null) branch.firstChild = branchNext; else branchLast.nextSibling = branchNext; } return true; } // Search for word int c = word.charAt(offset); Node last = null, next = root.firstChild; while (next != null) { if (next.value < c) { last = next; next = next.nextSibling; } else if (next.value == c) { // Test if this node had more than one child if (last != null || next.nextSibling != null) { branch = root; branchLast = last; branchNext = next.nextSibling; } return remove(next, word, offset + 1, branch, branchLast, branchNext); } else return false; } return false; } /** * Searches for a word in the list. * * @param word * The word to search for. * @return True if the word was found. */ public boolean isEntry(String word) { if (word.length() == 0) throw new IllegalArgumentException("Word can't be empty"); String w = (ignoreCase) ? word.toLowerCase() : word; return isEntry(root, w + DELIMITER, 0); } /* * Does the real work of determining if a word is in the list */ private boolean isEntry(Node root, String word, int offset) { if (offset == word.length()) return true; int c = word.charAt(offset); // Search for node to add to Node next = root.firstChild; while (next != null) { if (next.value < c) next = next.nextSibling; else if (next.value == c) return isEntry(next, word, offset + 1); else return false; } return false; } /** * Returns all words in the list. If (ignoreCase=true) all words are * returned in lower case. For large lists this will be a fairly slow * operation. * * @return A String array of all words in the list */ public String[] getAll() { ArrayList<String> words = new ArrayList<String>(size); char[] chars = new char[maxDepth]; getAll(root, words, chars, 0); return words.toArray(new String[size]); } /* * Adds any words found in this branch to the array */ private void getAll(Node root, ArrayList<String> words, char[] chars, int pointer) { Node n = root.firstChild; while (n != null) { if (n.firstChild == null) { words.add(new String(chars, 0, pointer)); } else { chars[pointer] = (char) n.value; getAll(n, words, chars, pointer + 1); } n = n.nextSibling; } } public List<Word> getAllWords() { List<Word> words = new ArrayList<Word>(30000); char[] chars = new char[maxDepth]; getAllWords(root, words, chars, 0); return words; } private void getAllWords(Node root, List<Word> words, char[] chars, int pointer) { Node n = root.firstChild; while (n != null) { if (n.firstChild == null) { int num = n.num + 1; Word pair = new Word(new String(chars, 0, pointer), num); words.add(pair); } else { chars[pointer] = (char) n.value; getAllWords(n, words, chars, pointer + 1); } n = n.nextSibling; } } /** * Returns the size of this list; */ public int size() { return size; } /** * Returns all words in this list starting with the given prefix * * @param prefix * The prefix to search for. * @return All words in this list starting with the given prefix, or if no * such words are found, an array containing only the suggested * prefix. */ public String[] suggest(String prefix) { return suggest(root, prefix, 0); } /* * Recursive function for finding all words starting with the given prefix */ private String[] suggest(Node root, String word, int offset) { if (offset == word.length()) { ArrayList<String> words = new ArrayList<String>(size); char[] chars = new char[maxDepth]; for (int i = 0; i < offset; i++) chars[i] = word.charAt(i); getAll(root, words, chars, offset); return words.toArray(new String[words.size()]); } int c = word.charAt(offset); // Search for node to add to Node next = root.firstChild; while (next != null) { if (next.value < c) next = next.nextSibling; else if (next.value == c) return suggest(next, word, offset + 1); else break; } return new String[] {}; } /** * Returns a clone of this trie * * @return A clone of this trie */ public SimpleTrie clone() { SimpleTrie clone = new SimpleTrie(ignoreCase); cloneKids(root, clone.root); clone.size = size; clone.maxDepth = maxDepth; return clone; } // Recursively clones the children of a node private void cloneKids(Node root, Node rootClone) { Node n = root.firstChild; Node nClone = null, insertAfter = null; while (n != null) { // Insert copy of node into clone nClone = new Node(n.value); if (insertAfter == null) rootClone.firstChild = nClone; else insertAfter.nextSibling = nClone; // Repeat for child node if (n.firstChild != null) cloneKids(n, nClone); // Move to next sibling n = n.nextSibling; insertAfter = nClone; } } /** * Searches a string for words present in the trie and replaces them with * stars (asterixes). * * @param z * The string to censor */ public String censor(String s) { if (size == 0) return s; String z = s.toLowerCase(); int n = z.length(); StringBuilder buffer = new StringBuilder(n); int match; char star = '*'; for (int i = 0; i < n;) { match = longestMatch(root, z, i, 0, 0); if (match > 0) { for (int j = 0; j < match; j++) { buffer.append(star); i++; } } else { buffer.append(s.charAt(i++)); } } return buffer.toString(); } /* * Finds the longest matching word in the trie that starts at the given * offset... */ private int longestMatch(Node root, String word, int offset, int depth, int maxFound) { // Uses delimiter = first in the list! Node next = root.firstChild; if (next.value == DELIMITER) maxFound = depth; if (offset == word.length()) return maxFound; int c = word.charAt(offset); while (next != null) { if (next.value < c) next = next.nextSibling; else if (next.value == c) return longestMatch(next, word, offset + 1, depth + 1, maxFound); else return maxFound; } return maxFound; } /* * Represents a node in the trie. Because a node's children are stored in a * linked list this data structure takes the odd structure of node with a * firstChild and a nextSibling. */ private class Node implements Serializable { /** * */ private static final long serialVersionUID = -3282374617523335887L; public int value; public Node firstChild; public Node nextSibling; public int num; public Node(int value) { this.value = value; firstChild = null; nextSibling = null; } public String toString() { return ToStringBuilder.reflectionToString(this, ToStringStyle.MULTI_LINE_STYLE); } } /** * Reads words from a file into the trie. The file is read one line at a * time, a line may contain multiple words separated by whitespace. * * @param file * The file to read * @param strip1 * When added the file will replace any instances of 'strip' from * the file. This can be used to remove quotes or turn commas * into spaces. * @param strip2 * Like strip1 */ public void addFile(String file, String strip1, String strip2) throws IOException { BufferedReader in = new BufferedReader(new FileReader(file)); String line; String[] words; Pattern whitespace = Pattern.compile("\\s+"); Pattern quote1 = Pattern.compile(strip1); Pattern quote2 = Pattern.compile(strip2); String empty = " "; while ((line = in.readLine()) != null) { line = quote1.matcher(line).replaceAll(empty); // line = quote2.matcher(line).replaceAll(empty); words = whitespace.split(line); for (String w : words) if (w.length() > 0) if (!quote2.matcher(w.toLowerCase()).matches()) add(w); } in.close(); } /** * Writes this trie's contents to a file * * @param file * The path to write to * @param delimiter * Every word printed is followed by this delimiter * @param quotes * Every word printed is wrapped in this string */ public void writeFile(String file, String delimiter, String quoteMark) throws IOException { BufferedWriter out = new BufferedWriter(new FileWriter(file)); char[] c = new char[maxDepth]; writeNode(root, out, c, 0, quoteMark, quoteMark + delimiter); out.close(); } private static void writeNode(Node root, java.io.BufferedWriter out, char[] chars, int pointer, String pre, String post) throws java.io.IOException { Node n = root.firstChild; while (n != null) { if (n.firstChild == null) { out.write(pre); out.write(new String(chars, 0, pointer)); out.write(post); } else { chars[pointer] = (char) n.value; writeNode(n, out, chars, pointer + 1, pre, post); } n = n.nextSibling; } } private Node root; private int size; private int maxDepth; // Not exact, but bounding for the maximum private boolean ignoreCase = true; public String toString() { return ToStringBuilder.reflectionToString(this, ToStringStyle.MULTI_LINE_STYLE); } public static void main(String[] args) { char[] aa = { 's', 'h', 'e' }; char[] bb = { 's', 'h', 'e' }; char[] cc = { 's', 'h', 'e' }; char[] dd = { 't', 'e', 's', 't' }; SimpleTrie trie = new SimpleTrie(); trie.add("she"); trie.add("ccc"); trie.add("bbb"); trie.add("aaa"); trie.addWord(aa, 0, 3); trie.addWord(bb, 0, 3); trie.addWord(cc, 0, 3); trie.addWord(dd, 0, 4); String[] prefix = trie.suggest("te"); if (null != prefix) { for (int i = 0; i < prefix.length; i++) { System.out.println(prefix[i]); } } System.out.println(trie.getAllWords()); } }