com.icantrap.collections.dawg.DawgBuilder.java Source code

Java tutorial

Introduction

Here is the source code for com.icantrap.collections.dawg.DawgBuilder.java

Source

// LICENSE: GPLv3. http://www.gnu.org/licenses/gpl-3.0.txt

package com.icantrap.collections.dawg;

import org.apache.commons.io.LineIterator;

import java.io.*;
import java.util.*;

/**
 * Builds a {@link Dawg} from scratch.  Words are first added to a trie; when {@link #build()} is called the trie is
 * compressed into a dawg by merging equivalent nodes.
 *
 * <p>Words are stored upper-cased using {@link Locale#ROOT}, so the resulting dawg does not depend on the default
 * locale of the JVM that built it.
 *
 * <p>NOTE(review): {@link #build()} rewires the internal trie so that equivalent nodes are shared.  Adding more words
 * after a build is not guarded against here; treat a builder as finished once it has been built.
 */
class DawgBuilder {
    private final Node root = new Node((char) 0);
    private int wordCount = 0;

    // compression internals; both are reset at the start of every compress() run
    private final List<Node> nodeList = new ArrayList<Node>();
    private final Map<Integer, List<Node>> childDepths = new LinkedHashMap<Integer, List<Node>>();

    public DawgBuilder() {
    }

    /**
     * Adds a new word to the dawg.  Checks for a duplicate entry first.  Will not add null or words under two
     * characters.  The word is upper-cased (locale-independently) before it is stored.
     *
     * @param word the word to add
     * @return the builder.
     */
    public DawgBuilder add(String word) {
        String normalized = normalize(word);
        if (null == normalized) {
            return this;
        }

        Node ptr = root;
        boolean found = true;

        for (int i = 0; i < normalized.length(); ++i) {
            char ch = normalized.charAt(i);
            Node node = ptr.findChild(ch);

            if (null == node) {
                // first missing letter: from here on the path is created
                found = false;
                node = ptr.addChild(ch);
            }
            ptr = node;
        }

        // only count the word if it was not already present as a complete word
        if (!(found && ptr.terminal)) {
            ptr.terminal = true;
            ++wordCount;
        }

        return this;
    }

    /**
     * Adds an array of words to the dawg.  Delegates to add.
     *
     * @param words the array of words to add
     * @return the builder
     * @see DawgBuilder#add(String)
     */
    public DawgBuilder add(String[] words) {
        for (String word : words) {
            add(word);
        }

        return this;
    }

    /**
     * Adds a collection of words to the dawg.  Delegates to add.
     *
     * @param words the collection of words to add
     * @return the builder
     * @see DawgBuilder#add(String)
     */
    public DawgBuilder add(Collection<String> words) {
        for (String word : words) {
            add(word);
        }

        return this;
    }

    /**
     * Adds every remaining line of a LineIterator as a word.
     *
     * @param wordIter the line iterator
     * @return the builder
     * @see LineIterator
     */
    public DawgBuilder add(LineIterator wordIter) {
        while (wordIter.hasNext()) {
            add(wordIter.next());
        }

        return this;
    }

    /**
     * Adds words from a newline-delimited InputStream.  The bytes are decoded with the platform default charset.
     *
     * @param is the stream with the words
     * @return the builder
     * @throws java.io.IOException if reading from the InputStream causes an IOException
     */
    public DawgBuilder add(InputStream is) throws IOException {
        return add(new InputStreamReader(is));
    }

    /**
     * Adds words from a newline-delimited Reader.  The reader is read to the end but is not closed here; closing it
     * remains the caller's responsibility.
     *
     * @param reader the reader with the words
     * @return the builder
     * @throws java.io.IOException if reading from the Reader causes an IOException
     */
    public DawgBuilder add(Reader reader) throws IOException {
        BufferedReader br = new BufferedReader(reader, 12 * 1024);
        String line;

        while ((line = br.readLine()) != null) {
            add(line);
        }

        return this;
    }

    /**
     * The number of words - so far - that will be built into this Dawg.
     *
     * @return the number of words.
     */
    public int wordCount() {
        return wordCount;
    }

    /**
     * The number of nodes - currently - in the structure that will become the Dawg.  The count is obtained by
     * walking the structure from the root (which is itself counted); no record of visited nodes is kept.
     *
     * @return the number of nodes
     */
    public int nodeCount() {
        int nodeCount = 0;
        Deque<Node> stack = new ArrayDeque<Node>();
        stack.push(root);

        while (!stack.isEmpty()) {
            Node ptr = stack.pop();
            ++nodeCount;

            for (Node nextChild : ptr.nextChildren) {
                stack.push(nextChild);
            }
            if (null != ptr.child) {
                stack.push(ptr.child);
            }
        }

        return nodeCount;
    }

    /**
     * Returns whether the word would be contained in the Dawg being built.  The lookup applies the same
     * normalization as {@link #add(String)}: null and words under two characters are never contained, and the word
     * is upper-cased before the search.
     *
     * @param word the word to check
     * @return true, if the word is contained; false, otherwise.
     */
    boolean contains(String word) {
        String normalized = normalize(word);
        if (null == normalized) {
            return false;
        }

        Node ptr = root;

        for (int i = 0; i < normalized.length(); ++i) {
            ptr = ptr.findChild(normalized.charAt(i));
            if (null == ptr) {
                return false;
            }
        }

        return ptr.terminal;
    }

    /**
     * Applies the shared acceptance rule and case folding used by add and contains.
     *
     * @param word the raw word, possibly null
     * @return the upper-cased word, or null if the word is null or shorter than two characters
     */
    private static String normalize(String word) {
        if ((null == word) || (word.length() < 2)) {
            return null;
        }

        // Locale.ROOT keeps the mapping stable; the no-arg toUpperCase() follows the JVM default locale
        return word.toUpperCase(Locale.ROOT);
    }

    /**
     * Compresses the trie in place: indexes the nodes, computes child depths, bins the nodes by child depth and
     * then merges equivalent nodes within each bin.
     */
    private void compress() {
        // start from a clean slate so a repeated build() does not process entries left by an earlier run
        nodeList.clear();
        childDepths.clear();

        indexBreadthFirst();
        assignChildDepths();
        binByChildDepth();
        mergeEquivalentNodes();
    }

    /** Walks the structure breadth-first from the root, assigning index and sibling count and filling nodeList. */
    private void indexBreadthFirst() {
        Deque<Node> queue = new ArrayDeque<Node>();
        int index = 0;

        queue.addLast(root);
        while (!queue.isEmpty()) {
            Node ptr = queue.removeFirst();

            ptr.index = index++;
            if (root != ptr) {
                // all children of the parent (nextChildren plus the optional child), minus this node itself
                ptr.siblings = ptr.parent.nextChildren.size() - 1 + (null == ptr.parent.child ? 0 : 1);
            }
            nodeList.add(ptr);

            for (Node nextChild : ptr.nextChildren) {
                queue.addLast(nextChild);
            }
            if (null != ptr.child) {
                queue.addLast(ptr.child);
            }
        }
    }

    /** Sets childDepth to 0 on terminal nodes and raises each ancestor's childDepth to its distance from them. */
    private void assignChildDepths() {
        for (Node node : nodeList) {
            if (!node.terminal) {
                continue;
            }

            node.childDepth = 0;

            Node ptr = node;
            int depth = 0;
            while (root != ptr) {
                ptr = ptr.parent;
                ++depth;
                if (depth > ptr.childDepth) {
                    ptr.childDepth = depth;
                } else {
                    // this ancestor already records a depth at least this large; stop climbing
                    break;
                }
            }
        }
    }

    /** Groups the nodes of nodeList into childDepths, keyed by their childDepth. */
    private void binByChildDepth() {
        for (Node node : nodeList) {
            List<Node> bin = childDepths.get(node.childDepth);
            if (null == bin) {
                bin = new ArrayList<Node>();
                childDepths.put(node.childDepth, bin);
            }
            bin.add(node);
        }
    }

    /**
     * Goes through the bins from depth 0 upwards and, within each bin, redirects the parent of every later node that
     * equals an earlier one to that earlier node.
     */
    private void mergeEquivalentNodes() {
        int maxDepth = -1;
        for (int depth : childDepths.keySet()) {
            if (depth > maxDepth) {
                maxDepth = depth;
            }
        }

        for (int depth = 0; depth <= maxDepth; ++depth) {
            List<Node> bin = childDepths.get(depth);
            if (null == bin) {
                continue;
            }

            int size = bin.size();
            for (int i = 0; i < size; ++i) {
                Node pickNode = bin.get(i);
                if (!isMergeCandidate(pickNode)) {
                    continue;
                }

                for (int j = i + 1; j < size; ++j) {
                    Node searchNode = bin.get(j);
                    // the identity check keeps a node that appears twice in a bin from replacing itself
                    if ((searchNode != pickNode) && isMergeCandidate(searchNode) && pickNode.equals(searchNode)) {
                        searchNode.parent.child = pickNode;
                        searchNode.replaceMeWith = pickNode;
                    }
                }
            }
        }
    }

    /** A node can take part in merging if it is not yet replaced, is flagged isChild and has no siblings. */
    private static boolean isMergeCandidate(Node node) {
        return (null == node.replaceMeWith) && node.isChild && (0 == node.siblings);
    }

    /**
     * Builds the dawg based on the words added.  The structure is compressed, the remaining nodes are re-indexed
     * breadth-first from the root, and each node's integer form is written into the array handed to the Dawg.
     *
     * @return the new Dawg instance
     */
    public Dawg build() {
        compress();

        // forget the pre-compression numbering, including that of nodes that were merged away
        for (Node node : nodeList) {
            node.index = -1;
        }
        nodeList.clear();

        Deque<Node> queue = new ArrayDeque<Node>();
        queue.addLast(root);

        int index = 0;

        while (!queue.isEmpty()) {
            Node ptr = queue.removeFirst();
            // a node reached through several parents keeps the index from its first visit
            if (-1 == ptr.index) {
                ptr.index = index++;
            }
            nodeList.add(ptr);

            for (Node nextChild : ptr.nextChildren) {
                queue.addLast(nextChild);
            }
            if (null != ptr.child) {
                queue.addLast(ptr.child);
            }
        }

        int[] ints = new int[index];

        for (Node node : nodeList) {
            ints[node.index] = node.toInteger();
        }

        return new Dawg(ints);
    }

    /**
     * Command-line entry point: reads a newline-delimited word list and writes the binary dawg data to a new file.
     * The input is decoded with the platform default charset.
     *
     * @param args the input filename followed by the output filename
     * @throws IOException if reading the word list or writing the dawg fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.out.println("Usage:  DawgBuilder infilename outfilename");
            System.out.println("  infilename - filename of newline-delimited list of words to put in the DAWG.");
            System.out
                    .println("  outfilename - filename of new file to be created containing the binary DAWG data.");

            return;
        }

        String infilename = args[0];
        String outfilename = args[1];

        Dawg dawg;
        FileReader reader = new FileReader(infilename);
        try {
            dawg = new DawgBuilder().add(reader).build();
        } finally {
            reader.close();
        }

        // the output file is opened only after the dawg was built, so a failed read does not create or truncate it
        FileOutputStream out = new FileOutputStream(outfilename);
        try {
            dawg.store(out);
        } finally {
            out.close();
        }
    }
}