edu.illinois.cs.cogcomp.nlp.tokenizer.HashCollisionReport.java Source code

Introduction

Here is the source code for edu.illinois.cs.cogcomp.nlp.tokenizer.HashCollisionReport.java
Source

/**
 * This software is released under the University of Illinois/Research and Academic Use License. See
 * the LICENSE file in the root folder for details. Copyright (c) 2016
 *
 * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
 * http://cogcomp.cs.illinois.edu/
 */
/**
 * 
 */
package edu.illinois.cs.cogcomp.nlp.tokenizer;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.io.FileUtils;

import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;

/**
 * This class will identify hash collisions within a set of documents. It is provided a directory
 * full of text documents. It will simply tokenize the the documents, and check the resulting
 * constituents for hash collisions. Collisions are reported.
 * @author redman
 */
public class HashCollisionReport {

    /**
     * report the error and exit.
     * @param message the error message.
     */
    private static void error(String message) {
        System.err.println(message);
        System.exit(-1);
    }

    /**
     * Read each test file in the directory, tokenize and create the token view. Then check for
     * collisions.
     * @param args
     * @throws IOException 
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0)
            error("Must pass in the name of a directory with files to test against.");
        File dir = new File(args[0]);
        if (!dir.exists()) {
            error("The directory did not exist : " + dir);
        }
        if (!dir.isDirectory()) {
            error("The path was not a directory : " + dir);
        }
        File[] files = dir.listFiles();
        for (File file : files) {
            if (file.isFile()) {
                String normal = FileUtils.readFileToString(file);
                TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
                TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
                List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
                HashMap<Integer, Constituent> hashmap = new HashMap<>();

                // add each constituent to the map keyed by it's hashcode. Check first to see if the hashcode
                // is already used, if it is report it.
                for (Constituent c : normalToks) {
                    int code = c.hashCode();
                    if (hashmap.containsKey(code)) {
                        Constituent dup = hashmap.get(code);
                        System.err.println(c + " == " + dup);
                    } else {
                        hashmap.put(code, c);
                    }
                }
            }
        }

    }
}