com.addthis.ahocorasick.TestAhoCorasick.java Source code

Java tutorial

Introduction

Here is the source code for com.addthis.ahocorasick.TestAhoCorasick.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.ahocorasick;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.Set;

import org.apache.commons.lang3.RandomStringUtils;

import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

public class TestAhoCorasick {

    @Test
    public void construction() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("hello", "hello");
        tree.add("hi", "hi");
        tree.prepare();

        State s0 = tree.getRoot();
        State s1 = s0.get('h');
        State s2 = s1.get('e');
        State s5 = s2.getFastTransitions()[3];
        State s6 = s1.get('i');

        assertEquals(s0, s1.getFail());
        assertNull(s2.getFail());
        assertEquals(s0, s2.getFastTransitions()[0]);
        assertEquals(s0, s2.getFastTransitions()[1]);
        assertEquals(s0, s2.getFastTransitions()[2]);
        assertEquals(s0, s5.getFail());
        assertEquals(s0, s6.getFail());

        assertEquals(0, s0.getOutputs().size());
        assertEquals(0, s1.getOutputs().size());
        assertEquals(0, s2.getOutputs().size());
        assertEquals(1, s5.getOutputs().size());
        assertEquals(1, s6.getOutputs().size());
    }

    @Test
    public void example() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("he", "he");
        tree.add("she", "she");
        tree.add("his", "his");
        tree.add("hers", "hers");
        assertEquals(10, tree.getRoot().size());
        tree.prepare(); // after prepare, we can't call size()
        State s0 = tree.getRoot();
        State s1 = s0.get('h');
        State s2 = s1.get('e');

        State s3 = s0.get('s');
        State s4 = s3.get('h');
        State s5 = s4.get('e');

        State s6 = s1.get('i');
        State s7 = s6.get('s');

        State s8 = s2.get('r');
        State s9 = s8.get('s');

        assertEquals(s0, s1.getFail());
        assertEquals(s0, s2.getFail());
        assertEquals(s0, s3.getFail());
        assertEquals(s0, s6.getFail());
        assertEquals(s0, s8.getFail());

        assertEquals(s1, s4.getFail());
        assertEquals(s2, s5.getFail());
        assertEquals(s3, s7.getFail());
        assertEquals(s3, s9.getFail());

        assertEquals(0, s1.getOutputs().size());
        assertEquals(0, s3.getOutputs().size());
        assertEquals(0, s4.getOutputs().size());
        assertEquals(0, s6.getOutputs().size());
        assertEquals(0, s8.getOutputs().size());
        assertEquals(1, s2.getOutputs().size());
        assertEquals(1, s7.getOutputs().size());
        assertEquals(1, s9.getOutputs().size());
        assertEquals(2, s5.getOutputs().size());
    }

    @Test
    public void startSearchWithSingleResult() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("apple", "apple");
        tree.prepare();
        SearchResult result = tree.startSearch("washington cut the apple tree");
        assertEquals(1, result.getOutputs().size());
        assertEquals("apple", result.getOutputs().iterator().next());
        assertEquals(24, result.lastIndex);
        assertEquals(null, tree.continueSearch(result));
    }

    @Test
    public void startSearchWithUnicodeResult() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("espaa", "espaa");
        tree.prepare();
        SearchResult result = tree.startSearch("la campeona del mundo de ftbol es espaa");
        assertEquals(1, result.getOutputs().size());
        assertEquals("espaa", result.getOutputs().iterator().next());
        assertEquals(41, result.lastIndex);
        assertEquals(null, tree.continueSearch(result));
    }

    @Test
    public void startSearchWithAdjacentResults() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("john", "john");
        tree.add("jane", "jane");
        tree.prepare();
        SearchResult firstResult = tree.startSearch("johnjane");
        SearchResult secondResult = tree.continueSearch(firstResult);
        assertEquals(null, tree.continueSearch(secondResult));
    }

    @Test
    public void startSearchOnEmpty() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("cipher", 0);
        tree.add("zip", 1);
        tree.add("nought", 2);
        tree.prepare();
        SearchResult result = tree.startSearch("");
        assertEquals(null, result);
    }

    @Test
    public void multipleOutputs() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("x", "x");
        tree.add("xx", "xx");
        tree.add("xxx", "xxx");
        tree.prepare();

        SearchResult result = tree.startSearch("xxx");
        assertEquals(1, result.lastIndex);
        assertEquals(new HashSet<Object>(Arrays.asList(new String[] { "x" })), result.getOutputs());

        result = tree.continueSearch(result);
        assertEquals(2, result.lastIndex);
        assertEquals(new HashSet<Object>(Arrays.asList(new String[] { "xx", "x" })), result.getOutputs());

        result = tree.continueSearch(result);
        assertEquals(3, result.lastIndex);
        assertEquals(new HashSet<Object>(Arrays.asList(new String[] { "xxx", "xx", "x" })), result.getOutputs());

        assertEquals(null, tree.continueSearch(result));
    }

    @Test
    public void iteratorInterface() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("moo", "moo");
        tree.add("one", "one");
        tree.add("on", "on");
        tree.add("ne", "ne");
        tree.prepare();
        Iterator<SearchResult> iter = tree.progressiveSearch("one moon ago");

        assertTrue(iter.hasNext());
        SearchResult r = iter.next();
        assertEquals(new HashSet<Object>(Arrays.asList(new String[] { "on" })), r.getOutputs());
        assertEquals(2, r.lastIndex);

        assertTrue(iter.hasNext());
        r = iter.next();
        assertEquals(new HashSet<Object>(Arrays.asList(new String[] { "one", "ne" })), r.getOutputs());
        assertEquals(3, r.lastIndex);

        assertTrue(iter.hasNext());
        r = iter.next();
        assertEquals(new HashSet<Object>(Arrays.asList(new String[] { "moo" })), r.getOutputs());
        assertEquals(7, r.lastIndex);

        assertTrue(iter.hasNext());
        r = iter.next();
        assertEquals(new HashSet<Object>(Arrays.asList(new String[] { "on" })), r.getOutputs());
        assertEquals(8, r.lastIndex);

        assertFalse(iter.hasNext());

        try {
            iter.next();
            fail();
        } catch (NoSuchElementException e) {
        }

    }

    @Test
    public void largerTextExample() {
        AhoCorasick tree = AhoCorasick.builder().build();
        String text = "The ga3 mutant of Arabidopsis is a gibberellin-responsive dwarf. We present data showing that the "
                + "ga3-1 mutant is deficient in ent-kaurene oxidase activity, the first cytochrome P450-mediated step "
                + "in"
                + " the gibberellin biosynthetic pathway. By using a combination of conventional map-based cloning and "
                + "random sequencing we identified a putative cytochrome P450 gene mapping to the same location as GA3."
                + " "
                + "Relative to the progenitor line, two ga3 mutant alleles contained single base changes generating "
                + "in-frame stop codons in the predicted amino acid sequence of the P450. A genomic clone spanning the "
                + "P450 locus complemented the ga3-2 mutant. The deduced GA3 protein defines an additional class of "
                + "cytochrome P450 enzymes. The GA3 gene was expressed in all tissues examined, RNA abundance being "
                + "highest in inflorescence tissue.";
        String[] terms = { "microsome", "cytochrome", "cytochrome P450 activity", "gibberellic acid biosynthesis",
                "GA3", "cytochrome P450", "oxygen binding", "AT5G25900.1", "protein", "RNA", "gibberellin",
                "Arabidopsis", "ent-kaurene oxidase activity", "inflorescence", "tissue", };
        for (int i = 0; i < terms.length; i++) {
            tree.add(terms[i], terms[i]);
        }
        tree.prepare();

        Set<Object> termsThatHit = new HashSet<Object>();
        Iterator<SearchResult> iter = tree.progressiveSearch(text);
        while (iter.hasNext()) {
            SearchResult result = iter.next();
            termsThatHit.addAll(result.getOutputs());
        }
        assertEquals(
                new HashSet<Object>(Arrays.asList(
                        new String[] { "cytochrome", "GA3", "cytochrome P450", "protein", "RNA", "gibberellin",
                                "Arabidopsis", "ent-kaurene oxidase activity", "inflorescence", "tissue", })),
                termsThatHit);

    }

    // Without overlapping
    @Test
    public void removeOverlapping1() {
        AhoCorasick tree = AhoCorasick.builder().build();
        List<OutputResult> outputResults = new ArrayList<OutputResult>();
        outputResults.add(new OutputResult(0, 0, 2));
        outputResults.add(new OutputResult(1, 2, 4));
        outputResults.add(new OutputResult(2, 5, 6));

        assertEquals(3, outputResults.size());
        tree.removeOverlapping(outputResults); // No effect
        assertEquals(3, outputResults.size());
    }

    // With a clear overlapping
    @Test
    public void removeOverlapping2() {
        AhoCorasick tree = AhoCorasick.builder().build();
        List<OutputResult> outputResults = new ArrayList<OutputResult>();
        outputResults.add(new OutputResult(0, 0, 2));
        outputResults.add(new OutputResult(1, 1, 4));
        outputResults.add(new OutputResult(2, 5, 6));

        assertEquals(3, outputResults.size());
        tree.removeOverlapping(outputResults);
        assertEquals(2, outputResults.size());
        assertEquals(0, outputResults.get(0).getOutput());
        assertEquals(2, outputResults.get(1).getOutput());
    }

    // With two overlapping, one with the same start index
    @Test
    public void removeOverlapping3() {
        AhoCorasick tree = AhoCorasick.builder().build();
        List<OutputResult> outputResults = new ArrayList<OutputResult>();
        outputResults.add(new OutputResult(0, 0, 2));
        outputResults.add(new OutputResult(1, 0, 4));
        outputResults.add(new OutputResult(2, 3, 6));

        assertEquals(3, outputResults.size());
        tree.removeOverlapping(outputResults);
        assertEquals(1, outputResults.size());
        assertEquals(1, outputResults.get(0).getOutput());
    }

    @Test
    public void completeSearchNotOverlapping() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("Apple");
        tree.add("App");
        tree.add("Microsoft");
        tree.add("Mic");
        tree.prepare();

        String inputText = "Apple is better than Microsoft";
        List<OutputResult> results = tree.completeSearch(inputText, false, false);

        assertEquals(2, results.size());
        assertEquals("Apple", results.get(0).getOutput());
        assertEquals("Microsoft", results.get(1).getOutput());
    }

    @Test
    public void completeSearchOverlapping() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("Apple");
        tree.add("App");
        tree.add("Microsoft");
        tree.add("Mic");
        tree.prepare();

        String inputText = "Apple is better than Microsoft";
        List<OutputResult> results = tree.completeSearch(inputText, true, false);

        assertEquals(4, results.size());
        assertEquals("App", results.get(0).getOutput());
        assertEquals("Apple", results.get(1).getOutput());
        assertEquals("Mic", results.get(2).getOutput());
        assertEquals("Microsoft", results.get(3).getOutput());
    }

    @Test
    public void completeSearchTokenized1() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("Apple");
        tree.add("e i");
        tree.add("than Microsoft");
        tree.add("Microsoft");
        tree.add("er than");
        tree.prepare();

        String inputText = "Apple is better than Microsoft";
        List<OutputResult> results = tree.completeSearch(inputText, false, true); // without overlapping

        assertEquals(2, results.size());
        assertEquals("Apple", results.get(0).getOutput());
        assertEquals("than Microsoft", results.get(1).getOutput());

        results = tree.completeSearch(inputText, true, true); // with overlapping

        assertEquals(3, results.size());
        assertEquals("Apple", results.get(0).getOutput());
        assertEquals("than Microsoft", results.get(1).getOutput());
        assertEquals("Microsoft", results.get(2).getOutput());
    }

    @Test
    public void completeSearchTokenized2() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("Real Madrid");
        tree.add("Madrid");
        tree.add("Barcelona");
        tree.add("Messi");
        tree.add("esp");
        tree.add("o p");
        tree.add("Mes");
        tree.add("Rea");
        tree.prepare();

        String inputText = "El Real Madrid no puede fichar a Messi porque es del Barcelona";
        List<OutputResult> results = tree.completeSearch(inputText, false, true); // without overlapping

        assertEquals(3, results.size());
        assertEquals("Real Madrid", results.get(0).getOutput());
        assertEquals("Messi", results.get(1).getOutput());
        assertEquals("Barcelona", results.get(2).getOutput());
    }

    @Test
    public void completeSearchTokenized3() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("comp");
        tree.prepare();

        String inputText = "    A    complete      sentence     ";
        List<OutputResult> results = tree.completeSearch(inputText, false, true); // without overlapping

        assertEquals(0, results.size());
    }

    @Test
    public void completeSearchTokenized4() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("Madrid");
        tree.add("Real");
        tree.add("Real Madrid");
        tree.add("El Real de Espaa");
        tree.prepare();

        String inputText = "El Real Madrid no puede fichar a Messi porque es del Barcelona";
        List<OutputResult> results = tree.completeSearch(inputText, false, true); // without overlapping

        assertEquals(1, results.size());
        assertEquals("Real Madrid", results.get(0).getOutput());
    }

    @Test
    public void completeSearchTokenized5() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("Microsoft");
        tree.add("than Microsoft");
        tree.add("han Microsoft");
        tree.add("n Microsoft");
        tree.add(" Microsoft");
        tree.prepare();

        String inputText = "Apple is better than Microsoft";
        List<OutputResult> results = tree.completeSearch(inputText, true, true); // with overlapping

        assertEquals(2, results.size());
        assertEquals("than Microsoft", results.get(0).getOutput());
        assertEquals("Microsoft", results.get(1).getOutput());
    }

    @Test
    public void pathCompression() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("hello");
        tree.add("world");
        tree.prepare();

        State root = tree.getRoot();
        State hState = root.get('h');
        State wState = root.get('w');
        assertEquals("ello", hState.getFastPath());
        assertEquals("orld", wState.getFastPath());

        String inputText = "helloworl";
        List<OutputResult> results = tree.completeSearch(inputText, false, false);
        assertEquals(1, results.size());
        inputText = "helloworld";
        results = tree.completeSearch(inputText, false, false);
        assertEquals(2, results.size());
    }

    @Test
    public void fastpathEarlyTermination() {
        AhoCorasick tree = AhoCorasick.builder().build();
        tree.add("abcdefg");
        tree.prepare();
        List<OutputResult> results = tree.completeSearch("abcde", true, false);
        assertEquals(0, results.size());
    }

    private static final int MIN_LENGTH = 3;

    private static final int MAX_LENGTH = 20;

    private void randomOneIteration(int iter) {
        Random random = new Random();
        Set<String> elements = new LinkedHashSet<>();
        AhoCorasick tree = AhoCorasick.builder().build();
        for (int i = 0; i < 10; i++) {
            int length = random.nextInt(MAX_LENGTH - MIN_LENGTH) + MIN_LENGTH;
            String candidate = RandomStringUtils.random(length, 'a', 'b', 'c');
            elements.add(candidate);
            tree.add(candidate);
        }
        tree.prepare();
        StringBuilder builder = new StringBuilder();
        for (String element : elements) {
            builder.append(element);
        }
        String input = builder.toString();
        List<OutputResult> results = tree.completeSearch(input, true, false);
        Set<String> output = new HashSet<>();
        for (OutputResult outputResult : results) {
            output.add(outputResult.getOutput().toString());
        }
        if (elements.size() > output.size()) {
            Set<String> extra = new LinkedHashSet<>(elements);
            extra.removeAll(output);
            fail("At iteration " + iter + " did not find the elements: " + extra + " out of the input set: "
                    + elements);
        } else if (elements.size() < output.size()) {
            output.removeAll(elements);
            fail("At iteration " + iter + " + found extra elements: " + output);
        }
    }

    @Test
    public void randomIterations() {
        for (int i = 0; i < 1000; i++) {
            randomOneIteration(i);
        }
    }
}