Example usage for the org.apache.hadoop.util.bloom.Key constructor

List of usage examples for the org.apache.hadoop.util.bloom.Key constructor

Introduction

On this page you can find example usages of the org.apache.hadoop.util.bloom.Key constructor Key(byte[]).

Prototype

public Key(byte[] value) 

Source Link

Document

Constructor.

Usage

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesBetweenSets.java

License:Apache License

/**
 * Verifies that an edge whose far end is only a Bloom filter false positive is not returned by
 * <code>getGraphElementsWithStatisticsBetweenSets()</code>.
 *
 * The test builds a Bloom filter using the same sizing logic the retriever is expected to use, searches for a
 * value outside the seed set that nevertheless passes the filter, adds an edge from customer|A0 to that value,
 * and then checks that the query results equal the expected set, which does not include that edge.
 *
 * @param loadIntoMemory  Passed through to the query; when false the filter is sized for 20 items and the
 *                        graph's batch scanner limit is set to 20
 * @throws GraphAccessException  If one of the graph calls made by this test throws it
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // NOTE: the integer division is deliberately left as-is so that this mirrors the calculation being tested.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    // Track success explicitly: comparing count to the limit would wrongly report failure when the false
    // positive is found on the very last permitted try.
    int count = 0;
    int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    boolean foundFalsePositive = false;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Now query for all edges in set - shouldn't get the false positive
    CloseableIterable<GraphElementWithStatistics> retriever = graph.getGraphElementsWithStatisticsBetweenSets(
            Collections.singleton(new TypeValue("customer", "A0")), seeds, loadIntoMemory);
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    try {
        for (GraphElementWithStatistics gews : retriever) {
            results.add(gews);
        }
    } finally {
        // Always release the retriever, even if iteration throws
        retriever.close();
    }

    // Check results are as expected
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Verifies that an edge whose far end is only a Bloom filter false positive is not returned by
 * <code>getGraphElementsWithStatisticsWithinSet()</code>.
 *
 * The test builds a Bloom filter using the same sizing logic the retriever is expected to use, searches for a
 * value outside the seed set that nevertheless passes the filter, adds an edge from customer|A0 to that value,
 * and then checks that the query results equal the expected set, which does not include that edge.
 *
 * @param loadIntoMemory  Passed through to the query; when false the filter is sized for 20 items and the
 *                        graph's batch scanner limit is set to 20
 * @throws GraphAccessException  If one of the graph calls made by this test throws it
 */
static void testDealWithFalsePositives(boolean loadIntoMemory) throws GraphAccessException {
    AccumuloBackedGraph graph = setupGraph();

    // Query for all edges in set {customer|A0, customer|A23}
    Set<TypeValue> seeds = new HashSet<TypeValue>();
    seeds.add(new TypeValue("customer", "A0"));
    seeds.add(new TypeValue("customer", "A23"));
    // Add a bunch of items that are not in the data to make the probability of being able to find a false
    // positive sensible.
    for (int i = 0; i < 10; i++) {
        seeds.add(new TypeValue("abc", "abc" + i));
    }

    // Need to make sure that the Bloom filter we create has the same size and the same number of hashes as the
    // one that GraphElementsWithStatisticsWithinSetRetriever creates.
    int numItemsToBeAdded = loadIntoMemory ? seeds.size() : 20;
    if (!loadIntoMemory) {
        graph.setMaxEntriesForBatchScanner(20);
    }

    // Find something that will give a false positive
    // Need to repeat the logic used in the getGraphElementsWithStatisticsWithinSet() method.
    // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
    // maxBloomFilterToPassToAnIterator bytes.
    int size = (int) (-numItemsToBeAdded * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
    size = Math.min(size, Constants.MAX_SIZE_BLOOM_FILTER);

    // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
    // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
    // NOTE: the integer division is deliberately left as-is so that this mirrors the calculation being tested.
    int numHashes = Math.max(1, (int) ((size / numItemsToBeAdded) * Math.log(2)));
    // Create Bloom filter and add seeds to it
    BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
    for (TypeValue entity : seeds) {
        filter.add(new Key(entity.getValue().getBytes()));
    }

    // Test random items against it - should only have to test MAX_SIZE_BLOOM_FILTER / 2 on average before find a
    // false positive (but impose an arbitrary limit to avoid an infinite loop if there's a problem).
    // Track success explicitly: comparing count to the limit would wrongly report failure when the false
    // positive is found on the very last permitted try.
    int count = 0;
    int maxNumberOfTries = 50 * Constants.MAX_SIZE_BLOOM_FILTER;
    boolean foundFalsePositive = false;
    while (!foundFalsePositive && count < maxNumberOfTries) {
        count++;
        foundFalsePositive = filter.membershipTest(new Key(("" + count).getBytes()));
    }
    if (!foundFalsePositive) {
        fail("Didn't find a false positive");
    }

    // False positive is "" + count so create an edge from seeds to that
    Edge edge = new Edge("customer", "A0", "customer", "" + count, "purchase", "instore", true,
            visibilityString1, sevenDaysBefore, sixDaysBefore);
    SetOfStatistics statistics = new SetOfStatistics("count", new Count(1000000));
    graph.addGraphElementsWithStatistics(
            Collections.singleton(new GraphElementWithStatistics(new GraphElement(edge), statistics)));

    // Now query for all edges in set - shouldn't get the false positive
    CloseableIterable<GraphElementWithStatistics> retriever = graph
            .getGraphElementsWithStatisticsWithinSet(seeds, loadIntoMemory);
    Set<GraphElementWithStatistics> results = new HashSet<GraphElementWithStatistics>();
    try {
        for (GraphElementWithStatistics gews : retriever) {
            results.add(gews);
        }
    } finally {
        // Always release the retriever, even if iteration throws
        retriever.close();
    }

    // Check results are as expected
    Set<GraphElementWithStatistics> expectedResults = new HashSet<GraphElementWithStatistics>();
    GraphElement expectedElement1 = new GraphElement(new Edge("customer", "A0", "customer", "A23", "purchase",
            "instore", true, visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics1 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement1, expectedStatistics1));
    GraphElement expectedElement2 = new GraphElement(new Entity("customer", "A0", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics2 = new SetOfStatistics("count", new Count(10000));
    expectedResults.add(new GraphElementWithStatistics(expectedElement2, expectedStatistics2));
    GraphElement expectedElement3 = new GraphElement(new Entity("customer", "A23", "purchase", "instore",
            visibilityString1, sevenDaysBefore, sixDaysBefore));
    SetOfStatistics expectedStatistics3 = new SetOfStatistics("count", new Count(23));
    expectedResults.add(new GraphElementWithStatistics(expectedElement3, expectedStatistics3));
    assertEquals(expectedResults, results);
}

From source file:gaffer.accumulo.TestAccumuloBackedGraphGetEdgesInSet.java

License:Apache License

/**
 * Used to sanity check the calculation used in {@link BloomFilterUtilities}'s <code>calculateBloomFilterSize()</code>
 * and <code>calculateNumHashes()</code> methods.
 *
 * @param args  No arguments necessary/*from   ww  w.ja v  a 2  s .co m*/
 */
public static void main(String[] args) {
    int[] numItems = new int[] { 10, 100, 1000, 10000, 100000 };
    for (int num : numItems) {
        System.out.println("Num items to add = " + num);
        // Calculate sensible size of filter, aiming for false positive rate of 1 in 10000, with a maximum size of
        // 1MB.
        int size = (int) (-num * Math.log(0.0001) / (Math.pow(Math.log(2.0), 2.0)));
        size = Math.min(size, 1024 * 1024);
        System.out.println("Size = " + size + " bits (=" + (size / 8) + " bytes)");
        // Work out optimal number of hashes to use in Bloom filter based on size of set - optimal number of hashes is
        // (m/n)ln 2 where m is the size of the filter in bits and n is the number of items that will be added to the set.
        int numHashes = Math.max(1, (int) ((size / num) * Math.log(2)));
        System.out.println("Num hashes = " + numHashes);
        // Create Bloom filter
        BloomFilter filter = new BloomFilter(size, numHashes, Hash.MURMUR_HASH);
        // Add num items to it
        for (int i = 0; i < num; i++) {
            filter.add(new Key(("" + i).getBytes()));
        }
        // Theoretical probability of false positive is (1 - e^(-kn/m)) ^ k (as long as size hasn't been
        // truncated to 1MB).
        System.out.println("Theoretical probability of false positive = "
                + Math.pow(1.0 - Math.exp(-(double) numHashes * num / (double) size), numHashes));
        // Test false positive rate - should be approx 1 in 10000
        int numPass = 0;
        for (int i = num; i < 1000000; i++) {
            if (filter.membershipTest(new Key(("" + i).getBytes()))) {
                numPass++;
            }
        }
        System.out.println("Measured probability of false positive " + (numPass / 1000000.0));
    }
}

From source file:gaffer.accumulostore.test.bloom.FilterWritabilityTest.java

License:Apache License

@Test
public void shouldAcceptValidFilter() {
    // Given: a filter that has had "ABC" and "DEF" added to it
    final byte[] abc = "ABC".getBytes();
    final byte[] def = "DEF".getBytes();
    final byte[] neverAdded = "lkjhgfdsa".getBytes();
    final BloomFilter bloom = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    bloom.add(new Key(abc));
    bloom.add(new Key(def));

    // Then: both added values are reported as members and the value that was never added is not
    final boolean abcPresent = bloom.membershipTest(new Key(abc));
    assertTrue(abcPresent);
    final boolean defPresent = bloom.membershipTest(new Key(def));
    assertTrue(defPresent);
    final boolean otherPresent = bloom.membershipTest(new Key(neverAdded));
    assertFalse(otherPresent);
}

From source file:gaffer.accumulostore.test.bloom.FilterWritabilityTest.java

License:Apache License

@Test
public void shouldWriteAndReadFilter() throws IOException {
    // Given: a populated filter, serialised and round-tripped through a String in the store's charset
    final BloomFilter original = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    for (final String member : new String[] { "ABC", "DEF" }) {
        original.add(new Key(member.getBytes()));
    }
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    original.write(new DataOutputStream(sink));
    final String encoded = new String(sink.toByteArray(), AccumuloStoreConstants.BLOOM_FILTER_CHARSET);
    final byte[] decoded = encoded.getBytes(AccumuloStoreConstants.BLOOM_FILTER_CHARSET);

    // When: a fresh filter is populated from the decoded bytes
    final DataInputStream source = new DataInputStream(new ByteArrayInputStream(decoded));
    final BloomFilter restored = new BloomFilter();
    restored.readFields(source);

    // Then: the restored filter answers membership queries like the one that was written
    assertTrue(restored.membershipTest(new Key("ABC".getBytes())));
    assertTrue(restored.membershipTest(new Key("DEF".getBytes())));
    assertFalse(restored.membershipTest(new Key("lkjhgfdsa".getBytes())));
}

From source file:gaffer.accumulostore.test.bloom.TestFilterWritability.java

License:Apache License

@Test
public void testAccept() {
    // Build a filter containing exactly two values
    BloomFilter bloom = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    for (String member : new String[] { "ABC", "DEF" }) {
        bloom.add(new Key(member.getBytes()));
    }

    // Both added values must be reported as present
    Key abcProbe = new Key("ABC".getBytes());
    assertTrue(bloom.membershipTest(abcProbe));
    Key defProbe = new Key("DEF".getBytes());
    assertTrue(bloom.membershipTest(defProbe));

    // A value that was never added must be reported as absent
    Key absentProbe = new Key("lkjhgfdsa".getBytes());
    assertFalse(bloom.membershipTest(absentProbe));
}

From source file:gaffer.accumulostore.test.bloom.TestFilterWritability.java

License:Apache License

@Test
public void testWriteRead() throws IOException {
    // Populate the filter that will be serialised
    BloomFilter original = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    for (String member : new String[] { "ABC", "DEF" }) {
        original.add(new Key(member.getBytes()));
    }

    // Serialise it, then round-trip the bytes through a String in the store's charset
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    original.write(new DataOutputStream(sink));
    String encoded = new String(sink.toByteArray(), AccumuloStoreConstants.BLOOM_FILTER_CHARSET);
    byte[] decoded = encoded.getBytes(AccumuloStoreConstants.BLOOM_FILTER_CHARSET);

    // Deserialise into a fresh filter
    BloomFilter restored = new BloomFilter();
    restored.readFields(new DataInputStream(new ByteArrayInputStream(decoded)));

    // The restored filter must answer membership queries like the one that was written
    assertTrue(restored.membershipTest(new Key("ABC".getBytes())));
    assertTrue(restored.membershipTest(new Key("DEF".getBytes())));
    assertFalse(restored.membershipTest(new Key("lkjhgfdsa".getBytes())));
}

From source file:gaffer.predicate.typevalue.impl.TestValueInBloomFilterPredicate.java

License:Apache License

@Test
public void testAccept() {
    // Populate a filter with two values and wrap it in the predicate under test
    BloomFilter bloom = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    for (String member : new String[] { "ABC", "DEF" }) {
        bloom.add(new Key(member.getBytes()));
    }
    ValueInBloomFilterPredicate underTest = new ValueInBloomFilterPredicate(bloom);

    // Values that were added are accepted
    boolean abcAccepted = underTest.accept("X", "ABC");
    assertTrue(abcAccepted);
    boolean defAccepted = underTest.accept("X", "DEF");
    assertTrue(defAccepted);

    // A value that was never added is rejected
    boolean otherAccepted = underTest.accept("Y", "lkjhgfdsa");
    assertFalse(otherAccepted);
}

From source file:gaffer.predicate.typevalue.impl.TestValueInBloomFilterPredicate.java

License:Apache License

@Test
public void testWriteRead() throws IOException {
    // Build the predicate that will be serialised
    BloomFilter bloom = new BloomFilter(100, 5, Hash.MURMUR_HASH);
    for (String member : new String[] { "ABC", "DEF" }) {
        bloom.add(new Key(member.getBytes()));
    }
    ValueInBloomFilterPredicate original = new ValueInBloomFilterPredicate(bloom);

    // Write it out to an in-memory buffer
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    original.write(new DataOutputStream(sink));

    // Read the bytes back into a freshly constructed predicate
    byte[] serialised = sink.toByteArray();
    DataInputStream source = new DataInputStream(new ByteArrayInputStream(serialised));
    ValueInBloomFilterPredicate restored = new ValueInBloomFilterPredicate();
    restored.readFields(source);

    // The restored predicate must behave like the one that was written
    assertTrue(restored.accept("X", "ABC"));
    assertTrue(restored.accept("X", "DEF"));
    assertFalse(restored.accept("Y", "lkjhgfdsa"));
}

From source file:gaffer.predicate.typevalue.impl.ValueInBloomFilterPredicate.java

License:Apache License

/**
 * Tests the supplied value against this predicate's Bloom filter.
 *
 * Only <code>value</code> is consulted; <code>type</code> is not used. The value is converted with
 * <code>String.getBytes()</code>, i.e. the platform default charset - NOTE(review): whatever populates the
 * filter presumably needs to encode values the same way; confirm against the code that builds the filter.
 *
 * @param type  Not used by this implementation
 * @param value  The value to look up in the filter
 * @return The result of the filter's membership test for the bytes of <code>value</code>
 */
@Override
public boolean accept(String type, String value) {
    final byte[] valueBytes = value.getBytes();
    final Key candidate = new Key(valueBytes);
    return filter.membershipTest(candidate);
}