// Java tutorial
/*
 * Licensed to the Ted Dunning under one or more contributor license
 * agreements.  See the NOTICE file that may be
 * distributed with this work for additional information
 * regarding copyright ownership.  Ted Dunning licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.mapr.synth;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Charsets;
import com.google.common.collect.*;
import com.google.common.io.Resources;
import com.mapr.synth.samplers.SchemaSampler;
import com.mapr.synth.samplers.StringSampler;
import org.apache.mahout.math.stats.OnlineSummarizer;
import org.junit.Assert;
import org.junit.Test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import static org.junit.Assert.*;

/**
 * Tests for {@link SchemaSampler}. Most tests load a JSON schema from the classpath, draw a
 * number of samples and check the generated records against fixed patterns, ranges or
 * frequencies.
 */
public class SchemaSamplerTest {

    /**
     * Builds a sampler from a schema stored as a UTF-8 classpath resource.
     *
     * @param resourceName name of the schema resource, e.g. {@code "schema001.json"}
     * @return a sampler constructed from the text of that resource
     * @throws IOException if the resource cannot be read or the sampler cannot be built
     */
    private static SchemaSampler load(String resourceName) throws IOException {
        return new SchemaSampler(
                Resources.asCharSource(Resources.getResource(resourceName), Charsets.UTF_8).read());
    }

    /**
     * Checks that field names are reported in schema order and prints a few samples.
     */
    @Test
    public void testFieldNames() throws IOException {
        SchemaSampler s = new SchemaSampler(
                "[{\"name\":\"id\", \"class\":\"id\"}, "
                        + "{\"name\":\"foo\", \"class\":\"address\"}, "
                        + "{\"name\":\"bar\", \"class\":\"date\", \"format\":\"yy-MM-dd\"}, "
                        + "{\"name\":\"baz\", \"class\":\"foreign-key\", \"size\":1000, \"skew\":1}]");
        assertEquals("[id, foo, bar, baz]", Iterables.toString(s.getFieldNames()));

        // no assertions here; the samples are only printed for eyeballing
        for (int i = 0; i < 5; i++) {
            System.out.printf("%s\n", Iterables.toString(s.sample()));
        }
    }

    /**
     * Checks that the {@code size} field takes every value in [10, 99) and nothing else.
     */
    @Test
    public void testInt() throws IOException {
        SchemaSampler s = load("schema001.json");
        Multiset<String> counts = HashMultiset.create();
        for (int i = 0; i < 10000; i++) {
            counts.add(s.sample().get("size").asText());
        }
        for (int i = 10; i < 99; i++) {
            Assert.assertTrue("missing value " + i, counts.elementSet().contains(i + ""));
        }
        assertEquals(99 - 10, counts.elementSet().size());
    }

    /**
     * Checks the relative frequencies of the values of the string field {@code foo}.
     */
    @Test
    public void testString() throws IOException {
        SchemaSampler s = load("schema002.json");
        Multiset<String> counts = HashMultiset.create();
        double n = 10000;
        for (int i = 0; i < n; i++) {
            counts.add(s.sample().get("foo").asText());
        }
        // NOTE(review): presumably the weights in schema002.json sum to 2 -- confirm
        check(counts, 0.95 / 2, "YES");
        check(counts, 0.05 / 2, "NO");
        check(counts, 1.00 / 2, "NA");
    }

    /**
     * Asserts that the observed fraction of {@code s} in {@code counts} is close to {@code p}.
     *
     * <p>The tolerance is five standard errors of a binomial proportion,
     * {@code sqrt(p * (1 - p) / n)}, so a correct sampler fails this check only with
     * negligible probability while a wrong distribution is still detected.
     *
     * @param counts observed values
     * @param p      expected fraction of observations equal to {@code s}
     * @param s      the value whose frequency is checked
     */
    private void check(Multiset<String> counts, double p, String s) {
        double n = counts.size();
        double standardError = Math.sqrt(p * (1 - p) / n);
        assertEquals(p, counts.count(s) / n, 5 * standardError);
    }

    /**
     * Checks several field types at once: sequential ids, name and address shape, three date
     * formats and the gender frequencies.
     */
    @Test
    public void testSeveral() throws IOException {
        SchemaSampler s = load("schema003.json");
        Multiset<String> gender = HashMultiset.create();
        Pattern namePattern = Pattern.compile("[A-Z][a-z]+ [A-Z][a-z]+");
        Pattern addressPattern = Pattern.compile("[0-9]+ [A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+");
        Pattern datePattern1 = Pattern.compile("[01][0-9]/[0123][0-9]/20[012][0-9]");
        Pattern datePattern2 = Pattern.compile("2014-0[12]-[0123][0-9]");
        Pattern datePattern3 = Pattern.compile("[01][0-9]/[0123][0-9]/199[5-9]");
        for (int i = 0; i < 10000; i++) {
            JsonNode record = s.sample();
            assertEquals(i, record.get("id").asInt());
            assertTrue(namePattern.matcher(record.get("name").asText()).matches());
            assertTrue(addressPattern.matcher(record.get("address").asText()).matches());
            assertTrue(datePattern1.matcher(record.get("first_visit").asText()).matches());
            assertTrue(datePattern2.matcher(record.get("second_date").asText()).matches());
            assertTrue(datePattern3.matcher(record.get("third_date").asText()).matches());
            gender.add(record.get("gender").asText());
        }
        check(gender, 0.5 * (1 - 0.02), "MALE");
        check(gender, 0.5 * (1 - 0.02), "FEMALE");
        check(gender, 0.02 * (1 - 0.02), "OTHER");
    }

    /**
     * Checks how often one specific value shows up in each of the fields {@code co},
     * {@code br}, {@code la}, {@code st} and {@code os}.
     */
    @Test
    public void testMisc() throws IOException {
        SchemaSampler s = load("schema004.json");
        Multiset<String> country = HashMultiset.create();
        Multiset<String> language = HashMultiset.create();
        Multiset<String> browser = HashMultiset.create();
        Multiset<String> state = HashMultiset.create();
        Multiset<String> os = HashMultiset.create();
        for (int i = 0; i < 10000; i++) {
            JsonNode record = s.sample();
            country.add(record.get("co").asText());
            browser.add(record.get("br").asText());
            language.add(record.get("la").asText());
            state.add(record.get("st").asText());
            os.add(record.get("os").asText());
        }
        assertEquals(2542.0, country.count("us"), 200);
        assertEquals(3756.0, browser.count("Chrome"), 200);
        assertEquals(3256.0, language.count("en"), 200);
        assertEquals(1211.8, state.count("ca"), 100);
        assertEquals(5876.0, os.count("win7"), 120);
    }

    /**
     * Checks the mean length of the sequences {@code c} and {@code d} and the range of the
     * elements of {@code d}.
     */
    @Test
    public void testSequence() throws IOException {
        SchemaSampler s = load("schema005.json");
        OnlineSummarizer s0 = new OnlineSummarizer();
        OnlineSummarizer s1 = new OnlineSummarizer();
        for (int i = 0; i < 10000; i++) {
            JsonNode x = s.sample();
            s0.add(Iterables.size(x.get("c")));
            s1.add(Iterables.size(x.get("d")));
            for (JsonNode n : x.get("d")) {
                int z = n.asInt();
                assertTrue(z >= 3 && z < 9);
            }
        }
        assertEquals(5, s0.getMean(), 1);
        assertEquals(10, s1.getMean(), 2);
    }

    /**
     * Checks that field {@code x} is always exactly the array 3, 6, 8.
     */
    @Test
    public void testSequenceArray() throws IOException {
        SchemaSampler s = load("schema006.json");
        for (int i = 0; i < 10; i++) {
            JsonNode x = s.sample();
            Iterator<JsonNode> values = x.get("x").elements();
            assertEquals(3, values.next().asInt());
            assertEquals(6, values.next().asInt());
            assertEquals(8, values.next().asInt());
            assertFalse(values.hasNext());
        }
    }

    /**
     * Checks a nested object field {@code stuff} whose members {@code a} and {@code b} each
     * take one of two values.
     */
    @Test
    public void testMap() throws IOException {
        SchemaSampler s = load("schema011.json");
        for (int i = 0; i < 100; i++) {
            JsonNode x = s.sample();
            assertEquals(i, x.get("id").asInt());
            int v = x.get("stuff").get("a").asInt();
            assertTrue(v == 3 || v == 4);
            v = x.get("stuff").get("b").asInt();
            assertTrue(v == 4 || v == 5);
        }
    }

    /**
     * Checks the histograms of four integer fields against the expected shapes.
     */
    @Test
    public void testSkewedInteger() throws IOException {
        // will give fields x, y, z, q with different skewness
        SchemaSampler s = load("schema007.json");
        SortedMultiset<Integer> x = TreeMultiset.create();
        SortedMultiset<Integer> y = TreeMultiset.create();
        SortedMultiset<Integer> z = TreeMultiset.create();
        SortedMultiset<Integer> q = TreeMultiset.create();
        for (int i = 0; i < 10000; i++) {
            JsonNode record = s.sample();
            x.add(record.get("x").asInt());
            y.add(record.get("y").asInt());
            z.add(record.get("z").asInt());
            q.add(record.get("q").asInt());
        }
        for (int i = 10; i < 20; i++) {
            assertEquals(1000, x.count(i), 100);
            assertEquals(1900 - (i - 10) * 200, y.count(i), 120);
            assertEquals(100 + (i - 10) * 200, z.count(i), 120);
            // these magic numbers are a fit to the empirical distribution of q as computed by R
            double kq = 122623.551282 - 27404.139083 * i + 2296.601107 * i * i
                    - 85.510684 * i * i * i + 1.193182 * i * i * i * i;
            // accuracy should get better for smaller numbers
            assertEquals(kq, q.count(i), (25.0 - i) / 10 * 120);
        }
    }

    /**
     * Writes a small tab-separated file of numbers and their squares to the working directory
     * and checks records sampled with schema008.json against it.
     */
    @Test
    public void testFileSampler() throws IOException {
        File f = new File("numbers.tsv");
        f.deleteOnExit();

        // try-with-resources so the file handle is released even if a write fails
        try (BufferedWriter out = Files.newBufferedWriter(f.toPath(), Charsets.UTF_8)) {
            out.write("a\tb\n");
            for (int i = 0; i < 20; i++) {
                out.write(i + "\t" + (i * i) + "\n");
            }
        }

        SchemaSampler s = load("schema008.json");
        for (int k = 0; k < 1000; k++) {
            JsonNode r = s.sample();
            assertEquals(6, r.get("x").get("x").asInt() + r.get("x").get("y").asInt());
            int i = r.get("y").get("a").asInt();
            assertEquals(i * i, r.get("y").get("b").asInt());
        }
    }

    /**
     * Checks fields that render as comma-joined text.
     */
    @Test
    public void testJoin() throws IOException {
        SchemaSampler s = load("schema009.json");
        for (int k = 0; k < 10; k++) {
            JsonNode r = s.sample();
            assertEquals("3,6,8", r.get("x").asText());
            assertTrue(r.get("y").asInt() >= 1 && r.get("y").asInt() < 5);
            assertTrue(r.get("z").asText().matches("(xyz(,xyz)*)?"));
        }
    }

    /**
     * Checks the first value and the mean step between successive values of three time fields.
     */
    @Test
    public void testEvents() throws IOException, ParseException {
        SchemaSampler s = load("schema012.json");
        long t = System.currentTimeMillis();

        SimpleDateFormat df0 = new SimpleDateFormat("yyyy-MM-dd");
        SimpleDateFormat df1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        SimpleDateFormat df2 = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss");

        JsonNode old = s.sample();

        long old1 = df0.parse(old.get("foo1").asText()).getTime();
        assertTrue(Math.abs(old1 - t) < TimeUnit.MILLISECONDS.convert(1, TimeUnit.DAYS));

        // Compare as doubles on purpose: with (long, long, int) arguments overload resolution
        // can select assertEquals(float, float, float), and a float cannot represent epoch
        // milliseconds to anywhere near the 10 ms tolerance.
        long old2 = df1.parse(old.get("foo2").asText()).getTime();
        assertEquals((double) df1.parse("2014-01-01 00:00:00").getTime(), (double) old2, 10.0);

        long old3 = df2.parse(old.get("foo3").asText()).getTime();
        assertEquals((double) df1.parse("2014-02-01 00:00:00").getTime(), (double) old3, 10.0);

        double sum1 = 0;
        double sum2 = 0;
        double sum3 = 0;

        final int N = 10000;
        for (int k = 0; k < N; k++) {
            JsonNode r = s.sample();

            long t1 = df0.parse(r.get("foo1").asText()).getTime();
            sum1 += t1 - old1;
            old1 = t1;

            long t2 = df1.parse(r.get("foo2").asText()).getTime();
            sum2 += t2 - old2;
            old2 = t2;

            long t3 = df2.parse(r.get("foo3").asText()).getTime();
            sum3 += t3 - old3;
            old3 = t3;
        }

        // tolerance is 3% of the expected step, not of the measured one
        double expectedStep1 = (double) TimeUnit.MILLISECONDS.convert(10, TimeUnit.DAYS);
        assertEquals(expectedStep1, sum1 / N, 0.03 * expectedStep1);
        assertEquals(100, sum2 / N, 3);
        assertEquals(2000, sum3 / N, 2000 * 0.03);
    }

    /**
     * Tests for {@link StringSampler} used directly rather than through a schema.
     */
    public static class StringSamplerTest {

        /**
         * An empty distribution must be rejected with an {@link IllegalArgumentException}.
         */
        @Test
        public void testEmptyDist() {
            StringSampler s = new StringSampler();
            try {
                s.setDist(new HashMap<String, Object>());
                fail("Should have detected empty distribution");
            } catch (IllegalArgumentException e) {
                // whew ... that's what we wanted
            }
        }

        /**
         * Weights given as a string, an integer and a double are all accepted, and the sampled
         * frequencies follow the 3 : 5 : 1 ratio.
         */
        @Test
        public void testSimple() {
            StringSampler s = new StringSampler();
            s.setDist(ImmutableMap.of("a", "3", "b", 5, "c", 1.0));
            Multiset<String> counts = HashMultiset.create();
            for (int i = 0; i < 1000; i++) {
                counts.add(s.sample().asText());
            }
            assertEquals(3, counts.elementSet().size());
            check(counts, "a", 1000 * 3.0 / 9.0);
            check(counts, "b", 1000 * 5.0 / 9.0);
            check(counts, "c", 1000 * 1.0 / 9.0);
        }

        /**
         * Asserts that {@code a} was seen about {@code n} times, within {@code 3 * sqrt(n)}.
         *
         * @param counts observed values
         * @param a      the value whose count is checked
         * @param n      expected number of occurrences
         */
        private void check(Multiset<String> counts, String a, double n) {
            assertEquals(n, counts.count(a), 3 * Math.sqrt(n));
        }
    }
}