Java tutorial
/* * Licensed to the Ted Dunning under one or more contributor license * agreements. See the NOTICE file that may be * distributed with this work for additional information * regarding copyright ownership. Ted Dunning licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.mapr.synth; import com.google.common.base.Function; import com.google.common.collect.*; import org.apache.commons.math3.distribution.NormalDistribution; import com.mapr.synth.distributions.LongTail; import com.mapr.synth.distributions.TermGenerator; import com.mapr.synth.distributions.WordGenerator; import org.apache.mahout.math.stats.LogLikelihood; import org.junit.Test; import java.util.List; import java.util.Random; import java.util.SortedSet; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class TermGeneratorTest { private static final WordGenerator WORDS = new WordGenerator("word-frequency-seed", "other-words"); @Test public void generateTerms() { TermGenerator x = new TermGenerator(WORDS, 1, 0.8); final Multiset<String> counts = HashMultiset.create(); for (int i = 0; i < 10000; i++) { counts.add(x.sample()); } assertEquals(10000, counts.size()); assertTrue("Should have some common words", counts.elementSet().size() < 10000); List<Integer> k = Lists .newArrayList(Iterables.transform(counts.elementSet(), new Function<String, Integer>() { public Integer apply(String s) { return counts.count(s); } })); // System.out.printf("%s\n", Ordering.natural().reverse().sortedCopy(k).subList(0, 30)); // System.out.printf("%s\n", Iterables.transform(Iterables.filter(counts.elementSet(), new Predicate<String>() { // public boolean apply(String s) { // return counts.count(s) > 100; // } // }), new Function<String, String>() { // public String apply(String s) { // return s + ":" + counts.count(s); // } // })); assertEquals(1, Ordering.natural().leastOf(k, 1).get(0).intValue()); assertTrue(Ordering.natural().greatestOf(k, 1).get(0) > 300); assertTrue(counts.count("the") > 300); } @Test public void distinctVocabularies() { TermGenerator x1 = new TermGenerator(WORDS, 1, 0.8); final Multiset<String> k1 = HashMultiset.create(); for (int i = 0; i < 50000; i++) { k1.add(x1.sample()); } TermGenerator x2 = new TermGenerator(WORDS, 1, 0.8); final Multiset<String> k2 = HashMultiset.create(); for (int i = 0; i < 50000; i++) { k2.add(x2.sample()); } final NormalDistribution normal = new NormalDistribution(); List<Double> scores = Ordering.natural() .sortedCopy(Iterables.transform(k1.elementSet(), new Function<String, Double>() { public Double apply(String s) { return normal.cumulativeProbability(LogLikelihood.rootLogLikelihoodRatio(k1.count(s), 50000 - k1.count(s), k2.count(s), 50000 - k2.count(s))); } })); int n = scores.size(); // System.out.printf("%.5f, %.5f, %.5f, %.5f, %.5f, %.5f, %.5f", scores.get(0), scores.get((int) (0.05*n)), scores.get(n / 4), scores.get(n / 2), scores.get(3 * n / 4), scores.get((int) (0.95 * n)), scores.get(n - 1)); int i = 0; for (Double score : scores) { if (i % 10 == 0) { System.out.printf("%.6f\t%.6f\n", (double) i / n, score); } i++; } } @Test public void speciesCounts() { final boolean transpose = false; // generate an example of species sampled on multiple days LongTail<Integer> terms = new LongTail<Integer>(0.5, 0.3) { int max = 0; @Override protected Integer createThing() { return ++max; } }; // I picked seeds to get a good illustration ... want a reasonable number of species and surprises terms.setSeed(2); Random gen = new Random(1); SortedSet<Integer> vocabulary = Sets.newTreeSet(); List<Multiset<Integer>> r = Lists.newArrayList(); for (int i = 0; i < 2000; i++) { double length = Math.rint(gen.nextGaussian() * 10 + 50); Multiset<Integer> counts = HashMultiset.create(); for (int j = 0; j < length; j++) { counts.add(terms.sample()); } r.add(counts); } if (transpose) { for (Multiset<Integer> day : r) { vocabulary.addAll(day.elementSet()); } System.out.printf("%d\n", vocabulary.size()); for (Integer s : vocabulary) { String sep = ""; for (Multiset<Integer> day : r) { System.out.printf("%s%s", sep, day.count(s)); sep = "\t"; } System.out.printf("\n"); } } else { System.out.printf("%d\n", vocabulary.size()); for (Multiset<Integer> day : r) { vocabulary.addAll(day.elementSet()); String sep = ""; System.out.printf("%s%s", sep, vocabulary.size()); sep = "\t"; for (Integer s : vocabulary) { System.out.printf("%s%s", sep, day.count(s)); sep = "\t"; } System.out.printf("\n"); } Multiset<Integer> total = HashMultiset.create(); for (Multiset<Integer> day : r) { for (Integer species : day.elementSet()) { total.add(species, day.count(species)); } } String sep = ""; System.out.printf("%s%s", sep, total.elementSet().size()); sep = "\t"; for (Integer s : vocabulary) { System.out.printf("%s%s", sep, total.count(s)); sep = "\t"; } System.out.printf("\n"); } } }