Java tutorial
/** * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.thirdeye.hadoop.topk; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; import java.util.Random; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.TimeUnit; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.avro.hadoop.io.AvroSerialization; import org.apache.avro.mapred.AvroKey; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mrunit.mapreduce.MapDriver; import org.apache.hadoop.mrunit.mapreduce.ReduceDriver; import org.apache.hadoop.mrunit.testutil.TemporaryPath; import org.apache.hadoop.mrunit.types.Pair; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig; import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties; import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants; import com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob.TopKPhaseMapper; import com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob.TopKPhaseReducer; /** * This test will test mapper of Topk phase, * to ensure the right pairs being emitted * This will also test the topk file generated */ public class TopkPhaseTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String HADOOP_IO_SERIALIZATION = "io.serializations"; private static final String AVRO_SCHEMA = "schema.avsc"; private String outputPath; private Schema inputSchema; private ThirdEyeConfig thirdeyeConfig; Properties props = new Properties(); private MapDriver<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> mapDriver; private ReduceDriver<BytesWritable, BytesWritable, NullWritable, NullWritable> reduceDriver; private long generateRandomHoursSinceEpoch() { Random r = new Random(); // setting base value to year 2012 long unixtime = (long) (1293861599 + r.nextDouble() * 60 * 60 * 24 * 365); return TimeUnit.SECONDS.toHours(unixtime); } private void setUpAvroSerialization(Configuration conf, Schema inputSchema) { String[] currentSerializations = conf.getStrings(HADOOP_IO_SERIALIZATION); String[] finalSerializations = new String[currentSerializations.length + 1]; System.arraycopy(currentSerializations, 0, finalSerializations, 0, currentSerializations.length); finalSerializations[finalSerializations.length - 1] = AvroSerialization.class.getName(); mapDriver.getConfiguration().setStrings(HADOOP_IO_SERIALIZATION, finalSerializations); AvroSerialization.addToConfiguration(conf); AvroSerialization.setKeyWriterSchema(conf, inputSchema); AvroSerialization.setValueWriterSchema(conf, Schema.create(Schema.Type.NULL)); } private List<GenericRecord> generateTestMapperData() throws Exception { List<GenericRecord> inputRecords = new ArrayList<GenericRecord>(); GenericRecord input = new GenericData.Record(inputSchema); input.put("d1", "abc1"); input.put("d2", "pqr1"); input.put("d3", "xyz1"); input.put("hoursSinceEpoch", generateRandomHoursSinceEpoch()); input.put("m1", 100); input.put("m2", 20); inputRecords.add(input); input = new GenericData.Record(inputSchema); input.put("d1", "abc2"); input.put("d2", "pqr2"); input.put("d3", "xyz2"); input.put("hoursSinceEpoch", generateRandomHoursSinceEpoch()); input.put("m1", 10); input.put("m2", 20); inputRecords.add(input); return inputRecords; } private List<Pair<BytesWritable, List<BytesWritable>>> generateTestReduceData( List<Pair<BytesWritable, BytesWritable>> result) throws Exception { List<Pair<BytesWritable, List<BytesWritable>>> inputRecords = new ArrayList<>(); Map<BytesWritable, List<BytesWritable>> inputMap = new TreeMap<>(); for (Pair<BytesWritable, BytesWritable> pair : result) { inputMap.put(pair.getFirst(), Lists.newArrayList(pair.getSecond())); } for (Entry<BytesWritable, List<BytesWritable>> listPair : inputMap.entrySet()) { inputRecords.add(new Pair<BytesWritable, List<BytesWritable>>(listPair.getKey(), listPair.getValue())); } return inputRecords; } @Before public void setUp() throws Exception { props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_DIMENSION_NAMES.toString(), "d2,"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_METRICS.toString() + ".d2", "m1"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_KVALUES.toString() + ".d2", "1"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION_NAMES.toString(), "d3"); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION.toString() + ".d3", "xyz2"); thirdeyeConfig = ThirdEyeConfig.fromProperties(props); // Mapper config TopKPhaseMapper mapper = new TopKPhaseMapper(); mapDriver = MapDriver.newMapDriver(mapper); Configuration configuration = mapDriver.getConfiguration(); configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); configuration.set(TopKPhaseConstants.TOPK_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); inputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA)); setUpAvroSerialization(mapDriver.getConfiguration(), inputSchema); // Reducer config TopKPhaseReducer reducer = new TopKPhaseReducer(); reduceDriver = ReduceDriver.newReduceDriver(reducer); configuration = reduceDriver.getConfiguration(); configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); configuration.set(TopKPhaseConstants.TOPK_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); TemporaryPath tmpPath = new TemporaryPath(); outputPath = tmpPath.toString(); configuration.set(TopKPhaseConstants.TOPK_PHASE_OUTPUT_PATH.toString(), outputPath); } @Test public void testTopKColumnTransformationPhase() throws Exception { int recordCount = 0; List<GenericRecord> inputRecords = generateTestMapperData(); for (GenericRecord record : inputRecords) { AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>(); inKey.datum(record); mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get())); recordCount++; } List<Pair<BytesWritable, BytesWritable>> result = mapDriver.run(); // for each record, we emit 2 records per dimension: // once for actual value of dimension, once for ALL,ALL Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount * 3 * 2, result.size()); Map<String, Integer> counts = new HashMap<>(); for (Pair<BytesWritable, BytesWritable> pair : result) { TopKPhaseMapOutputKey key = TopKPhaseMapOutputKey.fromBytes(pair.getFirst().getBytes()); String dimensionName = key.getDimensionName(); Integer count = counts.get(dimensionName); if (count == null) { count = 0; } counts.put(dimensionName, count + 1); } Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d1")); Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d2")); Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d3")); Assert.assertEquals("Incorrect number of records emitted from map", 6, (int) counts.get("0")); List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(result); reduceDriver.addAll(reduceInput); reduceDriver.run(); File topKFile = new File(outputPath, ThirdEyeConstants.TOPK_VALUES_FILE); Assert.assertTrue("Topk file failed to generate!", topKFile.exists()); TopKDimensionValues topk = OBJECT_MAPPER.readValue(new FileInputStream(topKFile), TopKDimensionValues.class); Map<String, Set<String>> topkMap = topk.getTopKDimensions(); Assert.assertEquals("Incorrect topk object", topkMap.size(), 1); Assert.assertEquals("Incorrect topk values in topk object", Sets.newHashSet("pqr1"), topkMap.get("d2")); Assert.assertEquals("Incorrect whitelist values in topk object", null, topkMap.get("d3")); } @After public void cleanUp() throws IOException { File f = new File(outputPath); FileUtils.deleteDirectory(f); } }