Java tutorial
/**
 * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.thirdeye.hadoop.aggregation;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.hadoop.io.AvroSerialization;
import org.apache.avro.mapred.AvroKey;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.hadoop.mrunit.testutil.TemporaryPath;
import org.apache.hadoop.mrunit.types.Pair;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob.AggregationMapper;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob.AggregationReducer;

/**
 * Tests the mapper of the aggregation phase, checking conversion of the time column to bucketed time.
 * Also tests the reducer, checking aggregation of the metrics using the new time values.
 */
public class AggregationPhaseTest {
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
  private static final String HADOOP_IO_SERIALIZATION = "io.serializations";
  private static final String AVRO_SCHEMA = "schema.avsc";

  private String outputPath;
  private Schema inputSchema;
  private ThirdEyeConfig thirdeyeConfig;
  Properties props = new Properties();

  private MapDriver<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> mapDriver;
  private ReduceDriver<BytesWritable, BytesWritable, AvroKey<GenericRecord>, NullWritable> reduceDriver;

  private void setUpAvroSerialization(Configuration conf, Schema inputSchema) {
    // Append AvroSerialization to the io.serializations list of the passed-in configuration.
    String[] currentSerializations = conf.getStrings(HADOOP_IO_SERIALIZATION);
    String[] finalSerializations = new String[currentSerializations.length + 1];
    System.arraycopy(currentSerializations, 0, finalSerializations, 0, currentSerializations.length);
    finalSerializations[finalSerializations.length - 1] = AvroSerialization.class.getName();
    conf.setStrings(HADOOP_IO_SERIALIZATION, finalSerializations);

    AvroSerialization.addToConfiguration(conf);
    AvroSerialization.setKeyWriterSchema(conf, inputSchema);
    AvroSerialization.setValueWriterSchema(conf, Schema.create(Schema.Type.NULL));
  }

  private List<GenericRecord> generateTestMapperData() throws Exception {
    List<GenericRecord> inputRecords = new ArrayList<GenericRecord>();
    // 2016-04-27T19:00:00 (PDT)
    GenericRecord input = new GenericData.Record(inputSchema);
    input.put("d1", "abc1");
    input.put("d2", "pqr1");
    input.put("d3", "xyz1");
    input.put("hoursSinceEpoch", 1461808800000L);
    input.put("m1", 100);
    input.put("m2", 20);
    inputRecords.add(input);

    // 2016-04-27T19:10:00 (PDT)
    input = new GenericData.Record(inputSchema);
    input.put("d1", "abc1");
    input.put("d2", "pqr1");
    input.put("d3", "xyz1");
    input.put("hoursSinceEpoch", 1461809400000L);
    input.put("m1", 100);
    input.put("m2", 20);
    inputRecords.add(input);

    // 2016-04-27T20:00:00 (PDT)
    input = new GenericData.Record(inputSchema);
    input.put("d1", "abc2");
    input.put("d2", "pqr2");
    input.put("d3", "xyz2");
    input.put("hoursSinceEpoch", 1461812400000L);
    input.put("m1", 10);
    input.put("m2", 2);
    inputRecords.add(input);

    return inputRecords;
  }

  private List<Pair<BytesWritable, List<BytesWritable>>> generateTestReduceData(
      List<Pair<BytesWritable, BytesWritable>> result) throws Exception {
    List<Pair<BytesWritable, List<BytesWritable>>> inputRecords = new ArrayList<>();
    Map<BytesWritable, List<BytesWritable>> inputMap = new TreeMap<>();
    for (Pair<BytesWritable, BytesWritable> pair : result) {
      inputMap.put(pair.getFirst(), new ArrayList<BytesWritable>());
    }
    for (Pair<BytesWritable, BytesWritable> pair : result) {
      inputMap.get(pair.getFirst()).add(pair.getSecond());
    }
    for (Entry<BytesWritable, List<BytesWritable>> listPair : inputMap.entrySet()) {
      inputRecords.add(new Pair<BytesWritable, List<BytesWritable>>(listPair.getKey(), listPair.getValue()));
    }
    return inputRecords;
  }

  @Before
  public void setUp() throws Exception {
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch");
    // Output (aggregated) time granularity: 1 hour.
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_SIZE.toString(), "1");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_TYPE.toString(), TimeUnit.HOURS.toString());
    // Input time column granularity: milliseconds since epoch.
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_INPUT_TIMECOLUMN_SIZE.toString(), "1");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_INPUT_TIMECOLUMN_TYPE.toString(), TimeUnit.MILLISECONDS.toString());
    thirdeyeConfig = ThirdEyeConfig.fromProperties(props);

    // Mapper config
    AggregationMapper mapper = new AggregationMapper();
    mapDriver = MapDriver.newMapDriver(mapper);
    Configuration configuration = mapDriver.getConfiguration();
    configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
    configuration.set(AggregationPhaseConstants.AGG_PHASE_THIRDEYE_CONFIG.toString(),
        OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
    inputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
    setUpAvroSerialization(mapDriver.getConfiguration(), inputSchema);

    // Reducer config
    AggregationReducer reducer = new AggregationReducer();
    reduceDriver = ReduceDriver.newReduceDriver(reducer);
    configuration = reduceDriver.getConfiguration();
    configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
    Schema avroSchema = new
        Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
    configuration.set(AggregationPhaseConstants.AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());
    configuration.set(AggregationPhaseConstants.AGG_PHASE_THIRDEYE_CONFIG.toString(),
        OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
    TemporaryPath tmpPath = new TemporaryPath();
    outputPath = tmpPath.toString();
    configuration.set(AggregationPhaseConstants.AGG_PHASE_OUTPUT_PATH.toString(), outputPath);
    setUpAvroSerialization(reduceDriver.getConfiguration(), inputSchema);
  }

  @Test
  public void testAggregationPhase() throws Exception {
    int recordCount = 0;

    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
      AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
      inKey.datum(record);
      mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
      recordCount++;
    }

    List<Pair<BytesWritable, BytesWritable>> mapResult = mapDriver.run();
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount, mapResult.size());

    AggregationPhaseMapOutputKey keyWrapper =
        AggregationPhaseMapOutputKey.fromBytes(mapResult.get(0).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(1).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(2).getFirst().getBytes());
    Assert.assertEquals(406059, keyWrapper.getTime());

    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(mapResult);
    reduceDriver.addAll(reduceInput);
    List<Pair<AvroKey<GenericRecord>, NullWritable>> reduceResult = reduceDriver.run();
    Assert.assertEquals("Incorrect number of records returned by aggregation reducer", 2, reduceResult.size());

    GenericRecord record = reduceResult.get(0).getFirst().datum();
    List<String> dimensionsExpected = Lists.newArrayList("abc1", "pqr1", "xyz1");
    List<String> dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    List<Integer> metricsExpected = Lists.newArrayList(200, 40);
    List<Integer> metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406058, (long) record.get("hoursSinceEpoch"));

    record = reduceResult.get(1).getFirst().datum();
    dimensionsExpected = Lists.newArrayList("abc2", "pqr2", "xyz2");
    dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    metricsExpected = Lists.newArrayList(10, 2);
    metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406059, (long) record.get("hoursSinceEpoch"));
  }

  private List<String> getDimensionsFromRecord(GenericRecord record) {
    List<String> dimensionsActual = new ArrayList<>();
    dimensionsActual.add((String) record.get("d1"));
    dimensionsActual.add((String) record.get("d2"));
    dimensionsActual.add((String) record.get("d3"));
    return dimensionsActual;
  }

  private List<Integer> getMetricsFromRecord(GenericRecord record) {
    List<Integer> metricsActual = new ArrayList<>();
    metricsActual.add((int) record.get("m1"));
    metricsActual.add((int) record.get("m2"));
    return metricsActual;
  }

  @After
  public void cleanUp() throws IOException {
    File f = new File(outputPath);
    FileUtils.deleteDirectory(f);
  }
}
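
The bucket values asserted in testAggregationPhase (406058 and 406059) come from simple arithmetic: the input time column holds milliseconds since epoch, the aggregation granularity is one hour, so each record's bucket is its timestamp divided by 3,600,000 ms. Below is a minimal, self-contained sketch of that arithmetic using java.util.concurrent.TimeUnit; the class name TimeBucketSketch is introduced here purely for illustration and is not part of ThirdEye, and the actual AggregationMapper may compute the bucket differently internally.

import java.util.concurrent.TimeUnit;

public class TimeBucketSketch {
  public static void main(String[] args) {
    // The three timestamps used in generateTestMapperData(), in milliseconds since epoch.
    long[] inputMillis = {1461808800000L, 1461809400000L, 1461812400000L};
    for (long millis : inputMillis) {
      // Integer division by 3,600,000 ms: 1461808800000 -> 406058, 1461809400000 -> 406058, 1461812400000 -> 406059.
      long hoursSinceEpoch = TimeUnit.MILLISECONDS.toHours(millis);
      System.out.println(millis + " ms -> hour bucket " + hoursSinceEpoch);
    }
  }
}

Because the first two input records fall into bucket 406058 with identical dimension values (abc1, pqr1, xyz1), the reducer collapses them into one output record with m1 = 100 + 100 = 200 and m2 = 20 + 20 = 40; the third record is alone in bucket 406059, which is why the test expects exactly two reducer outputs.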