Java tutorial: DerivedColumnTransformationTest (ThirdEye Hadoop, MRUnit)
/**
 * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.thirdeye.hadoop.derivedcolumn.transformation;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import junit.framework.Assert;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.hadoop.io.AvroSerialization;
import org.apache.avro.mapred.AvroKey;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.testutil.TemporaryPath;
import org.apache.hadoop.mrunit.types.Pair;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseConstants;
import com.linkedin.thirdeye.hadoop.topk.TopKDimensionValues;
import com.linkedin.thirdeye.hadoop.util.ThirdeyeAvroUtils;

/**
 * Tests the mapper of the DerivedColumnTransformation phase, verifying that new
 * top-k columns are added to the output records according to the top-k values file.
 */
public class DerivedColumnTransformationTest {
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  private static final String HADOOP_IO_SERIALIZATION = "io.serializations";
  private static final String AVRO_SCHEMA = "schema.avsc";
  private static final String TRANSFORMATION_SCHEMA = "transformation_schema.avsc";
  private static final String TOPK_PATH = "topk_path";

  private String outputPath;
  Properties props = new Properties();

  private MapDriver<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, NullWritable> mapDriver;

  private long generateRandomHoursSinceEpoch() {
    Random r = new Random();
    // base timestamp of roughly 2011-01-01 (epoch seconds), plus up to one year of random offset
    long unixtime = (long) (1293861599 + r.nextDouble() * 60 * 60 * 24 * 365);
    return TimeUnit.SECONDS.toHours(unixtime);
  }

  private void setUpAvroSerialization(Configuration conf, Schema inputSchema) {
    // append AvroSerialization to the configured serializations and register the input schema
    String[] currentSerializations = conf.getStrings(HADOOP_IO_SERIALIZATION);
    String[] finalSerializations = new String[currentSerializations.length + 1];
    System.arraycopy(currentSerializations, 0, finalSerializations, 0, currentSerializations.length);
    finalSerializations[finalSerializations.length - 1] = AvroSerialization.class.getName();
    mapDriver.getConfiguration().setStrings(HADOOP_IO_SERIALIZATION, finalSerializations);
    AvroSerialization.addToConfiguration(conf);
    AvroSerialization.setKeyWriterSchema(conf, inputSchema);
    AvroSerialization.setValueWriterSchema(conf, Schema.create(Schema.Type.NULL));
  }

  private void resetAvroSerialization() throws IOException {
    // re-register Avro serialization with the transformation (output) schema before running the driver
    Configuration conf = mapDriver.getConfiguration();
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
    Schema outputSchema = new Schema.Parser()
        .parse(ClassLoader.getSystemResourceAsStream(TRANSFORMATION_SCHEMA));
    String[] currentSerializations = conf.getStrings(HADOOP_IO_SERIALIZATION);
    String[] finalSerializations = new String[currentSerializations.length + 1];
    System.arraycopy(currentSerializations, 0, finalSerializations, 0, currentSerializations.length);
    finalSerializations[finalSerializations.length - 1] = AvroSerialization.class.getName();
    mapDriver.getConfiguration().setStrings(HADOOP_IO_SERIALIZATION, finalSerializations);
    AvroSerialization.addToConfiguration(conf);
    AvroSerialization.setKeyWriterSchema(conf, outputSchema);
    AvroSerialization.setValueWriterSchema(conf, Schema.create(Schema.Type.NULL));
  }

  private List<GenericRecord> generateTestData() throws Exception {
    Schema schema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
    List<GenericRecord> inputRecords = new ArrayList<GenericRecord>();

    GenericRecord input = new GenericData.Record(schema);
    input.put("d1", "abc1");
    input.put("d2", "pqr1");
    input.put("d3", "xyz1");
    input.put("hoursSinceEpoch", generateRandomHoursSinceEpoch());
    input.put("m1", 10);
    input.put("m2", 20);
    inputRecords.add(input);

    input = new GenericData.Record(schema);
    input.put("d1", "abc2");
    input.put("d2", "pqr2");
    input.put("d3", "xyz2");
    input.put("hoursSinceEpoch", generateRandomHoursSinceEpoch());
    input.put("m1", 10);
    input.put("m2", 20);
    inputRecords.add(input);

    return inputRecords;
  }

  @Before
  public void setUp() throws Exception {
    DerivedColumnTransformationPhaseMapper mapper = new DerivedColumnTransformationPhaseMapper();
    mapDriver = MapDriver.newMapDriver(mapper);
    Configuration configuration = mapDriver.getConfiguration();
    configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");

    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_DIMENSION_NAMES.toString(), "d2,");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_METRICS.toString() + ".d2", "m1");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_KVALUES.toString() + ".d2", "1");
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    configuration.set(
        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
        OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    Schema inputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
    setUpAvroSerialization(mapDriver.getConfiguration(), inputSchema);

    Schema outputSchema = new Schema.Parser()
        .parse(ClassLoader.getSystemResourceAsStream(TRANSFORMATION_SCHEMA));
    configuration.set(
        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(),
        outputSchema.toString());

    configuration.set(
        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString(),
        ClassLoader.getSystemResource(TOPK_PATH).toString());

    TemporaryPath tmpPath = new TemporaryPath();
    outputPath = tmpPath.toString();
    configuration.set(
        DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH.toString(),
        outputPath);
  }

  @Test
  public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;

    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
      AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
      inKey.datum(record);
      mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
      recordCount++;
    }

    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());

    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
      GenericRecord datum = pair.getFirst().datum();
      Assert.assertEquals("TopKTransformationJob did not add new column for topk column",
          datum.getSchema().getField("d2_topk") != null, true);
      String d2 = (String) datum.get("d2");
      String d2_topk = (String) datum.get("d2_topk");
      Assert.assertEquals("Incorrect topk column transformation",
          (d2_topk.equals("other") && d2.equals("pqr1")) || (d2_topk.equals("pqr2") && d2.equals("pqr2")), true);
    }
  }

  @After
  public void cleanUp() throws IOException {
    File f = new File(outputPath);
    FileUtils.deleteDirectory(f);
  }

  public static class DerivedColumnTransformationPhaseMapper
      extends Mapper<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, NullWritable> {

    private Schema outputSchema;
    private ThirdEyeConfig thirdeyeConfig;
    private DerivedColumnTransformationPhaseConfig config;
    private List<String> dimensionsNames;
    private List<String> metricNames;
    private TopKDimensionValues topKDimensionValues;
    private Map<String, Set<String>> topKDimensionsMap;
    private String timeColumnName;
    private Map<String, Set<String>> whitelist;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      Configuration configuration = context.getConfiguration();
      FileSystem fs = FileSystem.get(configuration);

      thirdeyeConfig = OBJECT_MAPPER.readValue(configuration.get(
          DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString()),
          ThirdEyeConfig.class);
      config = DerivedColumnTransformationPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
      dimensionsNames = config.getDimensionNames();
      metricNames = config.getMetricNames();
      timeColumnName = config.getTimeColumnName();
      whitelist = config.getWhitelist();

      outputSchema = new Schema.Parser().parse(configuration.get(
          DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString()));

      Path topKPath = new Path(configuration.get(
          DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString())
          + File.separator + ThirdEyeConstants.TOPK_VALUES_FILE);
      topKDimensionValues = new TopKDimensionValues();
      if (fs.exists(topKPath)) {
        FSDataInputStream topkValuesStream = fs.open(topKPath);
        topKDimensionValues = OBJECT_MAPPER.readValue((DataInput) topkValuesStream, TopKDimensionValues.class);
        topkValuesStream.close();
      }
      topKDimensionsMap = topKDimensionValues.getTopKDimensions();
    }

    @Override
    public void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
        throws IOException, InterruptedException {
      // input record
      GenericRecord inputRecord = key.datum();

      // output record
      GenericRecord outputRecord = new Record(outputSchema);

      // dimensions
      for (String dimension : dimensionsNames) {
        String dimensionName = dimension;
        String dimensionValue = ThirdeyeAvroUtils.getDimensionFromRecord(inputRecord, dimension);

        // add original dimension value with whitelist applied
        String whitelistDimensionValue = dimensionValue;
        if (whitelist != null) {
          Set<String> whitelistDimensions = whitelist.get(dimensionName);
          if (CollectionUtils.isNotEmpty(whitelistDimensions)) {
            // whitelist config exists for this dimension but value not present in whitelist
            if (!whitelistDimensions.contains(dimensionValue)) {
              whitelistDimensionValue = ThirdEyeConstants.OTHER;
            }
          }
        }
        outputRecord.put(dimensionName, whitelistDimensionValue);

        // add column for topk, if topk config exists for that column
        if (topKDimensionsMap.containsKey(dimensionName)) {
          Set<String> topKDimensionValues = topKDimensionsMap.get(dimensionName);
          // if topk config exists for that dimension
          if (CollectionUtils.isNotEmpty(topKDimensionValues)) {
            String topkDimensionName = dimensionName + ThirdEyeConstants.TOPK_DIMENSION_SUFFIX;
            String topkDimensionValue = dimensionValue;
            // topk config exists for this dimension, but value not present in topk
            if (!topKDimensionValues.contains(dimensionValue)
                && (whitelist == null || whitelist.get(dimensionName) == null
                    || !whitelist.get(dimensionName).contains(dimensionValue))) {
              topkDimensionValue = ThirdEyeConstants.OTHER;
            }
            outputRecord.put(topkDimensionName, topkDimensionValue);
          }
        }
      }

      // metrics
      for (String metric : metricNames) {
        outputRecord.put(metric, ThirdeyeAvroUtils.getMetricFromRecord(inputRecord, metric));
      }

      // time
      outputRecord.put(timeColumnName, ThirdeyeAvroUtils.getMetricFromRecord(inputRecord, timeColumnName));

      AvroKey<GenericRecord> outputKey = new AvroKey<GenericRecord>(outputRecord);
      context.write(outputKey, NullWritable.get());
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
    }
  }
}
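The test depends on three classpath resources that are not shown here: schema.avsc (the input Avro schema), transformation_schema.avsc (the output schema containing the derived top-k column), and a topk_path directory holding the serialized top-k values file. As a rough sketch of what the two schemas presumably contain, inferred only from the fields the test populates and asserts on (so the real resources may differ), the snippet below builds equivalent schemas programmatically with Avro's SchemaBuilder. The class name SchemaSketch is purely illustrative and not part of the project.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class SchemaSketch {
  public static void main(String[] args) {
    // Hypothetical equivalent of the schema.avsc resource: three string dimensions,
    // a long time column, and two int metrics, matching generateTestData().
    Schema inputSchema = SchemaBuilder.record("TestRecord")
        .namespace("com.linkedin.thirdeye.test")
        .fields()
        .requiredString("d1")
        .requiredString("d2")
        .requiredString("d3")
        .requiredLong("hoursSinceEpoch")
        .requiredInt("m1")
        .requiredInt("m2")
        .endRecord();

    // Hypothetical equivalent of transformation_schema.avsc: the same record plus the
    // derived "d2_topk" column the mapper is expected to add for the top-k dimension d2.
    // The actual field order in the resource is unknown.
    Schema outputSchema = SchemaBuilder.record("TestRecord")
        .namespace("com.linkedin.thirdeye.test")
        .fields()
        .requiredString("d1")
        .requiredString("d2")
        .requiredString("d2_topk")
        .requiredString("d3")
        .requiredLong("hoursSinceEpoch")
        .requiredInt("m1")
        .requiredInt("m2")
        .endRecord();

    // Print the JSON forms, which is roughly what the .avsc resources would contain.
    System.out.println(inputSchema.toString(true));
    System.out.println(outputSchema.toString(true));
  }
}

With schemas shaped like these, and a top-k values file whose d2 entry keeps only "pqr2", the mapper should emit d2_topk = "pqr2" for the record whose d2 is "pqr2" and d2_topk = "other" for the record whose d2 is "pqr1", which is exactly what testTopKColumnTransformationPhase asserts.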