com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseTest.java Source code


Introduction

Here is the source code for com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseTest.java, an MRUnit test that exercises the mapper and reducer of the ThirdEye aggregation phase.

Source

/**
 * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.thirdeye.hadoop.aggregation;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.hadoop.io.AvroSerialization;
import org.apache.avro.mapred.AvroKey;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.hadoop.mrunit.testutil.TemporaryPath;
import org.apache.hadoop.mrunit.types.Pair;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob.AggregationMapper;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob.AggregationReducer;

/**
 * Tests the mapper of the aggregation phase, which converts the raw time column value
 * into a bucketed time, and the reducer, which aggregates metric values using the new
 * time buckets.
 */
public class AggregationPhaseTest {
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static final String HADOOP_IO_SERIALIZATION = "io.serializations";
    private static final String AVRO_SCHEMA = "schema.avsc";

    private String outputPath;
    private Schema inputSchema;
    private ThirdEyeConfig thirdeyeConfig;
    Properties props = new Properties();

    private MapDriver<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> mapDriver;
    private ReduceDriver<BytesWritable, BytesWritable, AvroKey<GenericRecord>, NullWritable> reduceDriver;

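    // Appends Avro serialization to Hadoop's io.serializations and registers the writer
    // schemas, so MRUnit can (de)serialize the AvroKey<GenericRecord> mapper input.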
    private void setUpAvroSerialization(Configuration conf, Schema inputSchema) {
        String[] currentSerializations = conf.getStrings(HADOOP_IO_SERIALIZATION);
        String[] finalSerializations = new String[currentSerializations.length + 1];
        System.arraycopy(currentSerializations, 0, finalSerializations, 0, currentSerializations.length);
        finalSerializations[finalSerializations.length - 1] = AvroSerialization.class.getName();
        conf.setStrings(HADOOP_IO_SERIALIZATION, finalSerializations);

        AvroSerialization.addToConfiguration(conf);
        AvroSerialization.setKeyWriterSchema(conf, inputSchema);
        AvroSerialization.setValueWriterSchema(conf, Schema.create(Schema.Type.NULL));
    }

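    // Builds three Avro records whose "hoursSinceEpoch" field actually holds epoch
    // milliseconds: the first two share dimension values and fall into the same hourly
    // bucket, while the third belongs to the next hour.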
    private List<GenericRecord> generateTestMapperData() throws Exception {
        List<GenericRecord> inputRecords = new ArrayList<GenericRecord>();

        // 2016-04-27T190000
        GenericRecord input = new GenericData.Record(inputSchema);
        input.put("d1", "abc1");
        input.put("d2", "pqr1");
        input.put("d3", "xyz1");
        input.put("hoursSinceEpoch", 1461808800000L);
        input.put("m1", 100);
        input.put("m2", 20);
        inputRecords.add(input);

        // 2016-04-27T191000
        input = new GenericData.Record(inputSchema);
        input.put("d1", "abc1");
        input.put("d2", "pqr1");
        input.put("d3", "xyz1");
        input.put("hoursSinceEpoch", 1461809400000L);
        input.put("m1", 100);
        input.put("m2", 20);
        inputRecords.add(input);

        // 2016-04-27T200000
        input = new GenericData.Record(inputSchema);
        input.put("d1", "abc2");
        input.put("d2", "pqr2");
        input.put("d3", "xyz2");
        input.put("hoursSinceEpoch", 1461812400000L);
        input.put("m1", 10);
        input.put("m2", 2);
        inputRecords.add(input);

        return inputRecords;
    }

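    // Groups the mapper output pairs by key, mimicking the shuffle/sort step, so the
    // reducer driver can be fed key -> list-of-values inputs.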
    private List<Pair<BytesWritable, List<BytesWritable>>> generateTestReduceData(
            List<Pair<BytesWritable, BytesWritable>> result) throws Exception {
        List<Pair<BytesWritable, List<BytesWritable>>> inputRecords = new ArrayList<>();
        Map<BytesWritable, List<BytesWritable>> inputMap = new TreeMap<>();

        for (Pair<BytesWritable, BytesWritable> pair : result) {
            inputMap.put(pair.getFirst(), new ArrayList<BytesWritable>());
        }

        for (Pair<BytesWritable, BytesWritable> pair : result) {
            inputMap.get(pair.getFirst()).add(pair.getSecond());
        }
        for (Entry<BytesWritable, List<BytesWritable>> listPair : inputMap.entrySet()) {
            inputRecords.add(new Pair<BytesWritable, List<BytesWritable>>(listPair.getKey(), listPair.getValue()));
        }
        return inputRecords;
    }

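    // Configures a ThirdEye collection with three dimensions (d1, d2, d3), two INT
    // metrics (m1, m2), and a time column whose raw values are epoch milliseconds
    // aggregated into one-hour buckets.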
    @Before
    public void setUp() throws Exception {

        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection");
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3");
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2");
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT");
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch");
        // Output (aggregated) time granularity: one-hour buckets.
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_SIZE.toString(), "1");
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_TYPE.toString(),
                TimeUnit.HOURS.toString());
        // Input time column granularity: the raw values are epoch milliseconds.
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_INPUT_TIMECOLUMN_SIZE.toString(), "1");
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_INPUT_TIMECOLUMN_TYPE.toString(),
                TimeUnit.MILLISECONDS.toString());
        thirdeyeConfig = ThirdEyeConfig.fromProperties(props);

        // Mapper config
        AggregationMapper mapper = new AggregationMapper();
        mapDriver = MapDriver.newMapDriver(mapper);
        Configuration configuration = mapDriver.getConfiguration();
        configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        configuration.set(AggregationPhaseConstants.AGG_PHASE_THIRDEYE_CONFIG.toString(),
                OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

        inputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
        setUpAvroSerialization(mapDriver.getConfiguration(), inputSchema);

        // Reducer config
        AggregationReducer reducer = new AggregationReducer();
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
        configuration = reduceDriver.getConfiguration();
        configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        Schema avroSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
        configuration.set(AggregationPhaseConstants.AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

        configuration.set(AggregationPhaseConstants.AGG_PHASE_THIRDEYE_CONFIG.toString(),
                OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

        TemporaryPath tmpPath = new TemporaryPath();
        outputPath = tmpPath.toString();
        configuration.set(AggregationPhaseConstants.AGG_PHASE_OUTPUT_PATH.toString(), outputPath);
        setUpAvroSerialization(reduceDriver.getConfiguration(), inputSchema);

    }

    @Test
    public void testAggregationPhase() throws Exception {

        int recordCount = 0;
        List<GenericRecord> inputRecords = generateTestMapperData();
        for (GenericRecord record : inputRecords) {
            AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
            inKey.datum(record);
            mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
            recordCount++;
        }

        List<Pair<BytesWritable, BytesWritable>> mapResult = mapDriver.run();
        Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount, mapResult.size());

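        // The mapper converts the millisecond timestamps into hour buckets (ms / 3600000):
        // 1461808800000 and 1461809400000 both map to bucket 406058, 1461812400000 to 406059.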
        AggregationPhaseMapOutputKey keyWrapper = AggregationPhaseMapOutputKey
                .fromBytes(mapResult.get(0).getFirst().getBytes());
        Assert.assertEquals(406058, keyWrapper.getTime());
        keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(1).getFirst().getBytes());
        Assert.assertEquals(406058, keyWrapper.getTime());
        keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(2).getFirst().getBytes());
        Assert.assertEquals(406059, keyWrapper.getTime());

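        // Mapper outputs with the same dimensions and time bucket are grouped into a single
        // reduce call, so the reducer emits two aggregated records: (abc1, pqr1, xyz1) with
        // summed metrics (200, 40) and (abc2, pqr2, xyz2) with metrics (10, 2).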
        List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(mapResult);
        reduceDriver.addAll(reduceInput);

        List<Pair<AvroKey<GenericRecord>, NullWritable>> reduceResult = reduceDriver.run();
        Assert.assertEquals("Incorrect number of records returned by aggregation reducer", 2, reduceResult.size());

        GenericRecord record = reduceResult.get(0).getFirst().datum();
        List<String> dimensionsExpected = Lists.newArrayList("abc1", "pqr1", "xyz1");
        List<String> dimensionsActual = getDimensionsFromRecord(record);
        Assert.assertEquals(dimensionsExpected, dimensionsActual);
        List<Integer> metricsExpected = Lists.newArrayList(200, 40);
        List<Integer> metricsActual = getMetricsFromRecord(record);
        Assert.assertEquals(metricsExpected, metricsActual);
        Assert.assertEquals(406058, (long) record.get("hoursSinceEpoch"));

        record = reduceResult.get(1).getFirst().datum();
        dimensionsExpected = Lists.newArrayList("abc2", "pqr2", "xyz2");
        dimensionsActual = getDimensionsFromRecord(record);
        Assert.assertEquals(dimensionsExpected, dimensionsActual);
        metricsExpected = Lists.newArrayList(10, 2);
        metricsActual = getMetricsFromRecord(record);
        Assert.assertEquals(metricsExpected, metricsActual);
        Assert.assertEquals(406059, (long) record.get("hoursSinceEpoch"));
    }

    private List<String> getDimensionsFromRecord(GenericRecord record) {
        List<String> dimensionsActual = new ArrayList<>();
        dimensionsActual.add((String) record.get("d1"));
        dimensionsActual.add((String) record.get("d2"));
        dimensionsActual.add((String) record.get("d3"));
        return dimensionsActual;
    }

    private List<Integer> getMetricsFromRecord(GenericRecord record) {
        List<Integer> metricsActual = new ArrayList<>();
        metricsActual.add((int) record.get("m1"));
        metricsActual.add((int) record.get("m2"));
        return metricsActual;
    }

    @After
    public void cleanUp() throws IOException {

        File f = new File(outputPath);
        FileUtils.deleteDirectory(f);
    }
}