com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob.java Source code

Introduction

Here is the source code for com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob.java, a Hadoop MapReduce job that reads Avro records and writes a file listing the top k values of each dimension, ranked by metric value.
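
A minimal driver sketch, assuming the job is launched from plain Java. The property keys come from the TopKPhaseConstants and ThirdEyeConfigProperties enums referenced in the listing; the paths, metric names, and metric types are hypothetical, and ThirdEyeConfig.fromProperties typically needs further properties (such as table and dimension names) that are omitted here:

import java.util.Properties;

import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties;
import com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants;
import com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob;

public class TopKPhaseJobRunner {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // where the avro input lives and where the top k file should go (hypothetical paths)
        props.setProperty(TopKPhaseConstants.TOPK_PHASE_INPUT_PATH.toString(), "/data/thirdeye/input");
        props.setProperty(TopKPhaseConstants.TOPK_PHASE_OUTPUT_PATH.toString(), "/data/thirdeye/topk");
        // hypothetical metric names and types
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "pageViews,clicks");
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "LONG,LONG");

        new TopKPhaseJob("topk_phase", props).run();
    }
}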

Source

/**
 * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.thirdeye.hadoop.topk;

import static com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants.TOPK_PHASE_INPUT_PATH;
import static com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants.TOPK_PHASE_OUTPUT_PATH;
import static com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants.TOPK_PHASE_THIRDEYE_CONFIG;

import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

import com.linkedin.thirdeye.hadoop.config.MetricType;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.config.TopKDimensionToMetricsSpec;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig;
import com.linkedin.thirdeye.hadoop.util.ThirdeyeAggregateMetricUtils;
import com.linkedin.thirdeye.hadoop.util.ThirdeyeAvroUtils;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.commons.collections.MapUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.MinMaxPriorityQueue;

/**
 * This phase reads Avro input and produces a file containing the top k values
 * for each dimension.
 *
 * Map:
 * The map phase reads Avro records and, for each record, emits
 * Key=(Dimension name, Dimension value) Value=(Metrics) for every dimension.
 * For each record, the map also emits a
 * Key=(ALL, ALL) Value=(Metrics) pair,
 * which is used to compute the total metric sums in the reduce phase.
 *
 * Combine:
 * The combine phase receives Key=(Dimension name, Dimension value) pairs
 * from each map and aggregates their metric values. This reduces
 * the traffic sent to the reducer.
 *
 * Reduce:
 * Exactly one reducer is used.
 * The reduce phase receives Key=(Dimension name, Dimension value) pairs
 * and aggregates their metric values.
 * The very first key received is (ALL, ALL), carrying the total metric sums.
 * These sums are used to check the metric thresholds of every other
 * (dimension name, dimension value) pair; if none of the metric
 * thresholds pass, the pair is discarded.
 * In cleanup, the top k values are picked for each dimension based on
 * the metric values, and written to a file.
 */
public class TopKPhaseJob extends Configured {
    private static final Logger LOGGER = LoggerFactory.getLogger(TopKPhaseJob.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
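    // Sentinel (dimension name, dimension value) pair used as the key for global
    // metric sums; this assumes "0" never occurs as a real dimension name.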
    private static final String TOPK_ALL_DIMENSION_NAME = "0";
    private static final String TOPK_ALL_DIMENSION_VALUE = "0";

    private String name;
    private Properties props;

    /**
     * @param name job name
     * @param props properties defining the job configuration
     */
    public TopKPhaseJob(String name, Properties props) {
        super(new Configuration());
        this.name = name;
        this.props = props;
    }

    public static class TopKPhaseMapper
            extends Mapper<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> {

        private TopKPhaseConfig config;
        ThirdEyeConfig thirdeyeConfig;
        private List<String> dimensionNames;
        private List<String> metricNames;
        private List<MetricType> metricTypes;
        private int numMetrics;
        BytesWritable keyWritable;
        BytesWritable valWritable;

        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            LOGGER.info("TopKPhaseJob.TopKPhaseMapper.setup()");
            Configuration configuration = context.getConfiguration();
            try {
                thirdeyeConfig = OBJECT_MAPPER.readValue(configuration.get(TOPK_PHASE_THIRDEYE_CONFIG.toString()),
                        ThirdEyeConfig.class);
                config = TopKPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
                dimensionNames = config.getDimensionNames();
                metricNames = config.getMetricNames();
                metricTypes = config.getMetricTypes();
                numMetrics = metricNames.size();
                valWritable = new BytesWritable();
                keyWritable = new BytesWritable();
            } catch (Exception e) {
                throw new IOException(e);
            }
        }

        @Override
        public void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
                throws IOException, InterruptedException {

            // input record
            GenericRecord inputRecord = key.datum();

            // read metrics
            Number[] metricValues = new Number[numMetrics];
            for (int i = 0; i < numMetrics; i++) {
                String metricName = metricNames.get(i);
                Number metricValue = ThirdeyeAvroUtils.getMetricFromRecord(inputRecord, metricName);
                metricValues[i] = metricValue;
            }
            TopKPhaseMapOutputValue valWrapper = new TopKPhaseMapOutputValue(metricValues, metricTypes);
            byte[] valBytes = valWrapper.toBytes();
            valWritable.set(valBytes, 0, valBytes.length);

            // emit (dimension name, dimension value) -> metrics for every dimension
            for (String dimensionName : dimensionNames) {
                String dimensionValue = ThirdeyeAvroUtils.getDimensionFromRecord(inputRecord, dimensionName);

                TopKPhaseMapOutputKey keyWrapper = new TopKPhaseMapOutputKey(dimensionName, dimensionValue);
                byte[] keyBytes = keyWrapper.toBytes();
                keyWritable.set(keyBytes, 0, keyBytes.length);
                context.write(keyWritable, valWritable);
            }

            // emit the (ALL, ALL) sentinel once per record; the reducer uses it
            // to compute the total metric sums (see class javadoc)
            TopKPhaseMapOutputKey allKeyWrapper = new TopKPhaseMapOutputKey(TOPK_ALL_DIMENSION_NAME,
                    TOPK_ALL_DIMENSION_VALUE);
            byte[] allKeyBytes = allKeyWrapper.toBytes();
            keyWritable.set(allKeyBytes, 0, allKeyBytes.length);
            context.write(keyWritable, valWritable);
        }

    }

    public static class TopKPhaseCombiner
            extends Reducer<BytesWritable, BytesWritable, BytesWritable, BytesWritable> {

        private TopKPhaseConfig config;
        ThirdEyeConfig thirdeyeConfig;
        private List<MetricType> metricTypes;
        private int numMetrics;
        BytesWritable valWritable;

        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            LOGGER.info("TopKPhaseJob.TopKPhaseCombiner.setup()");
            Configuration configuration = context.getConfiguration();
            try {
                thirdeyeConfig = OBJECT_MAPPER.readValue(configuration.get(TOPK_PHASE_THIRDEYE_CONFIG.toString()),
                        ThirdEyeConfig.class);
                config = TopKPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
                metricTypes = config.getMetricTypes();
                numMetrics = metricTypes.size();
                valWritable = new BytesWritable();

            } catch (Exception e) {
                throw new IOException(e);
            }
        }

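        // Running the reducer-style aggregation as a combiner pre-sums metric
        // values map-side per (dimension name, dimension value) key, shrinking
        // the data shuffled to the single reducer.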
        @Override
        public void reduce(BytesWritable key, Iterable<BytesWritable> values, Context context)
                throws IOException, InterruptedException {

            Number[] aggMetricValues = new Number[numMetrics];
            Arrays.fill(aggMetricValues, 0);

            for (BytesWritable value : values) {
                TopKPhaseMapOutputValue valWrapper = TopKPhaseMapOutputValue.fromBytes(value.getBytes(),
                        metricTypes);
                Number[] metricValues = valWrapper.getMetricValues();
                ThirdeyeAggregateMetricUtils.aggregate(metricTypes, aggMetricValues, metricValues);
            }

            TopKPhaseMapOutputValue valWrapper = new TopKPhaseMapOutputValue(aggMetricValues, metricTypes);
            byte[] valBytes = valWrapper.toBytes();
            valWritable.set(valBytes, 0, valBytes.length);

            context.write(key, valWritable);
        }
    }

    public static class TopKPhaseReducer extends Reducer<BytesWritable, BytesWritable, NullWritable, NullWritable> {

        private FileSystem fileSystem;
        private Configuration configuration;

        private ThirdEyeConfig thirdeyeConfig;
        private TopKPhaseConfig config;
        private List<String> dimensionNames;
        private List<String> metricNames;
        private List<MetricType> metricTypes;
        private Map<String, Integer> metricToIndexMapping;
        private int numMetrics;
        Number[] metricSums;
        private Map<String, Map<String, Number[]>> dimensionNameToValuesMap;
        private TopKDimensionValues topkDimensionValues;
        private Map<String, Double> metricThresholds;
        private Map<String, Integer> thresholdPassCount;
        private Map<String, TopKDimensionToMetricsSpec> topKDimensionToMetricsSpecMap;

        @Override
        public void setup(Context context) throws IOException, InterruptedException {

            LOGGER.info("TopKPhaseJob.TopKPhaseReducer.setup()");

            configuration = context.getConfiguration();
            fileSystem = FileSystem.get(configuration);
            try {
                thirdeyeConfig = OBJECT_MAPPER.readValue(configuration.get(TOPK_PHASE_THIRDEYE_CONFIG.toString()),
                        ThirdEyeConfig.class);
                config = TopKPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
                metricThresholds = config.getMetricThresholds();
                topKDimensionToMetricsSpecMap = config.getTopKDimensionToMetricsSpec();
                dimensionNames = config.getDimensionNames();
                metricNames = config.getMetricNames();
                metricTypes = config.getMetricTypes();

                numMetrics = metricNames.size();

                metricToIndexMapping = new HashMap<>();
                for (int i = 0; i < numMetrics; i++) {
                    metricToIndexMapping.put(metricNames.get(i), i);
                }

                dimensionNameToValuesMap = new HashMap<>();
                thresholdPassCount = new HashMap<>();
                for (String dimension : dimensionNames) {
                    dimensionNameToValuesMap.put(dimension, new HashMap<String, Number[]>());
                    thresholdPassCount.put(dimension, 0);
                }
                topkDimensionValues = new TopKDimensionValues();

            } catch (Exception e) {
                throw new IOException(e);
            }
        }

        @Override
        public void reduce(BytesWritable key, Iterable<BytesWritable> values, Context context)
                throws IOException, InterruptedException {

            TopKPhaseMapOutputKey keyWrapper = TopKPhaseMapOutputKey.fromBytes(key.getBytes());
            String dimensionName = keyWrapper.getDimensionName();
            String dimensionValue = keyWrapper.getDimensionValue();

            // Get aggregate metric values for dimension name value pair
            Number[] aggMetricValues = new Number[numMetrics];
            Arrays.fill(aggMetricValues, 0);
            for (BytesWritable value : values) {
                TopKPhaseMapOutputValue valWrapper = TopKPhaseMapOutputValue.fromBytes(value.getBytes(),
                        metricTypes);
                Number[] metricValues = valWrapper.getMetricValues();
                ThirdeyeAggregateMetricUtils.aggregate(metricTypes, aggMetricValues, metricValues);
            }

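            // Keys reach the single reducer in sorted key order; the sentinel
            // ("0", "0") key is assumed to sort before all real dimension names,
            // so metricSums is populated before the threshold checks below.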
            // Metric sums case
            if (dimensionName.equals(TOPK_ALL_DIMENSION_NAME) && dimensionValue.equals(TOPK_ALL_DIMENSION_VALUE)) {
                LOGGER.info("Setting metric sums");
                metricSums = Arrays.copyOf(aggMetricValues, numMetrics);
                return;
            }

            // Check the metric percentage thresholds. A (dimension, value) pair
            // survives if any metric exceeds its threshold percentage of the total:
            // e.g. with a threshold of 5 and a metric sum of 1000, the pair's
            // aggregate for that metric must exceed 50.
            if (MapUtils.isNotEmpty(metricThresholds)) {
                boolean isPassThreshold = false;
                for (int i = 0; i < numMetrics; i++) {
                    String metric = metricNames.get(i);
                    double metricValue = aggMetricValues[i].doubleValue();
                    double metricSum = metricSums[i].doubleValue();
                    double metricThresholdPercentage = metricThresholds.get(metric);
                    if (metricValue > (metricSum * metricThresholdPercentage / 100)) {
                        isPassThreshold = true;
                        thresholdPassCount.put(dimensionName, thresholdPassCount.get(dimensionName) + 1);
                        break;
                    }
                }
                if (!isPassThreshold) {
                    return;
                }
            }
            // record the surviving pair; when no thresholds are configured, every pair is kept
            dimensionNameToValuesMap.get(dimensionName).put(dimensionValue, aggMetricValues);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {

            for (String dimension : dimensionNames) {

                LOGGER.info("{} records passed metric threshold for dimension {}",
                        thresholdPassCount.get(dimension), dimension);

                // Get top k
                TopKDimensionToMetricsSpec topkSpec = topKDimensionToMetricsSpecMap.get(dimension);
                if (topkSpec != null && topkSpec.getDimensionName() != null && topkSpec.getTopk() != null) {

                    // Get top k for each metric specified
                    Map<String, Integer> topkMetricsMap = topkSpec.getTopk();
                    for (Entry<String, Integer> topKEntry : topkMetricsMap.entrySet()) {

                        String metric = topKEntry.getKey();
                        int k = topKEntry.getValue();
                        MinMaxPriorityQueue<DimensionValueMetricPair> topKQueue = MinMaxPriorityQueue.maximumSize(k)
                                .create();
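                        // MinMaxPriorityQueue evicts its greatest element once
                        // maximumSize(k) is exceeded; DimensionValueMetricPair
                        // (not shown in this listing) presumably orders pairs with
                        // larger metric values first, so the k largest survive.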

                        Map<String, Number[]> dimensionToMetricsMap = dimensionNameToValuesMap.get(dimension);
                        for (Entry<String, Number[]> entry : dimensionToMetricsMap.entrySet()) {
                            topKQueue.add(new DimensionValueMetricPair(entry.getKey(),
                                    entry.getValue()[metricToIndexMapping.get(metric)]));
                        }
                        LOGGER.info("Picking Top {} values for {} based on Metric {} : {}", k, dimension, metric,
                                topKQueue);
                        for (DimensionValueMetricPair pair : topKQueue) {
                            topkDimensionValues.addValue(dimension, pair.getDimensionValue());
                        }
                    }
                }
            }

            if (topkDimensionValues.getTopKDimensions().size() > 0) {
                String topkValuesPath = configuration.get(TOPK_PHASE_OUTPUT_PATH.toString());
                LOGGER.info("Writing top k values to {}", topkValuesPath);
                try (FSDataOutputStream topKDimensionValuesOutputStream = fileSystem
                        .create(new Path(topkValuesPath + File.separator + ThirdEyeConstants.TOPK_VALUES_FILE))) {
                    OBJECT_MAPPER.writeValue((DataOutput) topKDimensionValuesOutputStream, topkDimensionValues);
                }
            }
        }
    }

    public Job run() throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJobName(name);
        job.setJarByClass(TopKPhaseJob.class);

        Configuration configuration = job.getConfiguration();
        FileSystem fs = FileSystem.get(configuration);

        // Properties
        LOGGER.info("Properties {}", props);

        // Input Path
        String inputPathDir = getAndSetConfiguration(configuration, TOPK_PHASE_INPUT_PATH);
        LOGGER.info("Input path dir: " + inputPathDir);
        for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
            LOGGER.info("Adding input:" + inputPath);
            Path input = new Path(inputPath);
            FileInputFormat.addInputPath(job, input);
        }

        // Output path
        Path outputPath = new Path(getAndSetConfiguration(configuration, TOPK_PHASE_OUTPUT_PATH));
        LOGGER.info("Output path dir: " + outputPath.toString());
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // Schema
        Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
        LOGGER.info("Schema : {}", avroSchema.toString(true));

        // ThirdEyeConfig
        String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
                props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
                props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
        props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
        ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
        LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
        job.getConfiguration().set(TOPK_PHASE_THIRDEYE_CONFIG.toString(),
                OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
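        // each map/combine/reduce task rebuilds TopKPhaseConfig from this JSON in its setup()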

        // Map config
        job.setMapperClass(TopKPhaseMapper.class);
        job.setInputFormatClass(AvroKeyInputFormat.class);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);

        // Combiner
        job.setCombinerClass(TopKPhaseCombiner.class);

        // Reduce config
        job.setReducerClass(TopKPhaseReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
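        // a single reducer guarantees that the (ALL, ALL) metric sums and every
        // (dimension name, dimension value) key arrive at the same task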
        job.setNumReduceTasks(1);

        job.waitForCompletion(true);

        return job;
    }

    private String getAndSetConfiguration(Configuration configuration, TopKPhaseConstants constant) {
        String value = getAndCheck(constant.toString());
        configuration.set(constant.toString(), value);
        return value;
    }

    private String getAndCheck(String propName) {
        String propValue = props.getProperty(propName);
        if (propValue == null) {
            throw new IllegalArgumentException(propName + " is a required property");
        }
        return propValue;
    }

}
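
The job's only artifact is the JSON file written in the reducer's cleanup. Here is a minimal sketch of reading it back, assuming TopKDimensionValues round-trips through Jackson (which the writeValue call above implies); the output path is hypothetical and must match the TOPK_PHASE_OUTPUT_PATH given to the job:

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.topk.TopKDimensionValues;

public class TopKValuesReader {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // hypothetical path; must match the job's output directory
        Path topkFile = new Path("/data/thirdeye/topk/" + ThirdEyeConstants.TOPK_VALUES_FILE);
        try (InputStream in = fs.open(topkFile)) {
            TopKDimensionValues topk = new ObjectMapper().readValue(in, TopKDimensionValues.class);
            // getTopKDimensions() is the accessor the reducer itself uses above
            System.out.println(topk.getTopKDimensions());
        }
    }
}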