Java tutorial: AggregationPhaseJob, a Hadoop MapReduce job from LinkedIn's ThirdEye that buckets Avro records into a configured time granularity and aggregates their metrics.
/**
 * Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.thirdeye.hadoop.aggregation;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.linkedin.thirdeye.hadoop.ThirdEyeJobProperties;
import com.linkedin.thirdeye.hadoop.config.MetricType;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.config.TimeGranularity;
import com.linkedin.thirdeye.hadoop.config.TimeSpec;
import com.linkedin.thirdeye.hadoop.util.ThirdeyeAggregateMetricUtils;
import com.linkedin.thirdeye.hadoop.util.ThirdeyeAvroUtils;

import static com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseConstants.*;

/**
 * Buckets input avro data according to granularity specified in config and aggregates metrics
 * Mapper:
 * Converts time column into bucket granularity
 * Reducer:
 * Aggregates all records with same dimensions in one time bucket
 */
public class AggregationPhaseJob extends Configured {
  private static final Logger LOGGER = LoggerFactory.getLogger(AggregationPhaseJob.class);
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  private String name;
  private Properties props;

  public AggregationPhaseJob(String name, Properties props) {
    super(new Configuration());
    this.name = name;
    this.props = props;
  }

  public static class AggregationMapper
      extends Mapper<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> {

    private ThirdEyeConfig thirdeyeConfig;
    private AggregationPhaseConfig config;
    private List<String> dimensionsNames;
    private List<String> metricNames;
    List<MetricType> metricTypes;
    private int numMetrics;
    private String timeColumnName;
    private TimeGranularity inputGranularity;
    private TimeGranularity aggregateGranularity;
    private BytesWritable keyWritable;
    private BytesWritable valWritable;
    private int numRecords;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      LOGGER.info("AggregationPhaseJob.AggregationPhaseMapper.setup()");
      Configuration configuration = context.getConfiguration();
      thirdeyeConfig = OBJECT_MAPPER.readValue(
          configuration.get(AGG_PHASE_THIRDEYE_CONFIG.toString()), ThirdEyeConfig.class);
      config = AggregationPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
      dimensionsNames = config.getDimensionNames();
      metricNames = config.getMetricNames();
      numMetrics = metricNames.size();
      metricTypes = config.getMetricTypes();
      timeColumnName = config.getTime().getColumnName();
      inputGranularity = config.getInputTime().getTimeGranularity();
      aggregateGranularity = config.getTime().getTimeGranularity();
      keyWritable = new BytesWritable();
      valWritable = new BytesWritable();
      numRecords = 0;
    }

    @Override
    public void map(AvroKey<GenericRecord> record, NullWritable value, Context context)
        throws IOException, InterruptedException {

      // input record
      GenericRecord inputRecord = record.datum();

      // dimensions
      List<String> dimensions = new ArrayList<>();
      for (String dimension : dimensionsNames) {
        String dimensionValue = ThirdeyeAvroUtils.getDimensionFromRecord(inputRecord, dimension);
        dimensions.add(dimensionValue);
      }

      // metrics
      Number[] metrics = new Number[numMetrics];
      for (int i = 0; i < numMetrics; i++) {
        Number metricValue =
            ThirdeyeAvroUtils.getMetricFromRecord(inputRecord, metricNames.get(i), metricTypes.get(i));
        metrics[i] = metricValue;
      }

      // time
      long timeValue = ThirdeyeAvroUtils.getMetricFromRecord(inputRecord, timeColumnName).longValue();
      long inputTimeMillis = inputGranularity.toMillis(timeValue);
      long bucketTime = aggregateGranularity.convertToUnit(inputTimeMillis);

      AggregationPhaseMapOutputKey keyWrapper = new AggregationPhaseMapOutputKey(bucketTime, dimensions);
      byte[] keyBytes = keyWrapper.toBytes();
      keyWritable.set(keyBytes, 0, keyBytes.length);

      AggregationPhaseMapOutputValue valWrapper = new AggregationPhaseMapOutputValue(metrics, metricTypes);
      byte[] valBytes = valWrapper.toBytes();
      valWritable.set(valBytes, 0, valBytes.length);

      numRecords++;
      context.write(keyWritable, valWritable);
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      context.getCounter(AggregationCounter.NUMBER_OF_RECORDS).increment(numRecords);
    }
  }
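  /*
   * Shuffle contract between AggregationMapper and AggregationReducer:
   *   key   = AggregationPhaseMapOutputKey(bucketTime, dimensionValues).toBytes()
   *   value = AggregationPhaseMapOutputValue(metricValues, metricTypes).toBytes()
   * All map outputs that share the same (bucketTime, dimension values) key are grouped by the
   * shuffle and summed in the reducer below via ThirdeyeAggregateMetricUtils.aggregate().
   */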
  public static class AggregationReducer
      extends Reducer<BytesWritable, BytesWritable, AvroKey<GenericRecord>, NullWritable> {

    private Schema avroSchema;
    private ThirdEyeConfig thirdeyeConfig;
    private AggregationPhaseConfig config;
    private List<String> dimensionsNames;
    private List<String> metricNames;
    List<MetricType> metricTypes;
    private int numMetrics;
    private TimeSpec time;
    private int numRecords;
    private Number[] metricSums;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      LOGGER.info("AggregationPhaseJob.AggregationPhaseReducer.setup()");
      Configuration configuration = context.getConfiguration();
      thirdeyeConfig = OBJECT_MAPPER.readValue(
          configuration.get(AGG_PHASE_THIRDEYE_CONFIG.toString()), ThirdEyeConfig.class);
      config = AggregationPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
      dimensionsNames = config.getDimensionNames();
      metricNames = config.getMetricNames();
      numMetrics = metricNames.size();
      metricTypes = config.getMetricTypes();
      time = config.getTime();
      avroSchema = new Schema.Parser().parse(configuration.get(AGG_PHASE_AVRO_SCHEMA.toString()));
      numRecords = 0;
      metricSums = new Number[numMetrics];
      Arrays.fill(metricSums, 0);
    }

    @Override
    public void reduce(BytesWritable aggregationKey, Iterable<BytesWritable> values, Context context)
        throws IOException, InterruptedException {

      // output record
      GenericRecord outputRecord = new Record(avroSchema);

      AggregationPhaseMapOutputKey keyWrapper =
          AggregationPhaseMapOutputKey.fromBytes(aggregationKey.getBytes());

      // time
      long timeValue = keyWrapper.getTime();
      outputRecord.put(time.getColumnName(), timeValue);

      // dimensions
      List<String> dimensionValues = keyWrapper.getDimensions();
      for (int i = 0; i < dimensionsNames.size(); i++) {
        String dimensionName = dimensionsNames.get(i);
        String dimensionValue = dimensionValues.get(i);
        outputRecord.put(dimensionName, dimensionValue);
      }

      // aggregate metrics
      Number[] aggMetricValues = new Number[numMetrics];
      Arrays.fill(aggMetricValues, 0);
      for (BytesWritable value : values) {
        Number[] metricValues =
            AggregationPhaseMapOutputValue.fromBytes(value.getBytes(), metricTypes).getMetricValues();
        ThirdeyeAggregateMetricUtils.aggregate(metricTypes, aggMetricValues, metricValues);
      }
      ThirdeyeAggregateMetricUtils.aggregate(metricTypes, metricSums, aggMetricValues);

      // metrics
      for (int i = 0; i < numMetrics; i++) {
        String metricName = metricNames.get(i);
        Number metricValue = aggMetricValues[i];
        outputRecord.put(metricName, metricValue);
      }

      numRecords++;
      AvroKey<GenericRecord> outputKey = new AvroKey<GenericRecord>(outputRecord);
      context.write(outputKey, NullWritable.get());
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      context.getCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED).increment(numRecords);
      for (int i = 0; i < numMetrics; i++) {
        context.getCounter(thirdeyeConfig.getCollection(), metricNames.get(i))
            .increment(metricSums[i].longValue());
      }
    }
  }

  public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);
    job.setJarByClass(AggregationPhaseJob.class);

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    // Properties
    LOGGER.info("Properties {}", props);

    // Input Path
    String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
      LOGGER.info("Adding input:" + inputPath);
      Path input = new Path(inputPath);
      FileInputFormat.addInputPath(job, input);
    }

    // Output path
    Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
      fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));
    job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

    // ThirdEyeConfig
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
    job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(),
        OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
    // Map config
    job.setMapperClass(AggregationMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // Reduce config
    job.setReducerClass(AggregationReducer.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, avroSchema);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.toString());
    if (StringUtils.isNotBlank(numReducers)) {
      job.setNumReduceTasks(Integer.valueOf(numReducers));
    }

    job.waitForCompletion(true);

    Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
    if (counter.getValue() == 0) {
      throw new IllegalStateException("No input records in " + inputPathDir);
    }
    counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());

    for (String metric : thirdeyeConfig.getMetricNames()) {
      counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric);
      LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
    }

    return job;
  }

  private String getAndSetConfiguration(Configuration configuration, AggregationPhaseConstants constant) {
    String value = getAndCheck(constant.toString());
    configuration.set(constant.toString(), value);
    return value;
  }

  private String getAndCheck(String propName) {
    String propValue = props.getProperty(propName);
    if (propValue == null) {
      throw new IllegalArgumentException(propName + " required property");
    }
    return propValue;
  }

  public static enum AggregationCounter {
    NUMBER_OF_RECORDS,
    NUMBER_OF_RECORDS_FLATTENED
  }

  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      throw new IllegalArgumentException("usage: config.properties");
    }

    Properties props = new Properties();
    props.load(new FileInputStream(args[0]));

    AggregationPhaseJob job = new AggregationPhaseJob("aggregate_avro_job", props);
    job.run();
  }
}
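For reference, here is a minimal sketch of driving the job programmatically instead of through main(). The driver class name, the paths, and the reducer count are illustrative assumptions, not part of the original code; ThirdEyeConfig.fromProperties(props) additionally requires the collection, dimension, metric, and time settings defined in ThirdEyeConfigProperties, which are not reproduced here.

import java.util.Properties;

import com.linkedin.thirdeye.hadoop.ThirdEyeJobProperties;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseConstants;
import com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob;

// Hypothetical driver, shown only to illustrate which properties AggregationPhaseJob.run() reads.
public class AggregationPhaseJobExample {

  public static void main(String[] args) throws Exception {
    Properties props = new Properties();

    // Required by run(): input directories (separated by ThirdEyeConstants.FIELD_SEPARATOR)
    // and the output directory. Paths below are illustrative.
    props.setProperty(AggregationPhaseConstants.AGG_PHASE_INPUT_PATH.toString(),
        "/thirdeye/input/avro");
    props.setProperty(AggregationPhaseConstants.AGG_PHASE_OUTPUT_PATH.toString(),
        "/thirdeye/aggregation/output");

    // Optional: reducer parallelism, read via ThirdEyeJobProperties.
    props.setProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.toString(), "10");

    // Add the ThirdEyeConfigProperties entries (collection, dimension names, metric names/types,
    // time column and granularities) here, exactly as in the config.properties file passed to main().

    AggregationPhaseJob job = new AggregationPhaseJob("aggregate_avro_job", props);
    job.run();
  }
}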