Java tutorial: reading Druid's DetermineHashedPartitionsJob, the Hadoop job that picks hashed shard specs from a HyperLogLog cardinality estimate.
/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.indexer;

import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Optional;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.io.Closeables;
import com.metamx.common.ISE;
import com.metamx.common.logger.Logger;
import io.druid.data.input.InputRow;
import io.druid.data.input.Rows;
import io.druid.granularity.QueryGranularity;
import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import io.druid.timeline.partition.NoneShardSpec;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.joda.time.DateTime;
import org.joda.time.DateTimeComparator;
import org.joda.time.Interval;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Determines appropriate ShardSpecs for a job by determining approximate cardinality of data set using HyperLogLog
 */
public class DetermineHashedPartitionsJob implements Jobby
{
  private static final Logger log = new Logger(DetermineHashedPartitionsJob.class);
  private final HadoopDruidIndexerConfig config;

  public DetermineHashedPartitionsJob(HadoopDruidIndexerConfig config)
  {
    this.config = config;
  }

  public boolean run()
  {
    try {
      /*
       * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
       * in the final segment.
       */
      long startTime = System.currentTimeMillis();
      final Job groupByJob = Job.getInstance(
          new Configuration(),
          String.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals())
      );

      JobHelper.injectSystemProperties(groupByJob);
      config.addJobProperties(groupByJob);
      groupByJob.setMapperClass(DetermineCardinalityMapper.class);
      groupByJob.setMapOutputKeyClass(LongWritable.class);
      groupByJob.setMapOutputValueClass(BytesWritable.class);
      groupByJob.setReducerClass(DetermineCardinalityReducer.class);
      groupByJob.setOutputKeyClass(NullWritable.class);
      groupByJob.setOutputValueClass(NullWritable.class);
      groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
      groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
      if (!config.getSegmentGranularIntervals().isPresent()) {
        groupByJob.setNumReduceTasks(1);
      } else {
        groupByJob.setNumReduceTasks(config.getSegmentGranularIntervals().get().size());
      }
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          groupByJob
      );

      config.addInputPaths(groupByJob);
      config.intoConfiguration(groupByJob);
      FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

      groupByJob.submit();
      log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());

      if (!groupByJob.waitForCompletion(true)) {
        log.error("Job failed: %s", groupByJob.getJobID());
        return false;
      }

      /*
       * Load partitions and intervals determined by the previous job.
       */
      log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
      FileSystem fileSystem = null;
      if (!config.getSegmentGranularIntervals().isPresent()) {
        final Path intervalInfoPath = config.makeIntervalInfoPath();
        fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
        if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
          throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
        }
        List<Interval> intervals = config.jsonMapper.readValue(
            Utils.openInputStream(groupByJob, intervalInfoPath),
            new TypeReference<List<Interval>>()
            {
            }
        );
        config.setGranularitySpec(
            new UniformGranularitySpec(
                config.getGranularitySpec().getSegmentGranularity(),
                config.getGranularitySpec().getQueryGranularity(),
                intervals
            )
        );
        log.info("Determined Intervals for Job [%s]", config.getSegmentGranularIntervals());
      }
      Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
      int shardCount = 0;
      for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
        DateTime bucket = segmentGranularity.getStart();

        final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
        if (fileSystem == null) {
          fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
        }
        if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
          final Long numRows = config.jsonMapper.readValue(
              Utils.openInputStream(groupByJob, partitionInfoPath),
              new TypeReference<Long>()
              {
              }
          );

          log.info("Found approximately [%,d] rows in data.", numRows);

          final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());

          log.info("Creating [%,d] shards", numberOfShards);

          List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
          if (numberOfShards == 1) {
            actualSpecs.add(new HadoopyShardSpec(new NoneShardSpec(), shardCount++));
          } else {
            for (int i = 0; i < numberOfShards; ++i) {
              actualSpecs.add(
                  new HadoopyShardSpec(
                      new HashBasedNumberedShardSpec(i, numberOfShards, HadoopDruidIndexerConfig.jsonMapper),
                      shardCount++
                  )
              );
              log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
            }
          }

          shardSpecs.put(bucket, actualSpecs);
        } else {
          log.info("Path[%s] didn't exist!?", partitionInfoPath);
        }
      }
      config.setShardSpecs(shardSpecs);
      log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));

      return true;
    }
    catch (Exception e) {
      throw Throwables.propagate(e);
    }
  }

  /**
   * Mapper: hashes each row's (query-granularity-truncated timestamp, dimensions) group key and
   * adds it to a HyperLogLog collector kept per segment interval.
   */
  public static class DetermineCardinalityMapper extends HadoopDruidIndexerMapper<LongWritable, BytesWritable>
  {
    private static HashFunction hashFunction = Hashing.murmur3_128();
    private QueryGranularity rollupGranularity = null;
    private Map<Interval, HyperLogLogCollector> hyperLogLogs;
    private HadoopDruidIndexerConfig config;
    private boolean determineIntervals;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException
    {
      super.setup(context);
      rollupGranularity = getConfig().getGranularitySpec().getQueryGranularity();
      config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
      Optional<Set<Interval>> intervals = config.getSegmentGranularIntervals();
      if (intervals.isPresent()) {
        determineIntervals = false;
        final ImmutableMap.Builder<Interval, HyperLogLogCollector> builder = ImmutableMap.builder();
        for (final Interval bucketInterval : intervals.get()) {
          builder.put(bucketInterval, HyperLogLogCollector.makeLatestCollector());
        }
        hyperLogLogs = builder.build();
      } else {
        determineIntervals = true;
        hyperLogLogs = Maps.newHashMap();
      }
    }

    @Override
    protected void innerMap(InputRow inputRow, Object value, Context context)
        throws IOException, InterruptedException
    {
      final List<Object> groupKey = Rows.toGroupKey(
          rollupGranularity.truncate(inputRow.getTimestampFromEpoch()),
          inputRow
      );
      Interval interval;
      if (determineIntervals) {
        interval = config.getGranularitySpec()
                         .getSegmentGranularity()
                         .bucket(new DateTime(inputRow.getTimestampFromEpoch()));

        if (!hyperLogLogs.containsKey(interval)) {
          hyperLogLogs.put(interval, HyperLogLogCollector.makeLatestCollector());
        }
      } else {
        final Optional<Interval> maybeInterval = config.getGranularitySpec()
                                                       .bucketInterval(new DateTime(inputRow.getTimestampFromEpoch()));

        if (!maybeInterval.isPresent()) {
          throw new ISE("WTF?! No bucket found for timestamp: %s", inputRow.getTimestampFromEpoch());
        }
        interval = maybeInterval.get();
      }
      hyperLogLogs.get(interval)
                  .add(hashFunction.hashBytes(HadoopDruidIndexerConfig.jsonMapper.writeValueAsBytes(groupKey)).asBytes());
    }

    @Override
    public void run(Context context) throws IOException, InterruptedException
    {
      setup(context);

      while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }

      // Emit one record per interval once all rows have been mapped:
      // key = interval start millis, value = the collector's serialized bytes.
      for (Map.Entry<Interval, HyperLogLogCollector> entry : hyperLogLogs.entrySet()) {
        context.write(
            new LongWritable(entry.getKey().getStartMillis()),
            new BytesWritable(entry.getValue().toByteArray())
        );
      }
      cleanup(context);
    }
  }

  /**
   * Reducer: folds together the HyperLogLog collectors for one interval and writes the estimated
   * row count (and, when intervals were not known up front, the list of intervals) to the working path.
   */
  public static class DetermineCardinalityReducer
      extends Reducer<LongWritable, BytesWritable, NullWritable, NullWritable>
  {
    private final List<Interval> intervals = Lists.newArrayList();
    protected HadoopDruidIndexerConfig config = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException
    {
      config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    }

    @Override
    protected void reduce(LongWritable key, Iterable<BytesWritable> values, Context context)
        throws IOException, InterruptedException
    {
      HyperLogLogCollector aggregate = HyperLogLogCollector.makeLatestCollector();
      for (BytesWritable value : values) {
        aggregate.fold(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()));
      }
      Interval interval = config.getGranularitySpec().getSegmentGranularity().bucket(new DateTime(key.get()));
      intervals.add(interval);
      final Path outPath = config.makeSegmentPartitionInfoPath(interval);
      final OutputStream out = Utils.makePathAndOutputStream(context, outPath, config.isOverwriteFiles());

      try {
        HadoopDruidIndexerConfig.jsonMapper.writerWithType(
            new TypeReference<Long>()
            {
            }
        ).writeValue(out, new Double(aggregate.estimateCardinality()).longValue());
      }
      finally {
        Closeables.close(out, false);
      }
    }

    @Override
    public void run(Context context) throws IOException, InterruptedException
    {
      super.run(context);
      if (!config.getSegmentGranularIntervals().isPresent()) {
        final Path outPath = config.makeIntervalInfoPath();
        final OutputStream out = Utils.makePathAndOutputStream(context, outPath, config.isOverwriteFiles());
        try {
          HadoopDruidIndexerConfig.jsonMapper.writerWithType(
              new TypeReference<List<Interval>>()
              {
              }
          ).writeValue(out, intervals);
        }
        finally {
          Closeables.close(out, false);
        }
      }
    }
  }

  /**
   * Partitioner: when the segment intervals are known in advance, routes each interval's collector
   * bytes to a dedicated reducer; otherwise (or when running with the local job tracker) uses reducer 0.
   */
  public static class DetermineHashedPartitionsPartitioner
      extends Partitioner<LongWritable, BytesWritable> implements Configurable
  {
    private Configuration config;
    private boolean determineIntervals;
    private Map<LongWritable, Integer> reducerLookup;

    @Override
    public int getPartition(LongWritable interval, BytesWritable text, int numPartitions)
    {
      if (config.get("mapred.job.tracker").equals("local") || determineIntervals) {
        return 0;
      } else {
        return reducerLookup.get(interval);
      }
    }

    @Override
    public Configuration getConf()
    {
      return config;
    }

    @Override
    public void setConf(Configuration config)
    {
      this.config = config;
      HadoopDruidIndexerConfig hadoopConfig = HadoopDruidIndexerConfig.fromConfiguration(config);
      if (hadoopConfig.getSegmentGranularIntervals().isPresent()) {
        determineIntervals = false;
        int reducerNumber = 0;
        ImmutableMap.Builder<LongWritable, Integer> builder = ImmutableMap.builder();
        for (Interval interval : hadoopConfig.getSegmentGranularIntervals().get()) {
          builder.put(new LongWritable(interval.getStartMillis()), reducerNumber++);
        }
        reducerLookup = builder.build();
      } else {
        determineIntervals = true;
      }
    }
  }
}
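The heart of the mapper is the line that hashes the JSON-encoded (truncated timestamp, dimensions) group key into a HyperLogLogCollector. To see why that approximates the post-rollup row count rather than the raw row count, here is a toy, self-contained sketch of my own (the class name and the literal group-key strings are invented; only the collector and Guava hashing calls come from the listing): adding the same group key twice does not raise the estimate, while a new key does.

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector;

import java.nio.charset.StandardCharsets;

// Illustrative sketch only; not part of the Druid source above.
public class GroupKeyCardinalitySketch
{
  public static void main(String[] args)
  {
    final HashFunction hashFunction = Hashing.murmur3_128();
    final HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();

    // Two raw rows that collapse to the same (hour, dimensions) group key...
    collector.add(hashFunction.hashBytes("2015-01-01T10|page=home|country=US".getBytes(StandardCharsets.UTF_8)).asBytes());
    collector.add(hashFunction.hashBytes("2015-01-01T10|page=home|country=US".getBytes(StandardCharsets.UTF_8)).asBytes());
    // ...and one that does not.
    collector.add(hashFunction.hashBytes("2015-01-01T10|page=home|country=DE".getBytes(StandardCharsets.UTF_8)).asBytes());

    // Prints an estimate of roughly 2: the number of distinct group keys, not the number of raw rows.
    System.out.println(collector.estimateCardinality());
  }
}

That estimated count is what run() later divides by getTargetPartitionSize() and rounds up; for example, roughly 5,000,000 estimated rows with a 1,000,000-row target yields five hashed shards, while anything at or below the target collapses to a single NoneShardSpec shard.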
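For orientation, here is a minimal driver sketch showing how the class would be invoked. It is not part of the file above: the class and method names are mine, and it assumes a HadoopDruidIndexerConfig has already been built from an ingestion spec by some means not shown in this listing.

package io.druid.indexer;

import com.metamx.common.ISE;

// Hypothetical driver, for illustration only.
public class DeterminePartitionsDriver
{
  public static void determinePartitions(HadoopDruidIndexerConfig config)
  {
    // The job groups rows by (truncated timestamp, dimensions), estimates per-interval
    // cardinality with HyperLogLog, and writes the chosen shard specs back into `config`
    // via config.setShardSpecs(...), as seen at the end of run() above.
    final Jobby job = new DetermineHashedPartitionsJob(config);
    if (!job.run()) {
      throw new ISE("determine_partitions_hashed job failed");
    }
  }
}

Because the result is stored back into the shared config, a later indexing pass can presumably read one HadoopyShardSpec list per segment interval from that same object; that handoff is outside the scope of this listing.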