io.druid.indexing.common.task.IndexTask.java Source code

Introduction

Here is the source code for io.druid.indexing.common.task.IndexTask.java, Druid's batch indexing task. The task scans a firehose to find the intervals that actually contain data, determines shard specs for each interval (using HyperLogLog to estimate row cardinality when sizing partitions), then indexes each shard through a plumber and publishes the resulting segments.

Source

/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.indexing.common.task;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.metamx.common.ISE;
import com.metamx.common.guava.Comparators;
import com.metamx.common.logger.Logger;
import io.druid.data.input.Committer;
import io.druid.data.input.Firehose;
import io.druid.data.input.FirehoseFactory;
import io.druid.data.input.InputRow;
import io.druid.data.input.Rows;
import io.druid.granularity.QueryGranularity;
import io.druid.indexing.common.TaskLock;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.TaskToolbox;
import io.druid.indexing.common.index.YeOldePlumberSchool;
import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector;
import io.druid.segment.IndexSpec;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.IOConfig;
import io.druid.segment.indexing.IngestionSpec;
import io.druid.segment.indexing.RealtimeTuningConfig;
import io.druid.segment.indexing.TuningConfig;
import io.druid.segment.indexing.granularity.GranularitySpec;
import io.druid.segment.loading.DataSegmentPusher;
import io.druid.segment.realtime.FireDepartmentMetrics;
import io.druid.segment.realtime.plumber.Committers;
import io.druid.segment.realtime.plumber.Plumber;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import io.druid.timeline.partition.NoneShardSpec;
import io.druid.timeline.partition.ShardSpec;
import org.joda.time.DateTime;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.concurrent.CopyOnWriteArrayList;

public class IndexTask extends AbstractFixedIntervalTask {
    private static final Logger log = new Logger(IndexTask.class);

    private static final HashFunction hashFunction = Hashing.murmur3_128();

    /**
     * Should we index this inputRow? The decision is based on the given interval and shardSpec.
     *
     * @param shardSpec  the shard spec the row must belong to
     * @param interval   the interval the row's timestamp must fall within
     * @param inputRow   the row to check
     * @param rollupGran the granularity used to truncate the timestamp before the shard check
     *
     * @return true if the row should be indexed by this task, false otherwise
     */
    private static boolean shouldIndex(final ShardSpec shardSpec, final Interval interval, final InputRow inputRow,
            final QueryGranularity rollupGran) {
        return interval.contains(inputRow.getTimestampFromEpoch())
                && shardSpec.isInChunk(rollupGran.truncate(inputRow.getTimestampFromEpoch()), inputRow);
    }

    private static String makeId(String id, IndexIngestionSpec ingestionSchema) {
        if (id == null) {
            return String.format("index_%s_%s", makeDataSource(ingestionSchema), new DateTime().toString());
        }

        return id;
    }

    private static String makeDataSource(IndexIngestionSpec ingestionSchema) {
        return ingestionSchema.getDataSchema().getDataSource();
    }

    private static Interval makeInterval(IndexIngestionSpec ingestionSchema) {
        GranularitySpec spec = ingestionSchema.getDataSchema().getGranularitySpec();

        return new Interval(spec.bucketIntervals().get().first().getStart(),
                spec.bucketIntervals().get().last().getEnd());
    }

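    /**
     * Builds the RealtimeTuningConfig handed to the plumber. Only the row flush
     * boundary, shard spec, and index spec are set explicitly; the remaining
     * parameters are passed as null so RealtimeTuningConfig's defaults apply.
     */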
    static RealtimeTuningConfig convertTuningConfig(ShardSpec shardSpec, int rowFlushBoundary,
            IndexSpec indexSpec) {
        return new RealtimeTuningConfig(rowFlushBoundary, null, null, null, null, null, null, shardSpec, indexSpec,
                null, null, null, null);
    }

    @JsonIgnore
    private final IndexIngestionSpec ingestionSchema;

    private final ObjectMapper jsonMapper;

    @JsonCreator
    public IndexTask(@JsonProperty("id") String id, @JsonProperty("resource") TaskResource taskResource,
            @JsonProperty("spec") IndexIngestionSpec ingestionSchema, @JacksonInject ObjectMapper jsonMapper,
            @JsonProperty("context") Map<String, Object> context) {
        super(
                // _not_ the version, just something uniqueish
                makeId(id, ingestionSchema), taskResource, makeDataSource(ingestionSchema),
                makeInterval(ingestionSchema), context);

        this.ingestionSchema = ingestionSchema;
        this.jsonMapper = jsonMapper;
    }

    @Override
    public String getType() {
        return "index";
    }

    @JsonProperty("spec")
    public IndexIngestionSpec getIngestionSchema() {
        return ingestionSchema;
    }

    @Override
    public TaskStatus run(TaskToolbox toolbox) throws Exception {
        final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
        final int targetPartitionSize = ingestionSchema.getTuningConfig().getTargetPartitionSize();

        final TaskLock myLock = Iterables.getOnlyElement(getTaskLocks(toolbox));
        final Set<DataSegment> segments = Sets.newHashSet();

        final Set<Interval> validIntervals = Sets.intersection(granularitySpec.bucketIntervals().get(),
                getDataIntervals());
        if (validIntervals.isEmpty()) {
            throw new ISE("No valid data intervals found. Check your configs!");
        }

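        // For each bucket, pick a partitioning strategy: size-based hash partitioning
        // when targetPartitionSize is set, a fixed number of hash shards when
        // numShards is set, otherwise a single unpartitioned NoneShardSpec.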
        for (final Interval bucket : validIntervals) {
            final List<ShardSpec> shardSpecs;
            if (targetPartitionSize > 0) {
                shardSpecs = determinePartitions(bucket, targetPartitionSize,
                        granularitySpec.getQueryGranularity());
            } else {
                int numShards = ingestionSchema.getTuningConfig().getNumShards();
                if (numShards > 0) {
                    shardSpecs = Lists.newArrayList();
                    for (int i = 0; i < numShards; i++) {
                        shardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, jsonMapper));
                    }
                } else {
                    shardSpecs = ImmutableList.<ShardSpec>of(new NoneShardSpec());
                }
            }
            for (final ShardSpec shardSpec : shardSpecs) {
                final DataSegment segment = generateSegment(toolbox, ingestionSchema.getDataSchema(), shardSpec,
                        bucket, myLock.getVersion());
                segments.add(segment);
            }
        }
        toolbox.pushSegments(segments);
        return TaskStatus.success(getId());
    }

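    /**
     * Scans the firehose once and returns the set of granularity buckets that
     * contain at least one event, ordered by interval start then end. Events that
     * fall outside every bucket are counted and logged.
     */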
    private SortedSet<Interval> getDataIntervals() throws IOException {
        final FirehoseFactory firehoseFactory = ingestionSchema.getIOConfig().getFirehoseFactory();
        final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();

        SortedSet<Interval> retVal = Sets.newTreeSet(Comparators.intervalsByStartThenEnd());
        int unparsed = 0;
        try (Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
            while (firehose.hasMore()) {
                final InputRow inputRow = firehose.nextRow();
                DateTime dt = new DateTime(inputRow.getTimestampFromEpoch());
                Optional<Interval> interval = granularitySpec.bucketInterval(dt);
                if (interval.isPresent()) {
                    retVal.add(interval.get());
                } else {
                    unparsed++;
                }
            }
        }
        if (unparsed > 0) {
            log.warn("Unable to to find a matching interval for [%,d] events", unparsed);
        }

        return retVal;
    }

    private List<ShardSpec> determinePartitions(final Interval interval, final int targetPartitionSize,
            final QueryGranularity queryGranularity) throws IOException {
        log.info("Determining partitions for interval[%s] with targetPartitionSize[%d]", interval,
                targetPartitionSize);

        final FirehoseFactory firehoseFactory = ingestionSchema.getIOConfig().getFirehoseFactory();

        // The implementation of this partition determination is less than optimal and should be done better.
        // Use HLL to estimate the number of distinct rows after rollup.
        HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();

        // Load data
        try (Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
            while (firehose.hasMore()) {
                final InputRow inputRow = firehose.nextRow();
                if (interval.contains(inputRow.getTimestampFromEpoch())) {
                    final List<Object> groupKey = Rows
                            .toGroupKey(queryGranularity.truncate(inputRow.getTimestampFromEpoch()), inputRow);
                    collector.add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
                }
            }
        }

        final double numRows = collector.estimateCardinality();
        log.info("Estimated approximately [%,f] rows of data.", numRows);

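        // Worked example: with the default targetPartitionSize of 5,000,000, an
        // estimated 12,000,000 distinct rows gives ceil(12,000,000 / 5,000,000) = 3
        // shards; the clamp below keeps the count from exceeding the row estimate.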
        int numberOfShards = (int) Math.ceil(numRows / targetPartitionSize);
        if ((double) numberOfShards > numRows) {
            numberOfShards = (int) numRows;
        }
        log.info("Will require [%,d] shard(s).", numberOfShards);

        // ShardSpecs we will return
        final List<ShardSpec> shardSpecs = Lists.newArrayList();

        if (numberOfShards == 1) {
            shardSpecs.add(new NoneShardSpec());
        } else {
            for (int i = 0; i < numberOfShards; ++i) {
                shardSpecs.add(new HashBasedNumberedShardSpec(i, numberOfShards, jsonMapper));
            }
        }

        return shardSpecs;
    }

    private DataSegment generateSegment(final TaskToolbox toolbox, final DataSchema schema,
            final ShardSpec shardSpec, final Interval interval, final String version) throws IOException {
        // Set up temporary directory.
        final File tmpDir = new File(toolbox.getTaskWorkDir(), String.format("%s_%s_%s_%s_%s", this.getDataSource(),
                interval.getStart(), interval.getEnd(), version, shardSpec.getPartitionNum()));

        final FirehoseFactory firehoseFactory = ingestionSchema.getIOConfig().getFirehoseFactory();
        final int rowFlushBoundary = ingestionSchema.getTuningConfig().getRowFlushBoundary();

        // We need to track published segments.
        final List<DataSegment> pushedSegments = new CopyOnWriteArrayList<DataSegment>();
        final DataSegmentPusher wrappedDataSegmentPusher = new DataSegmentPusher() {
            @Override
            public String getPathForHadoop(String dataSource) {
                return toolbox.getSegmentPusher().getPathForHadoop(dataSource);
            }

            @Override
            public DataSegment push(File file, DataSegment segment) throws IOException {
                final DataSegment pushedSegment = toolbox.getSegmentPusher().push(file, segment);
                pushedSegments.add(pushedSegment);
                return pushedSegment;
            }
        };

        // rowFlushBoundary for this job
        final int myRowFlushBoundary = rowFlushBoundary > 0 ? rowFlushBoundary
                : toolbox.getConfig().getDefaultRowFlushBoundary();

        // Create firehose + plumber
        final FireDepartmentMetrics metrics = new FireDepartmentMetrics();
        final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser());
        final Supplier<Committer> committerSupplier = Committers.supplierFromFirehose(firehose);
        final Plumber plumber = new YeOldePlumberSchool(interval, version, wrappedDataSegmentPusher, tmpDir)
                .findPlumber(schema, convertTuningConfig(shardSpec, myRowFlushBoundary,
                        ingestionSchema.getTuningConfig().getIndexSpec()), metrics);

        final QueryGranularity rollupGran = ingestionSchema.getDataSchema().getGranularitySpec()
                .getQueryGranularity();
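        // One-pass indexing lifecycle: start the job, add the rows that belong to
        // this interval and shard, then persist and finish below.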
        try {
            plumber.startJob();

            while (firehose.hasMore()) {
                final InputRow inputRow = firehose.nextRow();

                if (shouldIndex(shardSpec, interval, inputRow, rollupGran)) {
                    int numRows = plumber.add(inputRow, committerSupplier);
                    if (numRows == -1) {
                        throw new ISE("Was expecting non-null sink for timestamp[%s]",
                                new DateTime(inputRow.getTimestampFromEpoch()));
                    }
                    metrics.incrementProcessed();
                } else {
                    metrics.incrementThrownAway();
                }
            }
        } finally {
            firehose.close();
        }

        plumber.persist(committerSupplier.get());

        try {
            plumber.finishJob();
        } finally {
            log.info(
                    "Task[%s] interval[%s] partition[%d] took in %,d rows (%,d processed, %,d unparseable, %,d thrown away)"
                            + " and output %,d rows",
                    getId(), interval, shardSpec.getPartitionNum(),
                    metrics.processed() + metrics.unparseable() + metrics.thrownAway(), metrics.processed(),
                    metrics.unparseable(), metrics.thrownAway(), metrics.rowOutput());
        }

        // We expect a single segment to have been created.
        return Iterables.getOnlyElement(pushedSegments);
    }

    public static class IndexIngestionSpec extends IngestionSpec<IndexIOConfig, IndexTuningConfig> {
        private final DataSchema dataSchema;
        private final IndexIOConfig ioConfig;
        private final IndexTuningConfig tuningConfig;

        @JsonCreator
        public IndexIngestionSpec(@JsonProperty("dataSchema") DataSchema dataSchema,
                @JsonProperty("ioConfig") IndexIOConfig ioConfig,
                @JsonProperty("tuningConfig") IndexTuningConfig tuningConfig) {
            super(dataSchema, ioConfig, tuningConfig);

            this.dataSchema = dataSchema;
            this.ioConfig = ioConfig;
            this.tuningConfig = tuningConfig == null ? new IndexTuningConfig(0, 0, null, null) : tuningConfig;
        }

        @Override
        @JsonProperty("dataSchema")
        public DataSchema getDataSchema() {
            return dataSchema;
        }

        @Override
        @JsonProperty("ioConfig")
        public IndexIOConfig getIOConfig() {
            return ioConfig;
        }

        @Override
        @JsonProperty("tuningConfig")
        public IndexTuningConfig getTuningConfig() {
            return tuningConfig;
        }
    }

    @JsonTypeName("index")
    public static class IndexIOConfig implements IOConfig {
        private final FirehoseFactory firehoseFactory;

        @JsonCreator
        public IndexIOConfig(@JsonProperty("firehose") FirehoseFactory firehoseFactory) {
            this.firehoseFactory = firehoseFactory;
        }

        @JsonProperty("firehose")
        public FirehoseFactory getFirehoseFactory() {
            return firehoseFactory;
        }
    }

    @JsonTypeName("index")
    public static class IndexTuningConfig implements TuningConfig {
        private static final int DEFAULT_TARGET_PARTITION_SIZE = 5000000;
        private static final int DEFAULT_ROW_FLUSH_BOUNDARY = 500000;
        private static final IndexSpec DEFAULT_INDEX_SPEC = new IndexSpec();

        private final int targetPartitionSize;
        private final int rowFlushBoundary;
        private final int numShards;
        private final IndexSpec indexSpec;

        @JsonCreator
        public IndexTuningConfig(@JsonProperty("targetPartitionSize") int targetPartitionSize,
                @JsonProperty("rowFlushBoundary") int rowFlushBoundary,
                @JsonProperty("numShards") @Nullable Integer numShards,
                @JsonProperty("indexSpec") @Nullable IndexSpec indexSpec) {
            this.targetPartitionSize = targetPartitionSize == 0 ? DEFAULT_TARGET_PARTITION_SIZE
                    : targetPartitionSize;
            Preconditions.checkArgument(rowFlushBoundary >= 0, "rowFlushBoundary must be non-negative");
            this.rowFlushBoundary = rowFlushBoundary == 0 ? DEFAULT_ROW_FLUSH_BOUNDARY : rowFlushBoundary;
            this.numShards = numShards == null ? -1 : numShards;
            this.indexSpec = indexSpec == null ? DEFAULT_INDEX_SPEC : indexSpec;
            Preconditions.checkArgument(this.targetPartitionSize == -1 || this.numShards == -1,
                    "targetPartitionSize and numShards cannot both be set");
        }

        @JsonProperty
        public int getTargetPartitionSize() {
            return targetPartitionSize;
        }

        @JsonProperty
        public int getRowFlushBoundary() {
            return rowFlushBoundary;
        }

        @JsonProperty
        public int getNumShards() {
            return numShards;
        }

        @JsonProperty
        public IndexSpec getIndexSpec() {
            return indexSpec;
        }
    }
}
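
Usage

A note on the tuning options above: run() only consults numShards when targetPartitionSize is not positive, and the constructor precondition rejects specs that set both. The following is a minimal sketch, not part of the original source, showing how IndexTuningConfig resolves its arguments; the class name and values are illustrative.

import io.druid.indexing.common.task.IndexTask;

public class IndexTuningConfigExample {
    public static void main(String[] args) {
        // Zeros resolve to the defaults declared above:
        // targetPartitionSize = 5,000,000 and rowFlushBoundary = 500,000.
        IndexTask.IndexTuningConfig defaults = new IndexTask.IndexTuningConfig(0, 0, null, null);
        System.out.println(defaults.getTargetPartitionSize()); // 5000000

        // To partition by an explicit shard count, pass targetPartitionSize as -1;
        // passing 0 would resolve to the default and trip the precondition.
        IndexTask.IndexTuningConfig explicitShards = new IndexTask.IndexTuningConfig(-1, 0, 4, null);
        System.out.println(explicitShards.getNumShards()); // 4

        // Setting both throws IllegalArgumentException:
        // new IndexTask.IndexTuningConfig(5000000, 0, 4, null);
    }
}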