io.druid.indexing.common.task.MergeTaskBase.java Source code

Introduction

Here is the source code for io.druid.indexing.common.task.MergeTaskBase.java, an abstract Druid indexing task that merges a fixed set of segments from a single datasource into one merged segment.

Source

/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.indexing.common.task;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.google.common.hash.Hashing;
import com.metamx.common.ISE;
import com.metamx.emitter.EmittingLogger;
import com.metamx.emitter.service.ServiceEmitter;
import com.metamx.emitter.service.ServiceMetricEvent;
import io.druid.indexing.common.TaskLock;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.TaskToolbox;
import io.druid.indexing.common.actions.SegmentListUsedAction;
import io.druid.indexing.common.actions.TaskActionClient;
import io.druid.segment.IndexIO;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.NoneShardSpec;
import org.joda.time.DateTime;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.io.File;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Abstract base class for tasks that merge a fixed set of unsharded segments from a single
 * datasource into one segment covering their combined interval. Subclasses supply the
 * actual merge logic by implementing {@link #merge(Map, File)}.
 */
public abstract class MergeTaskBase extends AbstractFixedIntervalTask {
    @JsonIgnore
    private final List<DataSegment> segments;

    private static final EmittingLogger log = new EmittingLogger(MergeTaskBase.class);

    protected MergeTaskBase(final String id, final String dataSource, final List<DataSegment> segments,
            Map<String, Object> context) {
        super(
                // _not_ the version, just something uniqueish
                id != null ? id
                        : String.format("merge_%s_%s", computeProcessingID(dataSource, segments),
                                new DateTime().toString()),
                dataSource, computeMergedInterval(segments), context);

        // Verify segment list is nonempty
        Preconditions.checkArgument(segments.size() > 0, "segments nonempty");
        // Verify segments are all in the correct datasource
        Preconditions.checkArgument(Iterables.size(Iterables.filter(segments, new Predicate<DataSegment>() {
            @Override
            public boolean apply(@Nullable DataSegment segment) {
                return segment == null || !segment.getDataSource().equalsIgnoreCase(dataSource);
            }
        })) == 0, "segments in the wrong datasource");
        // Verify segments are all unsharded
        Preconditions.checkArgument(Iterables.size(Iterables.filter(segments, new Predicate<DataSegment>() {
            @Override
            public boolean apply(@Nullable DataSegment segment) {
                return segment == null || !(segment.getShardSpec() instanceof NoneShardSpec);
            }
        })) == 0, "segments without NoneShardSpec");

        this.segments = segments;
    }

    @Override
    public TaskStatus run(TaskToolbox toolbox) throws Exception {
        final TaskLock myLock = Iterables.getOnlyElement(getTaskLocks(toolbox));
        final ServiceEmitter emitter = toolbox.getEmitter();
        final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder();
        final DataSegment mergedSegment = computeMergedSegment(getDataSource(), myLock.getVersion(), segments);
        final File taskDir = toolbox.getTaskWorkDir();

        try {
            final long startTime = System.currentTimeMillis();

            log.info("Starting merge of id[%s], segments: %s", getId(),
                    Lists.transform(segments, new Function<DataSegment, String>() {
                        @Override
                        public String apply(DataSegment input) {
                            return input.getIdentifier();
                        }
                    }));

            // download segments to merge
            final Map<DataSegment, File> fetchedSegments = toolbox.fetchSegments(segments);

            // merge files together
            final File fileToUpload = merge(fetchedSegments, new File(taskDir, "merged"));

            emitter.emit(builder.build("merger/numMerged", segments.size()));
            emitter.emit(builder.build("merger/mergeTime", System.currentTimeMillis() - startTime));

            log.info("[%s] : Merged %d segments in %,d millis", mergedSegment.getDataSource(), segments.size(),
                    System.currentTimeMillis() - startTime);

            long uploadStart = System.currentTimeMillis();

            // Upload file
            final DataSegment uploadedSegment = toolbox.getSegmentPusher().push(fileToUpload, mergedSegment);

            emitter.emit(builder.build("merger/uploadTime", System.currentTimeMillis() - uploadStart));
            emitter.emit(builder.build("merger/mergeSize", uploadedSegment.getSize()));

            toolbox.pushSegments(ImmutableList.of(uploadedSegment));

            return TaskStatus.success(getId());
        } catch (Exception e) {
            log.makeAlert(e, "Exception merging[%s]", mergedSegment.getDataSource())
                    .addData("interval", mergedSegment.getInterval()).emit();

            return TaskStatus.failure(getId());
        }
    }

    /**
     * Checks pre-existing segments in "context" to confirm that this merge query is valid. Specifically, confirm that
     * we are operating on every segment that overlaps the chosen interval.
     */
    @Override
    public boolean isReady(TaskActionClient taskActionClient) throws Exception {
        // Try to acquire lock
        if (!super.isReady(taskActionClient)) {
            return false;
        } else {
            final Function<DataSegment, String> toIdentifier = new Function<DataSegment, String>() {
                @Override
                public String apply(DataSegment dataSegment) {
                    return dataSegment.getIdentifier();
                }
            };

            final Set<String> current = ImmutableSet.copyOf(Iterables.transform(
                    taskActionClient.submit(new SegmentListUsedAction(getDataSource(), getInterval())),
                    toIdentifier));
            final Set<String> requested = ImmutableSet.copyOf(Iterables.transform(segments, toIdentifier));

            final Set<String> missingFromRequested = Sets.difference(current, requested);
            if (!missingFromRequested.isEmpty()) {
                throw new ISE("Merge is invalid: current segment(s) are not in the requested set: %s",
                        Joiner.on(", ").join(missingFromRequested));
            }

            final Set<String> missingFromCurrent = Sets.difference(requested, current);
            if (!missingFromCurrent.isEmpty()) {
                throw new ISE("Merge is invalid: requested segment(s) are not in the current set: %s",
                        Joiner.on(", ").join(missingFromCurrent));
            }

            return true;
        }
    }

    /**
     * Merges the fetched segment files into a single segment under {@code outDir},
     * returning the directory that should be uploaded.
     */
    protected abstract File merge(Map<DataSegment, File> segments, File outDir) throws Exception;

    @JsonProperty
    public List<DataSegment> getSegments() {
        return segments;
    }

    @Override
    public String toString() {
        return Objects.toStringHelper(this).add("id", getId()).add("dataSource", getDataSource())
                .add("interval", getInterval()).add("segments", segments).toString();
    }

    /**
     * Computes a deterministic identifier for this merge: the datasource name plus a SHA-1
     * hash of the sorted segment descriptors (interval start/end, version, partition number).
     */
    private static String computeProcessingID(final String dataSource, final List<DataSegment> segments) {
        final String segmentIDs = Joiner.on("_").join(
                Iterables.transform(Ordering.natural().sortedCopy(segments), new Function<DataSegment, String>() {
                    @Override
                    public String apply(DataSegment x) {
                        return String.format("%s_%s_%s_%s", x.getInterval().getStart(), x.getInterval().getEnd(),
                                x.getVersion(), x.getShardSpec().getPartitionNum());
                    }
                }));

        return String.format("%s_%s", dataSource, Hashing.sha1().hashString(segmentIDs, Charsets.UTF_8).toString());
    }

    /**
     * Computes the smallest interval that covers every segment in the (nonempty) list.
     */
    private static Interval computeMergedInterval(final List<DataSegment> segments) {
        Preconditions.checkArgument(segments.size() > 0, "segments.size() > 0");

        DateTime start = null;
        DateTime end = null;

        for (final DataSegment segment : segments) {
            if (start == null || segment.getInterval().getStart().isBefore(start)) {
                start = segment.getInterval().getStart();
            }

            if (end == null || segment.getInterval().getEnd().isAfter(end)) {
                end = segment.getInterval().getEnd();
            }
        }

        return new Interval(start, end);
    }

    /**
     * Builds the descriptor for the output segment: the union of all input dimensions and
     * metrics (deduplicated case-insensitively) over the merged interval, versioned by the
     * task lock and unsharded via {@link NoneShardSpec}.
     */
    private static DataSegment computeMergedSegment(final String dataSource, final String version,
            final List<DataSegment> segments) {
        final Interval mergedInterval = computeMergedInterval(segments);
        final Set<String> mergedDimensions = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
        final Set<String> mergedMetrics = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);

        for (DataSegment segment : segments) {
            mergedDimensions.addAll(segment.getDimensions());
            mergedMetrics.addAll(segment.getMetrics());
        }

        return DataSegment.builder().dataSource(dataSource).interval(mergedInterval).version(version)
                .binaryVersion(IndexIO.CURRENT_VERSION_ID).shardSpec(new NoneShardSpec())
                .dimensions(Lists.newArrayList(mergedDimensions)).metrics(Lists.newArrayList(mergedMetrics))
                .build();
    }
}
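
Example

MergeTaskBase handles locking, validation, segment download, upload, and metrics; the one decision it leaves to subclasses is how the fetched segment files are combined, via the abstract merge(Map, File) hook. As a usage illustration, here is a minimal hypothetical subclass. The class name SimpleMergeTask and the "simple_merge" type string are invented for this sketch, and it assumes the static IndexIO.loadIndex(...) and IndexMerger.mergeQueryableIndex(...) helpers available in this era of the Druid codebase; a real task would also need Jackson @JsonCreator/@JsonProperty wiring for serialization.

package io.druid.indexing.common.task;

import com.google.common.collect.Lists;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.segment.IndexIO;
import io.druid.segment.IndexMerger;
import io.druid.segment.QueryableIndex;
import io.druid.timeline.DataSegment;

import java.io.File;
import java.util.List;
import java.util.Map;

// Hypothetical subclass for illustration only: loads each fetched segment as a
// QueryableIndex and delegates the actual merge to IndexMerger.
public class SimpleMergeTask extends MergeTaskBase {
    private final List<AggregatorFactory> aggregators;

    public SimpleMergeTask(String id, String dataSource, List<DataSegment> segments,
            List<AggregatorFactory> aggregators, Map<String, Object> context) {
        super(id, dataSource, segments, context);
        this.aggregators = aggregators;
    }

    @Override
    public String getType() {
        return "simple_merge"; // invented type string for this sketch
    }

    @Override
    protected File merge(Map<DataSegment, File> segments, File outDir) throws Exception {
        // Load each downloaded segment directory into a queryable index handle...
        final List<QueryableIndex> indexes = Lists.newArrayList();
        for (File segmentDir : segments.values()) {
            indexes.add(IndexIO.loadIndex(segmentDir));
        }
        // ...and merge them into a single segment under outDir; the returned
        // directory is what MergeTaskBase.run() pushes via the segment pusher.
        return IndexMerger.mergeQueryableIndex(
                indexes, aggregators.toArray(new AggregatorFactory[aggregators.size()]), outDir);
    }
}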