/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexer.path;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.druid.indexer.hadoop.DatasourceIngestionSpec;
import org.apache.druid.indexer.hadoop.DatasourceInputFormat;
import org.apache.druid.indexer.hadoop.WindowedDataSegment;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class DatasourcePathSpec implements PathSpec
{
  private static final Logger logger = new Logger(DatasourcePathSpec.class);

  public static final String TYPE = "dataSource";

  private final ObjectMapper mapper;
  private final DatasourceIngestionSpec ingestionSpec;
  private final long maxSplitSize;
  private final List<WindowedDataSegment> segments;

  /*
   * Note: users set this flag when doing pure re-indexing and wanting a different set of
   * aggregators than the ones used during the original indexing. The default behavior expects
   * the same aggregators as the original data ingestion job, in order to support the
   * delta-ingestion use case.
   */
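  // Illustrative sketch only: assuming the usual Hadoop ioConfig layout where this path spec is
  // supplied as the job's "inputSpec", a pure re-indexing job might opt into new aggregators
  // roughly as below. The field values are hypothetical example data; the property names follow
  // the @JsonProperty annotations in this class.
  //
  //   "inputSpec": {
  //     "type": "dataSource",
  //     "ingestionSpec": {
  //       "dataSource": "example_datasource",
  //       "intervals": ["2020-01-01/2020-02-01"]
  //     },
  //     "maxSplitSize": 0,
  //     "useNewAggs": true
  //   }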
  private final boolean useNewAggs;
  private static final String USE_NEW_AGGS_KEY = "useNewAggs";

  @JsonCreator
  public DatasourcePathSpec(
      @JacksonInject ObjectMapper mapper,
      @JsonProperty("segments") List<WindowedDataSegment> segments,
      @JsonProperty("ingestionSpec") DatasourceIngestionSpec spec,
      @JsonProperty("maxSplitSize") Long maxSplitSize,
      @JsonProperty(USE_NEW_AGGS_KEY) boolean useNewAggs
  )
  {
    this.mapper = Preconditions.checkNotNull(mapper, "null mapper");
    this.segments = segments;
    this.ingestionSpec = Preconditions.checkNotNull(spec, "null ingestionSpec");

    if (maxSplitSize == null) {
      this.maxSplitSize = 0;
    } else {
      this.maxSplitSize = maxSplitSize.longValue();
    }

    this.useNewAggs = useNewAggs;
  }

  @JsonProperty
  public boolean isUseNewAggs()
  {
    return useNewAggs;
  }

  @JsonProperty
  public List<WindowedDataSegment> getSegments()
  {
    return segments;
  }

  @JsonProperty
  public DatasourceIngestionSpec getIngestionSpec()
  {
    return ingestionSpec;
  }

  @JsonProperty
  public long getMaxSplitSize()
  {
    return maxSplitSize;
  }

  @Override
  public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException
  {
    if (segments == null || segments.isEmpty()) {
      if (ingestionSpec.isIgnoreWhenNoSegments()) {
        logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
        return job;
      } else {
        throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
      }
    }

    logger.info(
        "Found total [%d] segments for [%s] in interval [%s]",
        segments.size(),
        ingestionSpec.getDataSource(),
        ingestionSpec.getIntervals()
    );

    DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;

    // Derive dimensions when they are not explicitly provided: use the parser's custom
    // dimensions if any, otherwise the union of dimensions across all input segments minus
    // the configured exclusions.
    if (updatedIngestionSpec.getDimensions() == null) {
      List<String> dims;
      if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
        dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
      } else {
        Set<String> dimSet = Sets.newHashSet(
            Iterables.concat(
                Iterables.transform(
                    segments,
                    new Function<WindowedDataSegment, Iterable<String>>()
                    {
                      @Override
                      public Iterable<String> apply(WindowedDataSegment dataSegment)
                      {
                        return dataSegment.getSegment().getDimensions();
                      }
                    }
                )
            )
        );

        dims = Lists.newArrayList(
            Sets.difference(
                dimSet,
                config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()
            )
        );
      }
      updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
    }

    // Derive metrics when they are not explicitly provided: with useNewAggs, read the
    // aggregators' required input fields (re-indexing with new aggregators); otherwise read
    // the existing aggregator output names (delta-ingestion).
    if (updatedIngestionSpec.getMetrics() == null) {
      Set<String> metrics = Sets.newHashSet();
      final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
      if (cols != null) {
        if (useNewAggs) {
          for (AggregatorFactory col : cols) {
            metrics.addAll(col.requiredFields());
          }
        } else {
          for (AggregatorFactory col : cols) {
            metrics.add(col.getName());
          }
        }
      }
      updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
    }

    updatedIngestionSpec =
        updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

    // propagate in the transformSpec from the overall job config
    updatedIngestionSpec =
        updatedIngestionSpec.withTransformSpec(config.getSchema().getDataSchema().getTransformSpec());

    DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
    MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
    return job;
  }

  // Returns true when the configured path spec is a "dataSource" spec with useNewAggs enabled.
  public static boolean checkIfReindexingAndIsUseAggEnabled(Map<String, Object> configuredPathSpec)
  {
    return TYPE.equals(configuredPathSpec.get("type")) &&
           Boolean.parseBoolean(configuredPathSpec.getOrDefault(USE_NEW_AGGS_KEY, false).toString());
  }
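  // Illustrative sketch only: a caller holding the raw pathSpec configuration as a Map (for
  // example, after parsing the job spec) could detect the "re-index with new aggregators" case
  // with the static helper above. The map below is hypothetical example data.
  //
  //   Map<String, Object> pathSpecMap =
  //       ImmutableMap.of("type", "dataSource", "useNewAggs", true);
  //   boolean reindexingWithNewAggs =
  //       DatasourcePathSpec.checkIfReindexingAndIsUseAggEnabled(pathSpecMap);  // true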
  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    DatasourcePathSpec that = (DatasourcePathSpec) o;

    if (maxSplitSize != that.maxSplitSize) {
      return false;
    }
    if (!ingestionSpec.equals(that.ingestionSpec)) {
      return false;
    }
    return !(segments != null ? !segments.equals(that.segments) : that.segments != null);
  }

  @Override
  public int hashCode()
  {
    int result = ingestionSpec.hashCode();
    result = 31 * result + (int) (maxSplitSize ^ (maxSplitSize >>> 32));
    result = 31 * result + (segments != null ? segments.hashCode() : 0);
    return result;
  }
}