org.apache.druid.indexer.HadoopDruidIndexerConfig.java Source code

Introduction

Here is the source code for org.apache.druid.indexer.HadoopDruidIndexerConfig.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexer;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.inject.Binder;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.Module;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.impl.InputRowParser;
import org.apache.druid.guice.GuiceInjectors;
import org.apache.druid.guice.JsonConfigProvider;
import org.apache.druid.guice.annotations.Self;
import org.apache.druid.indexer.partitions.DimensionBasedPartitionsSpec;
import org.apache.druid.indexer.path.PathSpec;
import org.apache.druid.initialization.Initialization;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.FunctionalIterable;
import org.apache.druid.java.util.common.jackson.JacksonUtils;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.IndexMerger;
import org.apache.druid.segment.IndexMergerV9;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.apache.druid.segment.loading.DataSegmentPusher;
import org.apache.druid.server.DruidNode;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.ShardSpec;
import org.apache.druid.timeline.partition.ShardSpecLookup;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.format.ISODateTimeFormat;

import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;

/**
 */
public class HadoopDruidIndexerConfig {
    private static final Injector injector;

    public static final String CONFIG_PROPERTY = "druid.indexer.config";
    public static final Charset JAVA_NATIVE_CHARSET = Charset.forName("Unicode");
    public static final Splitter TAB_SPLITTER = Splitter.on("\t");
    public static final Joiner TAB_JOINER = Joiner.on("\t");
    public static final ObjectMapper JSON_MAPPER;
    public static final IndexIO INDEX_IO;
    public static final IndexMerger INDEX_MERGER_V9;
    public static final HadoopKerberosConfig HADOOP_KERBEROS_CONFIG;
    public static final DataSegmentPusher DATA_SEGMENT_PUSHER;
    private static final String DEFAULT_WORKING_PATH = "/tmp/druid-indexing";

    static {
        injector = Initialization.makeInjectorWithModules(GuiceInjectors.makeStartupInjector(),
                ImmutableList.of(new Module() {
                    @Override
                    public void configure(Binder binder) {
                        JsonConfigProvider.bindInstance(binder, Key.get(DruidNode.class, Self.class),
                                new DruidNode("hadoop-indexer", null, false, null, null, true, false));
                        JsonConfigProvider.bind(binder, "druid.hadoop.security.kerberos",
                                HadoopKerberosConfig.class);
                    }
                }, new IndexingHadoopModule()));
        JSON_MAPPER = injector.getInstance(ObjectMapper.class);
        INDEX_IO = injector.getInstance(IndexIO.class);
        INDEX_MERGER_V9 = injector.getInstance(IndexMergerV9.class);
        HADOOP_KERBEROS_CONFIG = injector.getInstance(HadoopKerberosConfig.class);
        DATA_SEGMENT_PUSHER = injector.getInstance(DataSegmentPusher.class);
    }

    public enum IndexJobCounters {
        INVALID_ROW_COUNTER, ROWS_PROCESSED_COUNTER, ROWS_PROCESSED_WITH_ERRORS_COUNTER, ROWS_UNPARSEABLE_COUNTER, ROWS_THROWN_AWAY_COUNTER
    }

    public static HadoopDruidIndexerConfig fromSpec(HadoopIngestionSpec spec) {
        return new HadoopDruidIndexerConfig(spec);
    }

    public static HadoopDruidIndexerConfig fromMap(Map<String, Object> argSpec) {
        // Eventually PathSpec needs to get rid of its Hadoop dependency, then maybe this can be ingested directly without
        // the Map<> intermediary

        if (argSpec.containsKey("spec")) {
            return HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(argSpec, HadoopDruidIndexerConfig.class);
        }
        return new HadoopDruidIndexerConfig(
                HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(argSpec, HadoopIngestionSpec.class));
    }

    @SuppressWarnings("unchecked")
    public static HadoopDruidIndexerConfig fromFile(File file) {
        try {
            return fromMap(HadoopDruidIndexerConfig.JSON_MAPPER.readValue(file,
                    JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @SuppressWarnings("unchecked")
    public static HadoopDruidIndexerConfig fromString(String str) {
        // This is a map to try and prevent dependency screwbally-ness
        try {
            return fromMap(HadoopDruidIndexerConfig.JSON_MAPPER.readValue(str,
                    JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @SuppressWarnings("unchecked")
    public static HadoopDruidIndexerConfig fromDistributedFileSystem(String path) {
        try {
            Path pt = new Path(path);
            FileSystem fs = pt.getFileSystem(new Configuration());
            Reader reader = new InputStreamReader(fs.open(pt), StandardCharsets.UTF_8);

            return fromMap(HadoopDruidIndexerConfig.JSON_MAPPER.readValue(reader,
                    JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT));
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public static HadoopDruidIndexerConfig fromConfiguration(Configuration conf) {
        final HadoopDruidIndexerConfig retVal = fromString(conf.get(HadoopDruidIndexerConfig.CONFIG_PROPERTY));
        retVal.verify();
        return retVal;
    }

    private HadoopIngestionSpec schema;
    private PathSpec pathSpec;
    private String hadoopJobIdFileName;
    private final Map<Long, ShardSpecLookup> shardSpecLookups = new HashMap<>();
    private final Map<Long, Map<ShardSpec, HadoopyShardSpec>> hadoopShardSpecLookup = new HashMap<>();
    private final Granularity rollupGran;
    private final List<String> allowedHadoopPrefix;

    @JsonCreator
    public HadoopDruidIndexerConfig(final @JsonProperty("spec") HadoopIngestionSpec spec) {
        this.schema = spec;
        this.pathSpec = JSON_MAPPER.convertValue(spec.getIOConfig().getPathSpec(), PathSpec.class);
        for (Map.Entry<Long, List<HadoopyShardSpec>> entry : spec.getTuningConfig().getShardSpecs().entrySet()) {
            if (entry.getValue() == null || entry.getValue().isEmpty()) {
                continue;
            }
            final ShardSpec actualSpec = entry.getValue().get(0).getActualSpec();
            shardSpecLookups.put(entry.getKey(), actualSpec
                    .getLookup(Lists.transform(entry.getValue(), new Function<HadoopyShardSpec, ShardSpec>() {
                        @Override
                        public ShardSpec apply(HadoopyShardSpec input) {
                            return input.getActualSpec();
                        }
                    })));

            Map<ShardSpec, HadoopyShardSpec> innerHadoopShardSpecLookup = new HashMap<>();
            for (HadoopyShardSpec hadoopyShardSpec : entry.getValue()) {
                innerHadoopShardSpecLookup.put(hadoopyShardSpec.getActualSpec(), hadoopyShardSpec);
            }
            hadoopShardSpecLookup.put(entry.getKey(), innerHadoopShardSpecLookup);

        }
        this.rollupGran = spec.getDataSchema().getGranularitySpec().getQueryGranularity();

        // User-specified list plus our additional bonus list.
        this.allowedHadoopPrefix = new ArrayList<>();
        this.allowedHadoopPrefix.add("druid.storage");
        this.allowedHadoopPrefix.add("druid.javascript");
        this.allowedHadoopPrefix.addAll(DATA_SEGMENT_PUSHER.getAllowedPropertyPrefixesForHadoop());
        this.allowedHadoopPrefix.addAll(spec.getTuningConfig().getUserAllowedHadoopPrefix());
    }

    @JsonProperty(value = "spec")
    public HadoopIngestionSpec getSchema() {
        return schema;
    }

    @JsonIgnore
    public PathSpec getPathSpec() {
        return pathSpec;
    }

    public String getDataSource() {
        return schema.getDataSchema().getDataSource();
    }

    public GranularitySpec getGranularitySpec() {
        return schema.getDataSchema().getGranularitySpec();
    }

    public void setGranularitySpec(GranularitySpec granularitySpec) {
        this.schema = schema.withDataSchema(schema.getDataSchema().withGranularitySpec(granularitySpec));
        this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
    }

    public DimensionBasedPartitionsSpec getPartitionsSpec() {
        return schema.getTuningConfig().getPartitionsSpec();
    }

    public IndexSpec getIndexSpec() {
        return schema.getTuningConfig().getIndexSpec();
    }

    public IndexSpec getIndexSpecForIntermediatePersists() {
        return schema.getTuningConfig().getIndexSpecForIntermediatePersists();
    }

    public boolean isOverwriteFiles() {
        return schema.getTuningConfig().isOverwriteFiles();
    }

    public void setShardSpecs(Map<Long, List<HadoopyShardSpec>> shardSpecs) {
        this.schema = schema.withTuningConfig(schema.getTuningConfig().withShardSpecs(shardSpecs));
        this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
    }

    public Optional<List<Interval>> getIntervals() {
        Optional<SortedSet<Interval>> setOptional = schema.getDataSchema().getGranularitySpec().bucketIntervals();
        if (setOptional.isPresent()) {
            return Optional.of((List<Interval>) JodaUtils.condenseIntervals(setOptional.get()));
        } else {
            return Optional.absent();
        }
    }

    public boolean isDeterminingPartitions() {
        return schema.getTuningConfig().getPartitionsSpec().needsDeterminePartitions(true);
    }

    public int getTargetPartitionSize() {
        final Integer targetPartitionSize = schema.getTuningConfig().getPartitionsSpec().getMaxRowsPerSegment();
        return targetPartitionSize == null ? -1 : targetPartitionSize;
    }

    public boolean isForceExtendableShardSpecs() {
        return schema.getTuningConfig().isForceExtendableShardSpecs();
    }

    public boolean isUpdaterJobSpecSet() {
        return (schema.getIOConfig().getMetadataUpdateSpec() != null);
    }

    public boolean isCombineText() {
        return schema.getTuningConfig().isCombineText();
    }

    public InputRowParser getParser() {
        return schema.getDataSchema().getParser();
    }

    public HadoopyShardSpec getShardSpec(Bucket bucket) {
        return schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis()).get(bucket.partitionNum);
    }

    public int getShardSpecCount(Bucket bucket) {
        return schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis()).size();
    }

    public boolean isLogParseExceptions() {
        return schema.getTuningConfig().isLogParseExceptions();
    }

    public int getMaxParseExceptions() {
        return schema.getTuningConfig().getMaxParseExceptions();
    }

    public boolean isUseYarnRMJobStatusFallback() {
        return schema.getTuningConfig().isUseYarnRMJobStatusFallback();
    }

    public void setHadoopJobIdFileName(String hadoopJobIdFileName) {
        this.hadoopJobIdFileName = hadoopJobIdFileName;
    }

    public String getHadoopJobIdFileName() {
        return hadoopJobIdFileName;
    }

    /**
     * Job instance should have Configuration set (by calling {@link #addJobProperties(Job)}
     * or via injected system properties) before this method is called.  The {@link PathSpec} may
     * create objects which depend on the values of these configurations.
     * @param job
     * @return
     * @throws IOException
     */
    public Job addInputPaths(Job job) throws IOException {
        return pathSpec.addInputPaths(this, job);
    }

    /********************************************
     Granularity/Bucket Helper Methods
     ********************************************/

    /**
     * Get the proper bucket for some input row.
     *
     * @param inputRow an InputRow
     *
     * @return the Bucket that this row belongs to
     */
    public Optional<Bucket> getBucket(InputRow inputRow) {
        final Optional<Interval> timeBucket = schema.getDataSchema().getGranularitySpec()
                .bucketInterval(DateTimes.utc(inputRow.getTimestampFromEpoch()));
        if (!timeBucket.isPresent()) {
            return Optional.absent();
        }
        final DateTime bucketStart = timeBucket.get().getStart();
        final ShardSpec actualSpec = shardSpecLookups.get(bucketStart.getMillis())
                .getShardSpec(rollupGran.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
        final HadoopyShardSpec hadoopyShardSpec = hadoopShardSpecLookup.get(bucketStart.getMillis())
                .get(actualSpec);

        return Optional.of(new Bucket(hadoopyShardSpec.getShardNum(), bucketStart, actualSpec.getPartitionNum()));

    }

    public Optional<Set<Interval>> getSegmentGranularIntervals() {
        return Optional.fromNullable(
                (Set<Interval>) schema.getDataSchema().getGranularitySpec().bucketIntervals().orNull());
    }

    public List<Interval> getInputIntervals() {
        return schema.getDataSchema().getGranularitySpec().inputIntervals();
    }

    public Optional<Iterable<Bucket>> getAllBuckets() {
        Optional<Set<Interval>> intervals = getSegmentGranularIntervals();
        if (intervals.isPresent()) {
            return Optional.of((Iterable<Bucket>) FunctionalIterable.create(intervals.get())
                    .transformCat(new Function<Interval, Iterable<Bucket>>() {
                        @Override
                        public Iterable<Bucket> apply(Interval input) {
                            final DateTime bucketTime = input.getStart();
                            final List<HadoopyShardSpec> specs = schema.getTuningConfig().getShardSpecs()
                                    .get(bucketTime.getMillis());
                            if (specs == null) {
                                return ImmutableList.of();
                            }

                            return FunctionalIterable.create(specs)
                                    .transform(new Function<HadoopyShardSpec, Bucket>() {
                                        int i = 0;

                                        @Override
                                        public Bucket apply(HadoopyShardSpec input) {
                                            return new Bucket(input.getShardNum(), bucketTime, i++);
                                        }
                                    });
                        }
                    }));
        } else {
            return Optional.absent();
        }
    }

    public String getWorkingPath() {
        final String workingPath = schema.getTuningConfig().getWorkingPath();
        return workingPath == null ? DEFAULT_WORKING_PATH : workingPath;
    }

    /******************************************
     Path helper logic
     ******************************************/

    /**
     * Make the intermediate path for this job run.
     *
     * @return the intermediate path for this job run.
     */

    public Path makeIntermediatePath() {
        return new Path(StringUtils.format("%s/%s/%s_%s", getWorkingPath(), schema.getDataSchema().getDataSource(),
                StringUtils.removeChar(schema.getTuningConfig().getVersion(), ':'), schema.getUniqueId()));
    }

    public Path makeSegmentPartitionInfoPath(Interval bucketInterval) {
        return new Path(StringUtils.format("%s/%s_%s/partitions.json", makeIntermediatePath(),
                ISODateTimeFormat.basicDateTime().print(bucketInterval.getStart()),
                ISODateTimeFormat.basicDateTime().print(bucketInterval.getEnd())));
    }

    public Path makeIntervalInfoPath() {
        return new Path(StringUtils.format("%s/intervals.json", makeIntermediatePath()));
    }

    public Path makeDescriptorInfoDir() {
        return new Path(makeIntermediatePath(), "segmentDescriptorInfo");
    }

    public Path makeGroupedDataDir() {
        return new Path(makeIntermediatePath(), "groupedData");
    }

    public Path makeDescriptorInfoPath(DataSegment segment) {
        return new Path(makeDescriptorInfoDir(), StringUtils.removeChar(segment.getId() + ".json", ':'));
    }

    public void addJobProperties(Job job) {
        addJobProperties(job.getConfiguration());
    }

    public void addJobProperties(Configuration conf) {
        for (final Map.Entry<String, String> entry : schema.getTuningConfig().getJobProperties().entrySet()) {
            conf.set(entry.getKey(), entry.getValue());
        }
    }

    public void intoConfiguration(Job job) {
        Configuration conf = job.getConfiguration();

        try {
            conf.set(HadoopDruidIndexerConfig.CONFIG_PROPERTY,
                    HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(this));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public void verify() {
        Preconditions.checkNotNull(schema.getDataSchema().getDataSource(), "dataSource");
        Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec(), "parseSpec");
        Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec().getTimestampSpec(),
                "timestampSpec");
        Preconditions.checkNotNull(schema.getDataSchema().getGranularitySpec(), "granularitySpec");
        Preconditions.checkNotNull(pathSpec, "inputSpec");
        Preconditions.checkNotNull(schema.getTuningConfig().getWorkingPath(), "workingPath");
        Preconditions.checkNotNull(schema.getIOConfig().getSegmentOutputPath(), "segmentOutputPath");
        Preconditions.checkNotNull(schema.getTuningConfig().getVersion(), "version");
    }

    public List<String> getAllowedHadoopPrefix() {
        return allowedHadoopPrefix;
    }
}