Java tutorial: HadoopDruidIndexerConfig.java

The listing below is the complete source of com.metamx.druid.indexer.HadoopDruidIndexerConfig, the JSON-deserialized configuration object that drives Druid's Hadoop-based batch indexer.
/*
 * Druid - a distributed column store.
 * Copyright (C) 2012 Metamarkets Group Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

package com.metamx.druid.indexer;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.metamx.common.Granularity;
import com.metamx.common.ISE;
import com.metamx.common.MapUtils;
import com.metamx.common.guava.FunctionalIterable;
import com.metamx.common.logger.Logger;
import com.metamx.druid.RegisteringNode;
import com.metamx.druid.aggregation.AggregatorFactory;
import com.metamx.druid.client.DataSegment;
import com.metamx.druid.index.v1.serde.Registererer;
import com.metamx.druid.indexer.data.DataSpec;
import com.metamx.druid.indexer.data.StringInputRowParser;
import com.metamx.druid.indexer.data.TimestampSpec;
import com.metamx.druid.indexer.data.ToLowercaseDataSpec;
import com.metamx.druid.indexer.granularity.GranularitySpec;
import com.metamx.druid.indexer.granularity.UniformGranularitySpec;
import com.metamx.druid.indexer.partitions.PartitionsSpec;
import com.metamx.druid.indexer.path.PathSpec;
import com.metamx.druid.indexer.rollup.DataRollupSpec;
import com.metamx.druid.indexer.updater.UpdaterJobSpec;
import com.metamx.druid.input.InputRow;
import com.metamx.druid.jackson.DefaultObjectMapper;
import com.metamx.druid.shard.ShardSpec;
import com.metamx.druid.utils.JodaUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.codehaus.jackson.JsonGenerator;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.format.ISODateTimeFormat;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Configuration for Druid's Hadoop-based batch indexer. Instances are
 * deserialized from a JSON spec via the static from* factory methods below.
 */
public class HadoopDruidIndexerConfig
{
  // "Unicode" is a Java charset alias that resolves to UTF-16.
  public static final Charset javaNativeCharset = Charset.forName("Unicode");

  // Field separators: the SOH control character (\u0001) and tab.
  public static final Splitter tagSplitter = Splitter.on("\u0001");
  public static final Joiner tagJoiner = Joiner.on("\u0001");
  public static final Splitter tabSplitter = Splitter.on("\t");
  public static final Joiner tabJoiner = Joiner.on("\t");

  public static final ObjectMapper jsonMapper;

  static {
    jsonMapper = new DefaultObjectMapper();
    jsonMapper.configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, true);
  }

  public static enum IndexJobCounters
  {
    INVALID_ROW_COUNTER
  }

  public static HadoopDruidIndexerConfig fromMap(Map<String, Object> argSpec)
  {
    if (argSpec.containsKey("registererers")) {
      List<Registererer> registererers = Lists.transform(
          MapUtils.getList(argSpec, "registererers"),
          new Function<Object, Registererer>()
          {
            @Override
            public Registererer apply(@Nullable Object input)
            {
              try {
                return (Registererer) Class.forName((String) input).newInstance();
              }
              catch (Exception e) {
                throw Throwables.propagate(e);
              }
            }
          }
      );

      RegisteringNode.registerHandlers(registererers, Arrays.asList(jsonMapper));
    }

    return jsonMapper.convertValue(argSpec, HadoopDruidIndexerConfig.class);
  }

  @SuppressWarnings("unchecked")
  public static HadoopDruidIndexerConfig fromFile(File file)
  {
    try {
      return fromMap((Map<String, Object>) jsonMapper.readValue(file, new TypeReference<Map<String, Object>>() {}));
    }
    catch (IOException e) {
      throw Throwables.propagate(e);
    }
  }

  @SuppressWarnings("unchecked")
  public static HadoopDruidIndexerConfig fromString(String str)
  {
    try {
      return fromMap((Map<String, Object>) jsonMapper.readValue(str, new TypeReference<Map<String, Object>>() {}));
    }
    catch (IOException e) {
      throw Throwables.propagate(e);
    }
  }

  public static HadoopDruidIndexerConfig fromConfiguration(Configuration conf)
  {
    final HadoopDruidIndexerConfig retVal = fromString(conf.get(CONFIG_PROPERTY));
    retVal.verify();
    return retVal;
  }

  private static final Logger log = new Logger(HadoopDruidIndexerConfig.class);

  private static final String CONFIG_PROPERTY = "druid.indexer.config";

  private volatile String dataSource;
  private volatile String timestampColumnName;
  private volatile String timestampFormat;
  private volatile DataSpec dataSpec;
  @Deprecated
  private volatile Granularity segmentGranularity;
  private volatile GranularitySpec granularitySpec;
  private volatile PathSpec pathSpec;
  private volatile String jobOutputDir;
  private volatile String segmentOutputDir;
  private volatile DateTime version = new DateTime();
  private volatile PartitionsSpec partitionsSpec;
  private volatile boolean leaveIntermediate = false;
  private volatile boolean cleanupOnFailure = true;
  private volatile Map<DateTime, List<HadoopyShardSpec>> shardSpecs = ImmutableMap.of();
  private volatile boolean overwriteFiles = false;
  private volatile DataRollupSpec rollupSpec;
  private volatile UpdaterJobSpec updaterJobSpec;
  private volatile boolean ignoreInvalidRows = false;
  private volatile List<String> registererers = Lists.newArrayList();

  @JsonCreator
  public HadoopDruidIndexerConfig(
      final @JsonProperty("intervals") List<Interval> intervals,
      final @JsonProperty("dataSource") String dataSource,
      final @JsonProperty("timestampColumnName") String timestampColumnName,
      final @JsonProperty("timestampFormat") String timestampFormat,
      final @JsonProperty("dataSpec") DataSpec dataSpec,
      final @JsonProperty("segmentGranularity") Granularity segmentGranularity,
      final @JsonProperty("granularitySpec") GranularitySpec granularitySpec,
      final @JsonProperty("pathSpec") PathSpec pathSpec,
      final @JsonProperty("jobOutputDir") String jobOutputDir,
      final @JsonProperty("segmentOutputDir") String segmentOutputDir,
      final @JsonProperty("version") DateTime version,
      final @JsonProperty("partitionDimension") String partitionDimension,
      final @JsonProperty("targetPartitionSize") Long targetPartitionSize,
      final @JsonProperty("partitionsSpec") PartitionsSpec partitionsSpec,
      final @JsonProperty("leaveIntermediate") boolean leaveIntermediate,
@JsonProperty("cleanupOnFailure") boolean cleanupOnFailure, final @JsonProperty("shardSpecs") Map<DateTime, List<HadoopyShardSpec>> shardSpecs, final @JsonProperty("overwriteFiles") boolean overwriteFiles, final @JsonProperty("rollupSpec") DataRollupSpec rollupSpec, final @JsonProperty("updaterJobSpec") UpdaterJobSpec updaterJobSpec, final @JsonProperty("ignoreInvalidRows") boolean ignoreInvalidRows, final @JsonProperty("registererers") List<String> registererers) { this.dataSource = dataSource; this.timestampColumnName = timestampColumnName; this.timestampFormat = timestampFormat; this.dataSpec = dataSpec; this.granularitySpec = granularitySpec; this.pathSpec = pathSpec; this.jobOutputDir = jobOutputDir; this.segmentOutputDir = segmentOutputDir; this.version = version; this.partitionsSpec = partitionsSpec; this.leaveIntermediate = leaveIntermediate; this.cleanupOnFailure = cleanupOnFailure; this.shardSpecs = shardSpecs; this.overwriteFiles = overwriteFiles; this.rollupSpec = rollupSpec; this.updaterJobSpec = updaterJobSpec; this.ignoreInvalidRows = ignoreInvalidRows; this.registererers = registererers; if (partitionsSpec != null) { Preconditions.checkArgument(partitionDimension == null && targetPartitionSize == null, "Cannot mix partitionsSpec with partitionDimension/targetPartitionSize"); this.partitionsSpec = partitionsSpec; } else { // Backwards compatibility this.partitionsSpec = new PartitionsSpec(partitionDimension, targetPartitionSize, false); } if (granularitySpec != null) { Preconditions.checkArgument(segmentGranularity == null && intervals == null, "Cannot mix granularitySpec with segmentGranularity/intervals"); } else { // Backwards compatibility this.segmentGranularity = segmentGranularity; if (segmentGranularity != null && intervals != null) { this.granularitySpec = new UniformGranularitySpec(segmentGranularity, intervals); } } } /** * Default constructor does nothing. The caller is expected to use the various setX methods. 
   */
  public HadoopDruidIndexerConfig()
  {
  }

  public List<Interval> getIntervals()
  {
    return JodaUtils.condenseIntervals(getGranularitySpec().bucketIntervals());
  }

  @Deprecated
  public void setIntervals(List<Interval> intervals)
  {
    Preconditions.checkState(this.granularitySpec == null, "Cannot mix setIntervals with granularitySpec");
    Preconditions.checkState(this.segmentGranularity != null, "Cannot use setIntervals without segmentGranularity");

    // For backwards compatibility
    this.granularitySpec = new UniformGranularitySpec(this.segmentGranularity, intervals);
  }

  @JsonProperty
  public String getDataSource()
  {
    return dataSource;
  }

  public void setDataSource(String dataSource)
  {
    this.dataSource = dataSource.toLowerCase();
  }

  @JsonProperty("timestampColumn")
  public String getTimestampColumnName()
  {
    return timestampColumnName;
  }

  public void setTimestampColumnName(String timestampColumnName)
  {
    this.timestampColumnName = timestampColumnName;
  }

  @JsonProperty()
  public String getTimestampFormat()
  {
    return timestampFormat;
  }

  public void setTimestampFormat(String timestampFormat)
  {
    this.timestampFormat = timestampFormat;
  }

  public TimestampSpec getTimestampSpec()
  {
    return new TimestampSpec(timestampColumnName, timestampFormat);
  }

  @JsonProperty
  public DataSpec getDataSpec()
  {
    return dataSpec;
  }

  public void setDataSpec(DataSpec dataSpec)
  {
    this.dataSpec = new ToLowercaseDataSpec(dataSpec);
  }

  public StringInputRowParser getParser()
  {
    final List<String> dimensionExclusions;

    if (getDataSpec().hasCustomDimensions()) {
      dimensionExclusions = null;
    } else {
      dimensionExclusions = Lists.newArrayList();
      dimensionExclusions.add(getTimestampColumnName());
      dimensionExclusions.addAll(
          Lists.transform(
              getRollupSpec().getAggs(),
              new Function<AggregatorFactory, String>()
              {
                @Override
                public String apply(AggregatorFactory aggregatorFactory)
                {
                  return aggregatorFactory.getName();
                }
              }
          )
      );
    }

    return new StringInputRowParser(getTimestampSpec(), getDataSpec(), dimensionExclusions);
  }

  @JsonProperty
  public GranularitySpec getGranularitySpec()
  {
    return granularitySpec;
  }

  public void setGranularitySpec(GranularitySpec granularitySpec)
  {
    this.granularitySpec = granularitySpec;
  }

  @JsonProperty
  public PartitionsSpec getPartitionsSpec()
  {
    return partitionsSpec;
  }

  public void setPartitionsSpec(PartitionsSpec partitionsSpec)
  {
    this.partitionsSpec = partitionsSpec;
  }

  @JsonProperty
  public PathSpec getPathSpec()
  {
    return pathSpec;
  }

  public void setPathSpec(PathSpec pathSpec)
  {
    this.pathSpec = pathSpec;
  }

  @JsonProperty("workingPath")
  public String getJobOutputDir()
  {
    return jobOutputDir;
  }

  public void setJobOutputDir(String jobOutputDir)
  {
    this.jobOutputDir = jobOutputDir;
  }

  @JsonProperty("segmentOutputPath")
  public String getSegmentOutputDir()
  {
    return segmentOutputDir;
  }

  public void setSegmentOutputDir(String segmentOutputDir)
  {
    this.segmentOutputDir = segmentOutputDir;
  }

  @JsonProperty
  public DateTime getVersion()
  {
    return version;
  }

  public void setVersion(DateTime version)
  {
    this.version = version;
  }

  public String getPartitionDimension()
  {
    return partitionsSpec.getPartitionDimension();
  }

  public boolean partitionByDimension()
  {
    return partitionsSpec.isDeterminingPartitions();
  }

  public Long getTargetPartitionSize()
  {
    return partitionsSpec.getTargetPartitionSize();
  }

  public boolean isUpdaterJobSpecSet()
  {
    return (updaterJobSpec != null);
  }

  @JsonProperty
  public boolean isLeaveIntermediate()
  {
    return leaveIntermediate;
  }

  public void setLeaveIntermediate(boolean leaveIntermediate)
  {
    this.leaveIntermediate = leaveIntermediate;
  }

  @JsonProperty
  public boolean isCleanupOnFailure()
  {
    return cleanupOnFailure;
  }

  public void setCleanupOnFailure(boolean cleanupOnFailure)
  {
    this.cleanupOnFailure = cleanupOnFailure;
  }

  @JsonProperty
  public Map<DateTime, List<HadoopyShardSpec>> getShardSpecs()
  {
    return shardSpecs;
  }

  public void setShardSpecs(Map<DateTime, List<HadoopyShardSpec>> shardSpecs)
  {
    this.shardSpecs = Collections.unmodifiableMap(shardSpecs);
  }

  @JsonProperty
  public boolean isOverwriteFiles()
  {
    return overwriteFiles;
  }

  public void setOverwriteFiles(boolean overwriteFiles)
  {
    this.overwriteFiles = overwriteFiles;
  }

  @JsonProperty
  public DataRollupSpec getRollupSpec()
  {
    return rollupSpec;
  }

  public void setRollupSpec(DataRollupSpec rollupSpec)
  {
    this.rollupSpec = rollupSpec;
  }

  @JsonProperty
  public UpdaterJobSpec getUpdaterJobSpec()
  {
    return updaterJobSpec;
  }

  public void setUpdaterJobSpec(UpdaterJobSpec updaterJobSpec)
  {
    this.updaterJobSpec = updaterJobSpec;
  }

  @JsonProperty
  public boolean isIgnoreInvalidRows()
  {
    return ignoreInvalidRows;
  }

  public void setIgnoreInvalidRows(boolean ignoreInvalidRows)
  {
    this.ignoreInvalidRows = ignoreInvalidRows;
  }

  @JsonProperty
  public List<String> getRegistererers()
  {
    return registererers;
  }

  public void setRegistererers(List<String> registererers)
  {
    this.registererers = registererers;
  }

  /********************************************
   Granularity/Bucket Helper Methods
   ********************************************/

  /**
   * Get the proper bucket for some input row.
   *
   * @param inputRow an InputRow
   *
   * @return the Bucket that this row belongs to
   */
  public Optional<Bucket> getBucket(InputRow inputRow)
  {
    final Optional<Interval> timeBucket = getGranularitySpec().bucketInterval(
        new DateTime(inputRow.getTimestampFromEpoch())
    );
    if (!timeBucket.isPresent()) {
      return Optional.absent();
    }

    final List<HadoopyShardSpec> shards = shardSpecs.get(timeBucket.get().getStart());
    if (shards == null || shards.isEmpty()) {
      return Optional.absent();
    }

    for (final HadoopyShardSpec hadoopyShardSpec : shards) {
      final ShardSpec actualSpec = hadoopyShardSpec.getActualSpec();
      if (actualSpec.isInChunk(inputRow)) {
        return Optional.of(
            new Bucket(
                hadoopyShardSpec.getShardNum(),
                timeBucket.get().getStart(),
                actualSpec.getPartitionNum()
            )
        );
      }
    }

    throw new ISE("row[%s] doesn't fit in any shard[%s]", inputRow, shards);
  }

  public Set<Interval> getSegmentGranularIntervals()
  {
    return granularitySpec.bucketIntervals();
  }

  public Iterable<Bucket> getAllBuckets()
  {
    return FunctionalIterable
        .create(getSegmentGranularIntervals())
        .transformCat(
            new Function<Interval, Iterable<Bucket>>()
            {
              @Override
              public Iterable<Bucket> apply(Interval input)
              {
                final DateTime bucketTime = input.getStart();
                final List<HadoopyShardSpec> specs = shardSpecs.get(bucketTime);
                if (specs == null) {
                  return ImmutableList.of();
                }

                return FunctionalIterable
                    .create(specs)
                    .transform(
                        new Function<HadoopyShardSpec, Bucket>()
                        {
                          int i = 0;

                          @Override
                          public Bucket apply(HadoopyShardSpec input)
                          {
                            return new Bucket(input.getShardNum(), bucketTime, i++);
                          }
                        }
                    );
              }
            }
        );
  }

  public HadoopyShardSpec getShardSpec(Bucket bucket)
  {
    return shardSpecs.get(bucket.time).get(bucket.partitionNum);
  }

  /******************************************
   Path helper logic
   ******************************************/

  /**
   * Make the intermediate path for this job run.
   *
   * @return the intermediate path for this job run.
   */
  public Path makeIntermediatePath()
  {
    return new Path(String.format("%s/%s/%s", getJobOutputDir(), dataSource, getVersion().toString().replace(":", "")));
  }

  public Path makeSegmentPartitionInfoPath(Bucket bucket)
  {
    final Interval bucketInterval = getGranularitySpec().bucketInterval(bucket.time).get();

    return new Path(
        String.format(
            "%s/%s_%s/partitions.json",
            makeIntermediatePath(),
            ISODateTimeFormat.basicDateTime().print(bucketInterval.getStart()),
            ISODateTimeFormat.basicDateTime().print(bucketInterval.getEnd())
        )
    );
  }

  public Path makeDescriptorInfoDir()
  {
    return new Path(makeIntermediatePath(), "segmentDescriptorInfo");
  }

  public Path makeGroupedDataDir()
  {
    return new Path(makeIntermediatePath(), "groupedData");
  }

  public Path makeDescriptorInfoPath(DataSegment segment)
  {
    return new Path(makeDescriptorInfoDir(), String.format("%s.json", segment.getIdentifier().replace(":", "")));
  }

  public Path makeSegmentOutputPath(Bucket bucket)
  {
    final Interval bucketInterval = getGranularitySpec().bucketInterval(bucket.time).get();

    return new Path(
        String.format(
            "%s/%s_%s/%s/%s",
            getSegmentOutputDir(),
            bucketInterval.getStart().toString(),
            bucketInterval.getEnd().toString(),
            getVersion().toString(),
            bucket.partitionNum
        )
    );
  }

  public Job addInputPaths(Job job) throws IOException
  {
    return pathSpec.addInputPaths(this, job);
  }

  public void intoConfiguration(Job job)
  {
    Configuration conf = job.getConfiguration();

    try {
      conf.set(CONFIG_PROPERTY, jsonMapper.writeValueAsString(this));
    }
    catch (IOException e) {
      throw Throwables.propagate(e);
    }
  }

  public void verify()
  {
    try {
      log.info("Running with config:%n%s", jsonMapper.writerWithDefaultPrettyPrinter().writeValueAsString(this));
    }
    catch (IOException e) {
      throw Throwables.propagate(e);
    }

    Preconditions.checkNotNull(dataSource, "dataSource");
    Preconditions.checkNotNull(dataSpec, "dataSpec");
    Preconditions.checkNotNull(timestampColumnName, "timestampColumn");
    Preconditions.checkNotNull(timestampFormat, "timestampFormat");
    Preconditions.checkNotNull(granularitySpec, "granularitySpec");
    Preconditions.checkNotNull(pathSpec, "pathSpec");
    Preconditions.checkNotNull(jobOutputDir, "workingPath");
    Preconditions.checkNotNull(segmentOutputDir, "segmentOutputPath");
    Preconditions.checkNotNull(version, "version");
    Preconditions.checkNotNull(rollupSpec, "rollupSpec");

    final int nIntervals = getIntervals().size();
    Preconditions.checkArgument(nIntervals > 0, "intervals.size()[%s] <= 0", nIntervals);
  }
}
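Usage sketch (not part of the original source): the static from* factories build a config from a JSON spec whose top-level keys mirror the @JsonProperty names on the constructor above. The snippet below only shows how the config API fits together in a hypothetical driver; the class name and spec path are invented placeholders, and the spec file itself must supply dataSource, dataSpec, pathSpec, rollupSpec, and granularity settings for verify() to pass. Every method called on the config appears in the listing.

import com.metamx.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import java.io.File;

public class IndexerConfigSketch
{
  public static void main(String[] args) throws Exception
  {
    // Hypothetical spec path; keys follow the @JsonCreator properties
    // (dataSource, timestampColumnName, dataSpec, pathSpec, granularitySpec, ...).
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("indexer-spec.json"));

    // Fails fast if a required field is missing or no intervals are configured.
    config.verify();

    Job job = new Job(new Configuration());

    // Let the configured PathSpec register its input paths on the job.
    config.addInputPaths(job);

    // Serialize the whole config into the job's Configuration under
    // "druid.indexer.config" so tasks can rebuild it on the cluster side.
    config.intoConfiguration(job);

    // Task-side round trip: reads the property back, deserializes, and verifies.
    HadoopDruidIndexerConfig recovered = HadoopDruidIndexerConfig.fromConfiguration(job.getConfiguration());
    System.out.println("Recovered dataSource: " + recovered.getDataSource());
  }
}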