org.apache.druid.indexer.BatchDeltaIngestionTest.java Source code

Introduction

Here is the source code for org.apache.druid.indexer.BatchDeltaIngestionTest.java, a JUnit test of Apache Druid's Hadoop batch indexer. It covers re-indexing an existing segment (over the full interval, over a partial time window, and with a replacement set of aggregators) as well as delta ingestion, where an existing segment is combined with new raw input files.
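
The moving part shared by all of these tests is WindowedDataSegment, which restricts an already-published segment to a time window so that a "dataSource" inputSpec can read it back. The sketch below is illustrative only and uses the same Druid classes imported in the listing; existingSegment stands for a DataSegment already deserialized from its descriptor, as the static initializer in the test does.

// Minimal sketch, not part of the test file: read back only the first two hours of an
// existing segment (mirrors INTERVAL_PARTIAL below); `existingSegment` is assumed given.
WindowedDataSegment windowed =
        new WindowedDataSegment(existingSegment, Intervals.of("2014-10-22T00:00:00Z/PT2H"));
Map<String, Object> inputSpec = ImmutableMap.of(
        "type", "dataSource",
        "ingestionSpec", ImmutableMap.of("dataSource", "testds", "interval", windowed.getInterval()),
        "segments", ImmutableList.of(windowed));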

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexer;

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.commons.io.FileUtils;
import org.apache.druid.data.input.Firehose;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.impl.CSVParseSpec;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.StringInputRowParser;
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.hll.HyperLogLogCollector;
import org.apache.druid.indexer.hadoop.WindowedDataSegment;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.LongSumAggregatorFactory;
import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.apache.druid.segment.loading.LocalDataSegmentPuller;
import org.apache.druid.segment.realtime.firehose.IngestSegmentFirehose;
import org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter;
import org.apache.druid.segment.transform.TransformSpec;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec;
import org.joda.time.Interval;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;

public class BatchDeltaIngestionTest {
    @Rule
    public final TemporaryFolder temporaryFolder = new TemporaryFolder();

    private static final ObjectMapper MAPPER;
    private static final IndexIO INDEX_IO;
    private static final Interval INTERVAL_FULL = Intervals.of("2014-10-22T00:00:00Z/P1D");
    private static final Interval INTERVAL_PARTIAL = Intervals.of("2014-10-22T00:00:00Z/PT2H");
    private static final DataSegment SEGMENT;

    static {
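        // Register the shard spec type used by the bundled test segment, then load its descriptor
        // from the test resources and point the load spec at the local index.zip.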
        MAPPER = new DefaultObjectMapper();
        MAPPER.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));
        InjectableValues inject = new InjectableValues.Std().addValue(ObjectMapper.class, MAPPER)
                .addValue(DataSegment.PruneLoadSpecHolder.class, DataSegment.PruneLoadSpecHolder.DEFAULT);
        MAPPER.setInjectableValues(inject);
        INDEX_IO = HadoopDruidIndexerConfig.INDEX_IO;

        try {
            SEGMENT = MAPPER
                    .readValue(BatchDeltaIngestionTest.class.getClassLoader()
                            .getResource("test-segment/descriptor.json"), DataSegment.class)
                    .withLoadSpec(ImmutableMap.of("type", "local", "path", BatchDeltaIngestionTest.class
                            .getClassLoader().getResource("test-segment/index.zip").getPath()));
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

    /**
     * Re-indexes the full test segment through a "dataSource" inputSpec and verifies that the hourly
     * rollup (host, visited_sum, unique_hosts) read back from the new segment matches the original data.
     */
    @Test
    public void testReindexing() throws Exception {
        List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));

        HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
                ImmutableMap.of("type", "dataSource", "ingestionSpec",
                        ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL), "segments", segments),
                temporaryFolder.newFolder());

        List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
                ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host",
                        ImmutableList.of("a.example.com"), "visited_sum", 100L, "unique_hosts", 1.0d),
                ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host",
                        ImmutableList.of("b.example.com"), "visited_sum", 150L, "unique_hosts", 1.0d),
                ImmutableMap.of("time", DateTimes.of("2014-10-22T02:00:00.000Z"), "host",
                        ImmutableList.of("c.example.com"), "visited_sum", 200L, "unique_hosts", 1.0d));

        testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"),
                ImmutableList.of("visited_sum", "unique_hosts"));
    }

    /**
     * By default, re-indexing expects the same aggregators as those used by the original indexing
     * job. With the additional flag "useNewAggs" in DatasourcePathSpec, however, the user can supply
     * an arbitrary set of aggregators.
     * See https://github.com/apache/incubator-druid/issues/5277.
     */
    @Test
    public void testReindexingWithNewAggregators() throws Exception {
        List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));

        AggregatorFactory[] aggregators = new AggregatorFactory[] {
                new LongSumAggregatorFactory("visited_sum2", "visited_sum"),
                new HyperUniquesAggregatorFactory("unique_hosts2", "unique_hosts") };

        Map<String, Object> inputSpec = ImmutableMap.of("type", "dataSource", "ingestionSpec",
                ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL), "segments", segments,
                "useNewAggs", true);

        File tmpDir = temporaryFolder.newFolder();

        HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(inputSpec, tmpDir, aggregators);
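        // For reference, the inputSpec above is (roughly) the deserialized form of an
        // ioConfig.inputSpec JSON like:
        //   { "type": "dataSource",
        //     "ingestionSpec": { "dataSource": "testds",
        //                        "interval": "2014-10-22T00:00:00.000Z/2014-10-23T00:00:00.000Z" },
        //     "segments": [ ... ],
        //     "useNewAggs": true }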

        List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
                ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host",
                        ImmutableList.of("a.example.com"), "visited_sum2", 100L, "unique_hosts2", 1.0d),
                ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host",
                        ImmutableList.of("b.example.com"), "visited_sum2", 150L, "unique_hosts2", 1.0d),
                ImmutableMap.of("time", DateTimes.of("2014-10-22T02:00:00.000Z"), "host",
                        ImmutableList.of("c.example.com"), "visited_sum2", 200L, "unique_hosts2", 1.0d));

        testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"),
                ImmutableList.of("visited_sum2", "unique_hosts2"));
    }

    /**
     * Like {@link #testReindexing()}, but the segment is windowed to its first two hours
     * (INTERVAL_PARTIAL), so only the first two rows are expected in the output.
     */
    @Test
    public void testReindexingWithPartialWindow() throws Exception {
        List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_PARTIAL));

        HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
                ImmutableMap.of("type", "dataSource", "ingestionSpec",
                        ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL), "segments", segments),
                temporaryFolder.newFolder());

        List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
                ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host",
                        ImmutableList.of("a.example.com"), "visited_sum", 100L, "unique_hosts", 1.0d),
                ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host",
                        ImmutableList.of("b.example.com"), "visited_sum", 150L, "unique_hosts", 1.0d));

        testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"),
                ImmutableList.of("visited_sum", "unique_hosts"));
    }

    /**
     * Delta ingestion: a "multi" inputSpec merges the existing test segment with two newly written
     * static data files, and the rollup of the combined input is verified.
     */
    @Test
    public void testDeltaIngestion() throws Exception {
        File tmpDir = temporaryFolder.newFolder();

        File dataFile1 = new File(tmpDir, "data1");
        FileUtils.writeLines(dataFile1, ImmutableList.of("2014102200,a.example.com,a.example.com,90",
                "2014102201,b.example.com,b.example.com,25"));

        File dataFile2 = new File(tmpDir, "data2");
        FileUtils.writeLines(dataFile2, ImmutableList.of("2014102202,c.example.com,c.example.com,70"));
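        // Each line is timestamp,host,host2,visited_num, matching the CSVParseSpec configured in
        // makeHadoopDruidIndexerConfig.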

        // Use a Hadoop glob path to check that delta ingestion keeps working with Hadoop's
        // MultipleInputs and is not affected by
        // https://issues.apache.org/jira/browse/MAPREDUCE-5061
        String inputPath = tmpDir.getPath() + "/{data1,data2}";

        List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));

        HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(ImmutableMap.of("type", "multi", "children",
                ImmutableList.of(ImmutableMap.of("type", "dataSource", "ingestionSpec",
                        ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL), "segments", segments),
                        ImmutableMap.<String, Object>of("type", "static", "paths", inputPath))),
                temporaryFolder.newFolder());
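        // The "multi" inputSpec feeds both the existing segment and the static files into one job,
        // so each expected visited_sum below is the per-hour sum of the two sources
        // (100 + 90 = 190, 150 + 25 = 175, 200 + 70 = 270).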

        List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
                ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host",
                        ImmutableList.of("a.example.com"), "visited_sum", 190L, "unique_hosts", 1.0d),
                ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host",
                        ImmutableList.of("b.example.com"), "visited_sum", 175L, "unique_hosts", 1.0d),
                ImmutableMap.of("time", DateTimes.of("2014-10-22T02:00:00.000Z"), "host",
                        ImmutableList.of("c.example.com"), "visited_sum", 270L, "unique_hosts", 1.0d));

        testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"),
                ImmutableList.of("visited_sum", "unique_hosts"));
    }

    /**
     * Runs the IndexGeneratorJob for the given config, checks the descriptor and index files it
     * writes, then streams the rows of the generated segment back out through an
     * IngestSegmentFirehose and compares them against the expected rows.
     */
    private void testIngestion(HadoopDruidIndexerConfig config,
            List<ImmutableMap<String, Object>> expectedRowsGenerated, WindowedDataSegment windowedDataSegment,
            List<String> expectedDimensions, List<String> expectedMetrics) throws Exception {
        IndexGeneratorJob job = new IndexGeneratorJob(config);
        Assert.assertTrue(JobHelper.runJobs(ImmutableList.of(job), config));

        File segmentFolder = new File(
                StringUtils.format("%s/%s/%s_%s/%s/0", config.getSchema().getIOConfig().getSegmentOutputPath(),
                        config.getSchema().getDataSchema().getDataSource(), INTERVAL_FULL.getStart().toString(),
                        INTERVAL_FULL.getEnd().toString(), config.getSchema().getTuningConfig().getVersion()));

        Assert.assertTrue(segmentFolder.exists());

        File descriptor = new File(segmentFolder, "descriptor.json");
        File indexZip = new File(segmentFolder, "index.zip");
        Assert.assertTrue(descriptor.exists());
        Assert.assertTrue(indexZip.exists());

        DataSegment dataSegment = MAPPER.readValue(descriptor, DataSegment.class);
        Assert.assertEquals("website", dataSegment.getDataSource());
        Assert.assertEquals(config.getSchema().getTuningConfig().getVersion(), dataSegment.getVersion());
        Assert.assertEquals(INTERVAL_FULL, dataSegment.getInterval());
        Assert.assertEquals("local", dataSegment.getLoadSpec().get("type"));
        Assert.assertEquals(indexZip.getCanonicalPath(), dataSegment.getLoadSpec().get("path"));
        Assert.assertEquals(expectedDimensions, dataSegment.getDimensions());
        Assert.assertEquals(expectedMetrics, dataSegment.getMetrics());
        Assert.assertEquals(Integer.valueOf(9), dataSegment.getBinaryVersion());

        HashBasedNumberedShardSpec spec = (HashBasedNumberedShardSpec) dataSegment.getShardSpec();
        Assert.assertEquals(0, spec.getPartitionNum());
        Assert.assertEquals(1, spec.getPartitions());

        File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
        new LocalDataSegmentPuller().getSegmentFiles(dataSegment, tmpUnzippedSegmentDir);

        QueryableIndex index = INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
        StorageAdapter adapter = new QueryableIndexStorageAdapter(index);

        Firehose firehose = new IngestSegmentFirehose(
                ImmutableList.of(new WindowedStorageAdapter(adapter, windowedDataSegment.getInterval())),
                TransformSpec.NONE, expectedDimensions, expectedMetrics, null);

        List<InputRow> rows = Lists.newArrayList();
        while (firehose.hasMore()) {
            rows.add(firehose.nextRow());
        }

        verifyRows(expectedRowsGenerated, rows, expectedDimensions, expectedMetrics);
    }

    private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object> inputSpec, File tmpDir)
            throws Exception {
        return makeHadoopDruidIndexerConfig(inputSpec, tmpDir, null);
    }

    /**
     * Builds an indexer config for the "website" datasource: CSV parsing, DAY segment granularity
     * over INTERVAL_FULL, a single hashed shard, and either the supplied aggregators or the default
     * visited_sum / unique_hosts pair.
     */
    private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object> inputSpec, File tmpDir,
            AggregatorFactory[] aggregators) throws Exception {
        HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(
                new DataSchema("website",
                        MAPPER.convertValue(new StringInputRowParser(
                                new CSVParseSpec(new TimestampSpec("timestamp", "yyyyMMddHH", null),
                                        new DimensionsSpec(
                                                DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null,
                                                null),
                                        null, ImmutableList.of("timestamp", "host", "host2", "visited_num"), false,
                                        0),
                                null), Map.class),
                        aggregators != null ? aggregators
                                : new AggregatorFactory[] {
                                        new LongSumAggregatorFactory("visited_sum", "visited_num"),
                                        new HyperUniquesAggregatorFactory("unique_hosts", "host2") },
                        new UniformGranularitySpec(Granularities.DAY, Granularities.NONE,
                                ImmutableList.of(INTERVAL_FULL)),
                        null, MAPPER),
                new HadoopIOConfig(inputSpec, null, tmpDir.getCanonicalPath()),
                new HadoopTuningConfig(tmpDir.getCanonicalPath(), null, null, null, null, null, null, false, false,
                        false, false, null, false, false, null, null, null, false, false, null, null, null)));

        config.setShardSpecs(ImmutableMap.of(INTERVAL_FULL.getStartMillis(), ImmutableList.of(new HadoopyShardSpec(
                new HashBasedNumberedShardSpec(0, 1, null, HadoopDruidIndexerConfig.JSON_MAPPER), 0))));
        config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
        return config;
    }

    /**
     * Compares timestamp, dimensions and metrics row by row; HyperLogLog metrics are compared via
     * their estimated cardinality with a small tolerance.
     */
    private void verifyRows(List<ImmutableMap<String, Object>> expectedRows, List<InputRow> actualRows,
            List<String> expectedDimensions, List<String> expectedMetrics) {
        Assert.assertEquals(expectedRows.size(), actualRows.size());

        for (int i = 0; i < expectedRows.size(); i++) {
            Map<String, Object> expected = expectedRows.get(i);
            InputRow actual = actualRows.get(i);

            Assert.assertEquals(expected.get("time"), actual.getTimestamp());

            Assert.assertEquals(expectedDimensions, actual.getDimensions());

            expectedDimensions.forEach(s -> Assert.assertEquals(expected.get(s), actual.getDimension(s)));

            for (String metric : expectedMetrics) {
                Object actualValue = actual.getRaw(metric);
                if (actualValue instanceof HyperLogLogCollector) {
                    Assert.assertEquals((Double) expected.get(metric),
                            (Double) HyperUniquesAggregatorFactory.estimateCardinality(actualValue, false), 0.001);
                } else {
                    Assert.assertEquals(expected.get(metric), actual.getMetric(metric));
                }
            }
        }
    }
}