io.druid.indexer.BatchDeltaIngestionTest.java Source code

Introduction

Here is the source code for io.druid.indexer.BatchDeltaIngestionTest.java, a JUnit test of Druid's Hadoop-based batch delta ingestion. It reindexes an existing test segment over a full and a partial time window, combines that segment with new static input files, and verifies the rows of the segments produced by the Hadoop indexing jobs.

Source

/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.indexer;

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.metamx.common.Granularity;
import io.druid.data.input.Firehose;
import io.druid.data.input.InputRow;
import io.druid.data.input.impl.CSVParseSpec;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.StringInputRowParser;
import io.druid.data.input.impl.TimestampSpec;
import io.druid.granularity.QueryGranularity;
import io.druid.indexer.hadoop.WindowedDataSegment;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.query.aggregation.LongSumAggregatorFactory;
import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory;
import io.druid.segment.IndexIO;
import io.druid.segment.QueryableIndex;
import io.druid.segment.QueryableIndexStorageAdapter;
import io.druid.segment.StorageAdapter;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;
import io.druid.segment.loading.LocalDataSegmentPuller;
import io.druid.segment.realtime.firehose.IngestSegmentFirehose;
import io.druid.segment.realtime.firehose.WindowedStorageAdapter;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import org.apache.commons.io.FileUtils;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;

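/**
 * Tests Druid's Hadoop-based batch delta ingestion and reindexing: an existing test segment is
 * re-ingested over a full and a partial window, and combined with additional static input files,
 * and the rows of the resulting segments are verified.
 */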
public class BatchDeltaIngestionTest {
    @Rule
    public final TemporaryFolder temporaryFolder = new TemporaryFolder();

    private static final ObjectMapper MAPPER;
    private static final Interval INTERVAL_FULL = new Interval("2014-10-22T00:00:00Z/P1D");
    private static final Interval INTERVAL_PARTIAL = new Interval("2014-10-22T00:00:00Z/PT2H");
    private static final DataSegment SEGMENT;

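    // Register the "hashed" shard spec subtype and make the mapper injectable into deserialized
    // objects, then load the bundled test segment descriptor and point its load spec at the
    // local test-segment/index.zip.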
    static {
        MAPPER = new DefaultObjectMapper();
        MAPPER.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));
        InjectableValues inject = new InjectableValues.Std().addValue(ObjectMapper.class, MAPPER);
        MAPPER.setInjectableValues(inject);

        try {
            SEGMENT = new DefaultObjectMapper()
                    .readValue(BatchDeltaIngestionTest.class.getClassLoader()
                            .getResource("test-segment/descriptor.json"), DataSegment.class)
                    .withLoadSpec(
                            ImmutableMap.<String, Object>of("type", "local", "path", BatchDeltaIngestionTest.class
                                    .getClassLoader().getResource("test-segment/index.zip").getPath()));
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

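    /**
     * Reindexes the test segment over its full one-day interval; all three hourly rows are
     * expected in the output.
     */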
    @Test
    public void testReindexing() throws Exception {
        List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));

        HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
                ImmutableMap.<String, Object>of("type", "dataSource", "ingestionSpec",
                        ImmutableMap.of("dataSource", "xyz", "interval", INTERVAL_FULL), "segments", segments),
                temporaryFolder.newFolder());

        List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T00:00:00.000Z"), "host",
                        ImmutableList.of("a.example.com"), "visited_sum", 100L, "unique_hosts", 1.0d),
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T01:00:00.000Z"), "host",
                        ImmutableList.of("b.example.com"), "visited_sum", 150L, "unique_hosts", 1.0d),
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T02:00:00.000Z"), "host",
                        ImmutableList.of("c.example.com"), "visited_sum", 200L, "unique_hosts", 1.0d));

        testIngestion(config, expectedRows, Iterables.getOnlyElement(segments));
    }

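    /**
     * Reindexes the test segment through the two-hour INTERVAL_PARTIAL window; only the 00:00 and
     * 01:00 rows fall inside the window, so the 02:00 row is excluded.
     */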
    @Test
    public void testReindexingWithPartialWindow() throws Exception {
        List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_PARTIAL));

        HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
                ImmutableMap.<String, Object>of("type", "dataSource", "ingestionSpec",
                        ImmutableMap.of("dataSource", "xyz", "interval", INTERVAL_FULL), "segments", segments),
                temporaryFolder.newFolder());

        List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T00:00:00.000Z"), "host",
                        ImmutableList.of("a.example.com"), "visited_sum", 100L, "unique_hosts", 1.0d),
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T01:00:00.000Z"), "host",
                        ImmutableList.of("b.example.com"), "visited_sum", 150L, "unique_hosts", 1.0d));

        testIngestion(config, expectedRows, Iterables.getOnlyElement(segments));
    }

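    /**
     * Delta ingestion: combines the existing test segment with two new delta files via the "multi"
     * input spec, so visited_sum for each host is the sum of the segment value and the delta value.
     */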
    @Test
    public void testDeltaIngestion() throws Exception {
        File tmpDir = temporaryFolder.newFolder();

        File dataFile1 = new File(tmpDir, "data1");
        FileUtils.writeLines(dataFile1, ImmutableList.of("2014102200,a.example.com,a.example.com,90",
                "2014102201,b.example.com,b.example.com,25"));

        File dataFile2 = new File(tmpDir, "data2");
        FileUtils.writeLines(dataFile2, ImmutableList.of("2014102202,c.example.com,c.example.com,70"));

        // Use a Hadoop glob path to verify that delta ingestion keeps working with Hadoop's
        // MultipleInputs usage and is not affected by
        // https://issues.apache.org/jira/browse/MAPREDUCE-5061
        String inputPath = tmpDir.getPath() + "/{data1,data2}";

        List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));

        HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
                ImmutableMap.<String, Object>of("type", "multi", "children",
                        ImmutableList.of(
                                ImmutableMap.<String, Object>of("type", "dataSource", "ingestionSpec",
                                        ImmutableMap.of("dataSource", "xyz", "interval", INTERVAL_FULL), "segments",
                                        segments),
                                ImmutableMap.<String, Object>of("type", "static", "paths", inputPath))),
                temporaryFolder.newFolder());

        List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T00:00:00.000Z"), "host",
                        ImmutableList.of("a.example.com"), "visited_sum", 190L, "unique_hosts", 1.0d),
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T01:00:00.000Z"), "host",
                        ImmutableList.of("b.example.com"), "visited_sum", 175L, "unique_hosts", 1.0d),
                ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T02:00:00.000Z"), "host",
                        ImmutableList.of("c.example.com"), "visited_sum", 270L, "unique_hosts", 1.0d));

        testIngestion(config, expectedRows, Iterables.getOnlyElement(segments));
    }

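    /**
     * Runs the (legacy) index generator job for the given config, checks the generated segment's
     * descriptor metadata, then pulls the segment locally and reads it back through an
     * IngestSegmentFirehose to compare its rows against the expected values.
     */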
    private void testIngestion(HadoopDruidIndexerConfig config,
            List<ImmutableMap<String, Object>> expectedRowsGenerated, WindowedDataSegment windowedDataSegment)
            throws Exception {
        IndexGeneratorJob job = new LegacyIndexGeneratorJob(config);
        JobHelper.runJobs(ImmutableList.<Jobby>of(job), config);

        File segmentFolder = new File(
                String.format("%s/%s/%s_%s/%s/0", config.getSchema().getIOConfig().getSegmentOutputPath(),
                        config.getSchema().getDataSchema().getDataSource(), INTERVAL_FULL.getStart().toString(),
                        INTERVAL_FULL.getEnd().toString(), config.getSchema().getTuningConfig().getVersion()));

        Assert.assertTrue(segmentFolder.exists());

        File descriptor = new File(segmentFolder, "descriptor.json");
        File indexZip = new File(segmentFolder, "index.zip");
        Assert.assertTrue(descriptor.exists());
        Assert.assertTrue(indexZip.exists());

        DataSegment dataSegment = MAPPER.readValue(descriptor, DataSegment.class);
        Assert.assertEquals("website", dataSegment.getDataSource());
        Assert.assertEquals(config.getSchema().getTuningConfig().getVersion(), dataSegment.getVersion());
        Assert.assertEquals(INTERVAL_FULL, dataSegment.getInterval());
        Assert.assertEquals("local", dataSegment.getLoadSpec().get("type"));
        Assert.assertEquals(indexZip.getCanonicalPath(), dataSegment.getLoadSpec().get("path"));
        Assert.assertEquals("host", dataSegment.getDimensions().get(0));
        Assert.assertEquals("visited_sum", dataSegment.getMetrics().get(0));
        Assert.assertEquals("unique_hosts", dataSegment.getMetrics().get(1));
        Assert.assertEquals(Integer.valueOf(9), dataSegment.getBinaryVersion());

        HashBasedNumberedShardSpec spec = (HashBasedNumberedShardSpec) dataSegment.getShardSpec();
        Assert.assertEquals(0, spec.getPartitionNum());
        Assert.assertEquals(1, spec.getPartitions());

        File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
        new LocalDataSegmentPuller().getSegmentFiles(dataSegment, tmpUnzippedSegmentDir);

        QueryableIndex index = IndexIO.loadIndex(tmpUnzippedSegmentDir);
        StorageAdapter adapter = new QueryableIndexStorageAdapter(index);

        Firehose firehose = new IngestSegmentFirehose(
                ImmutableList.of(new WindowedStorageAdapter(adapter, windowedDataSegment.getInterval())),
                ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"), null,
                QueryGranularity.NONE);

        List<InputRow> rows = Lists.newArrayList();
        while (firehose.hasMore()) {
            rows.add(firehose.nextRow());
        }

        verifyRows(expectedRowsGenerated, rows);
    }

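    /**
     * Builds a HadoopDruidIndexerConfig for the "website" datasource: CSV input with a single
     * "host" dimension, longSum and hyperUnique aggregators, daily segment granularity, and a
     * single hash-based shard.
     */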
    private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object> inputSpec, File tmpDir)
            throws Exception {
        HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(
                new DataSchema("website",
                        MAPPER.convertValue(
                                new StringInputRowParser(
                                        new CSVParseSpec(new TimestampSpec("timestamp", "yyyyMMddHH", null),
                                                new DimensionsSpec(ImmutableList.of("host"), null, null), null,
                                                ImmutableList.of("timestamp", "host", "host2", "visited_num"))),
                                Map.class),
                        new AggregatorFactory[] { new LongSumAggregatorFactory("visited_sum", "visited_num"),
                                new HyperUniquesAggregatorFactory("unique_hosts", "host2") },
                        new UniformGranularitySpec(Granularity.DAY, QueryGranularity.NONE,
                                ImmutableList.of(INTERVAL_FULL)),
                        MAPPER),
                new HadoopIOConfig(inputSpec, null, tmpDir.getCanonicalPath()),
                new HadoopTuningConfig(tmpDir.getCanonicalPath(), null, null, null, null, null, false, false, false,
                        false, null, false, false, false, null, null, false)));

        config.setShardSpecs(ImmutableMap.<DateTime, List<HadoopyShardSpec>>of(INTERVAL_FULL.getStart(),
                ImmutableList.of(new HadoopyShardSpec(
                        new HashBasedNumberedShardSpec(0, 1, HadoopDruidIndexerConfig.jsonMapper), 0))));
        config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
        return config;
    }

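    /**
     * Compares the rows read back from the segment with the expected rows, including the
     * HyperLogLog cardinality estimate for unique_hosts (within a small tolerance).
     */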
    private void verifyRows(List<ImmutableMap<String, Object>> expectedRows, List<InputRow> actualRows) {
        System.out.println("actualRows = " + actualRows);
        Assert.assertEquals(expectedRows.size(), actualRows.size());

        for (int i = 0; i < expectedRows.size(); i++) {
            Map<String, Object> expected = expectedRows.get(i);
            InputRow actual = actualRows.get(i);

            Assert.assertEquals(ImmutableList.of("host"), actual.getDimensions());

            Assert.assertEquals(expected.get("time"), actual.getTimestamp());
            Assert.assertEquals(expected.get("host"), actual.getDimension("host"));
            Assert.assertEquals(expected.get("visited_sum"), actual.getLongMetric("visited_sum"));
            Assert.assertEquals((Double) expected.get("unique_hosts"),
                    (Double) HyperUniquesAggregatorFactory.estimateCardinality(actual.getRaw("unique_hosts")),
                    0.001);
        }
    }
}