com.datatorrent.demos.dimensions.generic.DimensionStoreOperatorTest.java Source code

Introduction

Here is the source code for com.datatorrent.demos.dimensions.generic.DimensionStoreOperatorTest.java. The test suite exercises DimensionStoreOperator: it feeds aggregated ad events into a TFile-backed HDHT store and verifies range-query results for full keys, subset keys, and every dimension combination.

Source

/*
 * Copyright (c) 2014 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datatorrent.demos.dimensions.generic;

import java.io.File;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import com.google.common.collect.Maps;

import org.apache.commons.io.FileUtils;
import org.codehaus.jettison.json.JSONObject;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;

import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.contrib.hdht.tfile.TFileImpl;
import com.datatorrent.demos.dimensions.generic.DimensionStoreOperator.HDSRangeQueryResult;
import com.datatorrent.lib.testbench.CollectorTestSink;
import com.datatorrent.lib.util.TestUtils;
import com.google.common.util.concurrent.MoreExecutors;

public class DimensionStoreOperatorTest {
    @Rule
    public final TestUtils.TestInfo testInfo = new TestUtils.TestInfo();

    @Test
    public void testQueryFromHDS() throws Exception {
        File file = new File(testInfo.getDir());
        FileUtils.deleteDirectory(file);

        DimensionStoreOperator hdsOut = new DimensionStoreOperator() {
            @Override
            public void setup(OperatorContext arg0) {
                super.setup(arg0);
                super.writeExecutor = super.queryExecutor = MoreExecutors.sameThreadExecutor(); // synchronous processing
            }
        };
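        // Back the store with a TFile-based HDHT file implementation rooted in the per-test directory.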
        TFileImpl hdsFile = new TFileImpl.DefaultTFileImpl();
        hdsOut.setFileStore(hdsFile);
        hdsFile.setBasePath(testInfo.getDir());
        EventSchema eventSchema = GenericAggregateSerializerTest.getEventSchema();
        GenericAggregator aggregator = new GenericAggregator(eventSchema);
        aggregator.init("time=MINUTES:pubId:adId:adUnit");
        hdsOut.setEventSchemaJSON(GenericAggregateSerializerTest.TEST_SCHEMA_JSON);
        hdsOut.setAggregator(aggregator);
        hdsOut.setMaxCacheSize(1);
        hdsOut.setFlushIntervalCount(0);
        hdsOut.setup(null);

        CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult> queryResults = new CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult>();
        @SuppressWarnings({ "unchecked", "rawtypes" })
        CollectorTestSink<Object> tmp = (CollectorTestSink) queryResults;
        hdsOut.queryResult.setSink(tmp);

        hdsOut.beginWindow(1);

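        // Truncate the current time to a minute boundary, since aggregates are keyed at MINUTES granularity.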
        long baseTime = System.currentTimeMillis();
        long baseMinute = TimeUnit.MILLISECONDS.convert(TimeUnit.MINUTES.convert(baseTime, TimeUnit.MILLISECONDS),
                TimeUnit.MINUTES);

        // Events ae1 and ae2 fall into the same aggregation because they have the same key
        Map<String, Object> eventMap = Maps.newHashMap();
        eventMap.put("timestamp", baseMinute);
        eventMap.put("pubId", 1);
        eventMap.put("adId", 2);
        eventMap.put("adUnit", 3);
        eventMap.put("clicks", 10L);

        GenericAggregate ae1 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae1);

        // Modify the click count and create a new event
        eventMap.put("clicks", 20L);
        GenericAggregate ae2 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae2);

        // Reset clicks to 10, advance the timestamp by 1 minute, and create a new event
        eventMap.put("clicks", 10L);
        eventMap.put("timestamp", baseMinute + TimeUnit.MILLISECONDS.convert(1, TimeUnit.MINUTES));
        GenericAggregate ae3 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae3);

        hdsOut.endWindow();

        hdsOut.beginWindow(2);

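        // Build a range query for the full key combination covering a 20 minute window from baseMinute.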
        JSONObject keys = new JSONObject();
        keys.put("pubId", 1);
        keys.put("adId", 2);
        keys.put("adUnit", 3);

        JSONObject query = new JSONObject();
        query.put("numResults", "20");
        query.put("keys", keys);
        query.put("id", "query1");
        query.put("startTime", baseMinute);
        query.put("endTime", baseMinute + TimeUnit.MILLISECONDS.convert(20, TimeUnit.MINUTES));

        hdsOut.query.process(query.toString());

        Assert.assertEquals("timeSeriesQueries " + hdsOut.rangeQueries, 1, hdsOut.rangeQueries.size());
        DimensionStoreOperator.HDSRangeQuery aq = hdsOut.rangeQueries.values().iterator().next();
        Assert.assertEquals("numTimeUnits " + hdsOut.rangeQueries, baseMinute, aq.startTime);

        hdsOut.endWindow();

        Assert.assertEquals("queryResults " + queryResults.collectedTuples, 1, queryResults.collectedTuples.size());
        HDSRangeQueryResult r = queryResults.collectedTuples.iterator().next();
        Assert.assertEquals("result points " + r, 2, r.data.size());

        // The ae1 object is stored by reference in the cache; when a new tuple is
        // aggregated, the values are updated in ae1 itself, which would cause the
        // following check to fail:
        //Assert.assertEquals("clicks", ae1.clicks + ae2.clicks, r.data.get(0).clicks);
        Assert.assertEquals("clicks", 30L, r.data.get(0).get("clicks"));
        Assert.assertEquals("clicks", eventSchema.getValue(ae3, "clicks"), r.data.get(1).get("clicks"));
    }

    @Test
    public void testQueryFromHDSWithSubsetKeys() throws Exception {
        File file = new File(testInfo.getDir());
        FileUtils.deleteDirectory(file);

        DimensionStoreOperator hdsOut = new DimensionStoreOperator() {
            @Override
            public void setup(OperatorContext arg0) {
                super.setup(arg0);
                super.writeExecutor = super.queryExecutor = MoreExecutors.sameThreadExecutor(); // synchronous processing
            }
        };
        TFileImpl hdsFile = new TFileImpl.DefaultTFileImpl();
        hdsOut.setFileStore(hdsFile);
        hdsFile.setBasePath(testInfo.getDir());
        EventSchema eventSchema = GenericAggregateSerializerTest.getEventSchema();
        GenericAggregator aggregator = new GenericAggregator(eventSchema);
        aggregator.init("time=MINUTES:pubId:adId:adUnit");
        hdsOut.setEventSchemaJSON(GenericAggregateSerializerTest.TEST_SCHEMA_JSON);
        hdsOut.setAggregator(aggregator);
        hdsOut.setMaxCacheSize(1);
        hdsOut.setFlushIntervalCount(0);
        hdsOut.setup(null);

        CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult> queryResults = new CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult>();
        @SuppressWarnings({ "unchecked", "rawtypes" })
        CollectorTestSink<Object> tmp = (CollectorTestSink) queryResults;
        hdsOut.queryResult.setSink(tmp);

        hdsOut.beginWindow(1);

        long baseTime = System.currentTimeMillis();
        long baseMinute = TimeUnit.MILLISECONDS.convert(TimeUnit.MINUTES.convert(baseTime, TimeUnit.MILLISECONDS),
                TimeUnit.MINUTES);

        // Events ae1 and ae2 fall into the same aggregation because they have the same key
        Map<String, Object> eventMap = Maps.newHashMap();
        eventMap.put("timestamp", baseMinute);
        eventMap.put("pubId", 1);
        eventMap.put("adUnit", 3);
        eventMap.put("clicks", 10L);

        GenericAggregate ae1 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae1);

        // Modify the click count and create a new event
        eventMap.put("clicks", 20L);
        GenericAggregate ae2 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae2);

        // Reset clicks to 10, advance the timestamp by 1 minute, and create a new event
        eventMap.put("timestamp", baseMinute + TimeUnit.MILLISECONDS.convert(1, TimeUnit.MINUTES));
        eventMap.put("clicks", 10L);
        GenericAggregate ae3 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae3);

        hdsOut.endWindow();

        hdsOut.beginWindow(2);

        JSONObject keys = new JSONObject();
        keys.put("pubId", 1);
        keys.put("adUnit", 3);

        JSONObject query = new JSONObject();
        query.put("numResults", "20");
        query.put("keys", keys);
        query.put("id", "query1");
        query.put("startTime", baseMinute);
        query.put("endTime", baseMinute + TimeUnit.MILLISECONDS.convert(20, TimeUnit.MINUTES));

        hdsOut.query.process(query.toString());

        Assert.assertEquals("timeSeriesQueries " + hdsOut.rangeQueries, 1, hdsOut.rangeQueries.size());
        DimensionStoreOperator.HDSRangeQuery aq = hdsOut.rangeQueries.values().iterator().next();
        Assert.assertEquals("numTimeUnits " + hdsOut.rangeQueries, baseMinute, aq.startTime);

        hdsOut.endWindow();

        Assert.assertEquals("queryResults " + queryResults.collectedTuples, 1, queryResults.collectedTuples.size());
        HDSRangeQueryResult r = queryResults.collectedTuples.iterator().next();
        Assert.assertEquals("result points " + r, 2, r.data.size());

        // The ae1 object is stored by reference in the cache; when a new tuple is
        // aggregated, the values are updated in ae1 itself, which would cause the
        // following check to fail:
        //Assert.assertEquals("clicks", ae1.clicks + ae2.clicks, r.data.get(0).clicks);
        Assert.assertEquals("clicks", 30L, r.data.get(0).get("clicks"));
        Assert.assertEquals("clicks", eventSchema.getValue(ae3, "clicks"), r.data.get(1).get("clicks"));
        // When data is returned from HDHT, all schema keys are part of the
        // response; keys that were not queried come back as 0.
        Assert.assertEquals("from HDHT", 0, r.data.get(0).get("adId"));
        // Data served from the cache is expected to behave the same way:
        // keys not specified in the query come back as 0.
        Assert.assertEquals("from cache", 0, r.data.get(1).get("adId"));
    }

    @Test
    public void testQueryLessKeys() throws Exception {
        File file = new File(testInfo.getDir());
        FileUtils.deleteDirectory(file);

        DimensionStoreOperator hdsOut = new DimensionStoreOperator() {
            @Override
            public void setup(OperatorContext arg0) {
                super.setup(arg0);
                super.writeExecutor = super.queryExecutor = MoreExecutors.sameThreadExecutor(); // synchronous processing
            }
        };
        TFileImpl hdsFile = new TFileImpl.DefaultTFileImpl();
        hdsOut.setFileStore(hdsFile);
        hdsFile.setBasePath(testInfo.getDir());
        EventSchema eventSchema = GenericAggregateSerializerTest.getEventSchema();
        GenericAggregator aggregator = new GenericAggregator(eventSchema);
        aggregator.init("time=MINUTES:pubId:adId:adUnit");
        hdsOut.setEventSchemaJSON(GenericAggregateSerializerTest.TEST_SCHEMA_JSON);
        hdsOut.setAggregator(aggregator);
        hdsOut.setMaxCacheSize(100);
        hdsOut.setFlushIntervalCount(100);
        hdsOut.setup(null);

        CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult> queryResults = new CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult>();
        @SuppressWarnings({ "unchecked", "rawtypes" })
        CollectorTestSink<Object> tmp = (CollectorTestSink) queryResults;
        hdsOut.queryResult.setSink(tmp);

        hdsOut.beginWindow(1);

        long baseTime = System.currentTimeMillis();
        long baseMinute = TimeUnit.MILLISECONDS.convert(TimeUnit.MINUTES.convert(baseTime, TimeUnit.MILLISECONDS),
                TimeUnit.MINUTES);

        // Check aggregation for ae1 and ae2, which have the same key.
        Map<String, Object> eventMap = Maps.newHashMap();
        eventMap.put("timestamp", baseMinute);
        eventMap.put("pubId", 1);
        eventMap.put("adId", 2);
        eventMap.put("clicks", 10L);

        GenericAggregate ae1 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae1);

        // Modify the click count and create a new event
        eventMap.put("clicks", 20L);
        GenericAggregate ae2 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae2);

        // Reset clicks to 10, advance the timestamp by 1 minute, and create a new event
        eventMap.put("timestamp", baseMinute + TimeUnit.MILLISECONDS.convert(1, TimeUnit.MINUTES));
        eventMap.put("clicks", 10L);
        GenericAggregate ae3 = new GenericAggregate(eventSchema.convertMapToGenericEvent(eventMap));
        hdsOut.input.process(ae3);

        hdsOut.endWindow();

        hdsOut.beginWindow(2);

        JSONObject keys = new JSONObject();
        keys.put("pubId", 1);
        keys.put("adId", 2);

        JSONObject query = new JSONObject();
        query.put("numResults", "20");
        query.put("keys", keys);
        query.put("id", "query1");
        query.put("startTime", baseMinute);
        query.put("endTime", baseMinute + TimeUnit.MILLISECONDS.convert(20, TimeUnit.MINUTES));

        hdsOut.query.process(query.toString());

        Assert.assertEquals("timeSeriesQueries " + hdsOut.rangeQueries, 1, hdsOut.rangeQueries.size());
        DimensionStoreOperator.HDSRangeQuery aq = hdsOut.rangeQueries.values().iterator().next();
        Assert.assertEquals("numTimeUnits " + hdsOut.rangeQueries, baseMinute, aq.startTime);

        hdsOut.endWindow();

        Assert.assertEquals("queryResults " + queryResults.collectedTuples, 1, queryResults.collectedTuples.size());
        HDSRangeQueryResult r = queryResults.collectedTuples.iterator().next();
        Assert.assertEquals("result points " + r, 2, r.data.size());

        // The ae1 object is stored by reference in the cache; when a new tuple is
        // aggregated, the values are updated in ae1 itself, which would cause the
        // following check to fail:
        //Assert.assertEquals("clicks", ae1.clicks + ae2.clicks, r.data.get(0).clicks);
        Assert.assertEquals("clicks", 30L, r.data.get(0).get("clicks"));
        Assert.assertEquals("clicks", eventSchema.getValue(ae3, "clicks"), r.data.get(1).get("clicks"));
    }

    /* Test that queries with different key combinations work. */
    @Test
    public void testQueryLessKeys1() throws Exception {
        File file = new File(testInfo.getDir());
        FileUtils.deleteDirectory(file);

        EventSchema eventSchema = GenericAggregateSerializerTest.getEventSchema();

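        // All eight dimension combinations over pubId, adId and adUnit, each at minute granularity.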
        String[] dimensionSpecs = new String[] { "time=" + TimeUnit.MINUTES, "time=" + TimeUnit.MINUTES + ":adUnit",
                "time=" + TimeUnit.MINUTES + ":adId", "time=" + TimeUnit.MINUTES + ":pubId",
                "time=" + TimeUnit.MINUTES + ":adId:adUnit", "time=" + TimeUnit.MINUTES + ":pubId:adUnit",
                "time=" + TimeUnit.MINUTES + ":pubId:adId", "time=" + TimeUnit.MINUTES + ":pubId:adId:adUnit" };

        GenericAggregator[] aggregators = new GenericAggregator[dimensionSpecs.length];
        for (int i = 0; i < dimensionSpecs.length; i++) {
            GenericAggregator aggregator = new GenericAggregator(eventSchema);
            aggregator.init(dimensionSpecs[i]);
            aggregators[i] = aggregator;
        }

        DimensionStoreOperator hdsOut = new DimensionStoreOperator() {
            @Override
            public void setup(OperatorContext arg0) {
                super.setup(arg0);
                super.writeExecutor = super.queryExecutor = MoreExecutors.sameThreadExecutor(); // synchronous processing
            }
        };
        TFileImpl hdsFile = new TFileImpl.DefaultTFileImpl();
        hdsOut.setFileStore(hdsFile);
        hdsFile.setBasePath(testInfo.getDir());
        //GenericAggregator aggregator = new GenericAggregator(eventSchema);
        //aggregator.init("time=MINUTES:pubId:adId:adUnit");
        hdsOut.setEventSchemaJSON(GenericAggregateSerializerTest.TEST_SCHEMA_JSON);
        hdsOut.setAggregator(aggregators[0]);
        hdsOut.setMaxCacheSize(100);
        hdsOut.setFlushIntervalCount(100);
        hdsOut.setup(null);

        CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult> queryResults = new CollectorTestSink<DimensionStoreOperator.HDSRangeQueryResult>();
        @SuppressWarnings({ "unchecked", "rawtypes" })
        CollectorTestSink<Object> tmp = (CollectorTestSink) queryResults;
        hdsOut.queryResult.setSink(tmp);

        long baseTime = System.currentTimeMillis();
        long baseMinute = TimeUnit.MILLISECONDS.convert(TimeUnit.MINUTES.convert(baseTime, TimeUnit.MILLISECONDS),
                TimeUnit.MINUTES);

        int numMinutes = 5;
        int wid = 1;
        for (int i = 0; i < numMinutes; i++) {
            hdsOut.beginWindow(wid);

            long timestamp = baseMinute - TimeUnit.MINUTES.toMillis(i);

            // Create one event per minute; every event carries the same keys and 10 clicks.
            Map<String, Object> eventMap = Maps.newHashMap();
            eventMap.put("timestamp", timestamp);
            eventMap.put("pubId", 1);
            eventMap.put("adId", 2);
            eventMap.put("adUnit", 3);
            eventMap.put("clicks", 10L);

            GenericEvent ge = eventSchema.convertMapToGenericEvent(eventMap);

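            // Aggregate the same event under every dimension combination and feed each aggregate to the store.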
            int aggrIdx = 0;
            for (GenericAggregator aggregator : aggregators) {
                GenericAggregate aggr = aggregator.getGroup(ge, aggrIdx);
                aggregator.aggregate(aggr, ge);
                hdsOut.input.process(aggr);
                aggrIdx++;
            }
            hdsOut.endWindow();
            wid++;
        }

        hdsOut.beginWindow(wid);

        int pubId = 1;
        int adId = 2;
        int adUnit = 3;

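        // Issue one query per key subset: bits 0, 1 and 2 of i select pubId, adId and adUnit respectively.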
        for (int i = 0; i < 8; i++) {
            JSONObject keys = new JSONObject();
            if ((i & 0x1) != 0)
                keys.put("pubId", pubId);
            if ((i & 0x2) != 0)
                keys.put("adId", adId);
            if ((i & 0x4) != 0)
                keys.put("adUnit", adUnit);

            JSONObject query = new JSONObject();
            query.put("keys", keys);
            query.put("id", "query" + i);
            query.put("startTime", baseMinute - TimeUnit.MILLISECONDS.convert(20, TimeUnit.MINUTES));
            query.put("endTime", baseMinute);

            hdsOut.query.process(query.toString());
        }
        Assert.assertEquals("timeSeriesQueries " + hdsOut.rangeQueries, 8, hdsOut.rangeQueries.size());
        hdsOut.endWindow();

        Assert.assertEquals("queryResults " + queryResults.collectedTuples, 8, queryResults.collectedTuples.size());
        System.out.println("basetime " + baseMinute);

        for (HDSRangeQueryResult r : queryResults.collectedTuples) {
            Assert.assertEquals("result points " + r, Math.min(numMinutes, 20), r.data.size());
            for (Object o : r.data)
                System.out.println(o);
        }
    }
}