co.cask.cdap.data.stream.StreamInputFormatTest.java Source code

Introduction

Here is the source code for co.cask.cdap.data.stream.StreamInputFormatTest.java, a set of JUnit tests covering CDAP's StreamInputFormat: TTL handling, time-range filtering, live (still open) stream files, event decoders, and record readers.

Source

/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package co.cask.cdap.data.stream;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.stream.GenericStreamEventData;
import co.cask.cdap.api.stream.StreamEventData;
import co.cask.cdap.api.stream.StreamEventDecoder;
import co.cask.cdap.data.stream.decoder.BytesStreamEventDecoder;
import co.cask.cdap.data.stream.decoder.IdentityStreamEventDecoder;
import co.cask.cdap.data.stream.decoder.StringStreamEventDecoder;
import co.cask.cdap.data.stream.decoder.TextStreamEventDecoder;
import co.cask.cdap.format.TextRecordFormat;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContextImpl;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.junit.Assert;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * Tests for {@link StreamInputFormat}.
 */
public class StreamInputFormatTest {

    @ClassRule
    public static TemporaryFolder tmpFolder = new TemporaryFolder();

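    // Fixed timestamp returned by TestStreamInputFormat#getCurrentTime(), so that the
    // TTL-based filtering exercised by the tests below is deterministic.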
    private static final long CURRENT_TIME = 2000;

    @Test
    public void testTTL() throws Exception {
        File inputDir = tmpFolder.newFolder();
        File outputDir = tmpFolder.newFolder();

        outputDir.delete();

        final long currentTime = CURRENT_TIME;
        final long ttl = 1500;

        // Write 500 events in one bucket under one partition, with timestamps 0..499 by 1.
        // All of these events are older than currentTime - ttl, so the whole partition should
        // be skipped once the TTL is applied.
        generateEvents(inputDir, 500, 0, 1, new GenerateEvent() {
            @Override
            public String generate(int index, long timestamp) {
                return "expiredEvent " + timestamp;
            }
        });

        // Write 1000 events in one bucket under a different partition, with timestamps 0..999 by 1
        generateEvents(inputDir, 1000, 0, 1, new GenerateEvent() {
            @Override
            public String generate(int index, long timestamp) {
                if (timestamp + ttl < currentTime) {
                    return "expiredEvent " + timestamp;
                } else {
                    return "nonExpiredEvent " + timestamp;
                }
            }
        });

        // Write 1000 events in one bucket under a different partition, with timestamps 1000..1999 by 1
        generateEvents(inputDir, 1000, 1000, 1, new GenerateEvent() {
            @Override
            public String generate(int index, long timestamp) {
                return "nonExpiredEvent " + timestamp;
            }
        });

        // Run MR with TTL = 1500, currentTime = CURRENT_TIME
        runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 2000, ttl);

        // Verify the result. It should contain 1500 "nonExpiredEvent {timestamp}" entries, one for each timestamp in 500..1999.
        Map<String, Integer> output = loadMRResult(outputDir);
        Assert.assertEquals(ttl + 1, output.size());
        Assert.assertEquals(null, output.get("expiredEvent"));
        Assert.assertEquals(ttl, output.get("nonExpiredEvent").intValue());
        for (long i = (currentTime - ttl); i < currentTime; i++) {
            Assert.assertEquals(1, output.get(Long.toString(i)).intValue());
        }
    }

    @Test
    public void testTTLMultipleEventsWithSameTimestamp() throws Exception {
        File inputDir = tmpFolder.newFolder();
        File outputDir = tmpFolder.newFolder();

        outputDir.delete();

        final long currentTime = CURRENT_TIME;
        final long ttl = 1;

        // Write 1000 events in one bucket under one partition, with timestamp currentTime - ttl - 1
        generateEvents(inputDir, 1000, currentTime - ttl - 1, 0, new GenerateEvent() {
            @Override
            public String generate(int index, long timestamp) {
                return "expiredEvent " + timestamp;
            }
        });

        // Write 1000 events in one bucket under a different partition, all with timestamp currentTime
        generateEvents(inputDir, 1000, currentTime, 0, new GenerateEvent() {
            @Override
            public String generate(int index, long timestamp) {
                return "nonExpiredEvent " + timestamp;
            }
        });

        runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 2000, ttl);

        // Verify the result. It should have 1000 "nonExpiredEvent {currentTime}".
        Map<String, Integer> output = loadMRResult(outputDir);
        Assert.assertEquals(2, output.size());
        Assert.assertEquals(null, output.get("expiredEvent"));
        Assert.assertEquals(1000, output.get("nonExpiredEvent").intValue());
        Assert.assertEquals(1000, output.get(Long.toString(currentTime)).intValue());
    }

    @Test
    public void testAllEvents() throws Exception {
        // Write 1000 events in one bucket under one partition.
        File inputDir = tmpFolder.newFolder();
        File outputDir = tmpFolder.newFolder();

        outputDir.delete();

        generateEvents(inputDir);
        runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 1000, Long.MAX_VALUE);

        // Verify the result. It should have 1000 "testing", and 100 for each integers in 0..9.
        Map<String, Integer> output = loadMRResult(outputDir);
        Assert.assertEquals(11, output.size());
        Assert.assertEquals(1000, output.get("Testing").intValue());
        for (int i = 0; i < 10; i++) {
            Assert.assertEquals(100, output.get(Integer.toString(i)).intValue());
        }
    }

    @Test
    public void testTimeRange() throws Exception {
        // Write 1000 events in one bucket under one partition.
        File inputDir = tmpFolder.newFolder();
        File outputDir = tmpFolder.newFolder();

        outputDir.delete();

        generateEvents(inputDir);
        // Run a MapReduce on 1 timestamp only.
        runMR(inputDir, outputDir, 1401, 1402, 1000, Long.MAX_VALUE);

        // Verify the result. It should have 1 "testing", and 1 "1".
        Map<String, Integer> output = loadMRResult(outputDir);
        Assert.assertEquals(2, output.size());
        Assert.assertEquals(1, output.get("Testing").intValue());
        Assert.assertEquals(1, output.get("1").intValue());
    }

    @Test
    public void testLiveStream() throws Exception {
        File inputDir = tmpFolder.newFolder();
        File outputDir = tmpFolder.newFolder();

        outputDir.delete();

        // Write 2 events, and keep the writer open
        File partition = new File(inputDir, "0.1000");
        File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
        File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());

        partition.mkdirs();

        StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
                Files.newOutputStreamSupplier(indexFile), 100L);

        writer.append(StreamFileTestUtils.createEvent(0, "Testing 0"));
        writer.append(StreamFileTestUtils.createEvent(1, "Testing 1"));

        writer.flush();

        // Run MapReduce to process all data.
        runMR(inputDir, outputDir, 0, Long.MAX_VALUE, 1000, Long.MAX_VALUE);
        Map<String, Integer> output = loadMRResult(outputDir);

        Assert.assertEquals(3, output.size());
        Assert.assertEquals(2, output.get("Testing").intValue());
        Assert.assertEquals(1, output.get("0").intValue());
        Assert.assertEquals(1, output.get("1").intValue());
    }

    @Test
    public void testIdentityStreamEventDecoder() {
        ImmutableMap.Builder<String, String> headers = ImmutableMap.builder();
        headers.put("key1", "value1");
        headers.put("key2", "value2");
        ByteBuffer buffer = Charsets.UTF_8.encode("testdata");
        StreamEvent event = new StreamEvent(headers.build(), buffer, System.currentTimeMillis());
        StreamEventDecoder<LongWritable, StreamEvent> decoder = new IdentityStreamEventDecoder();
        StreamEventDecoder.DecodeResult<LongWritable, StreamEvent> result = new StreamEventDecoder.DecodeResult<>();
        result = decoder.decode(event, result);
        Assert.assertEquals(new LongWritable(event.getTimestamp()), result.getKey());
        Assert.assertEquals(event, result.getValue());
    }

    @Test
    public void testStringStreamEventDecoder() {
        String body = "Testing";
        StreamEvent event = new StreamEvent(ImmutableMap.<String, String>of(), Charsets.UTF_8.encode(body));
        StreamEventDecoder<LongWritable, String> decoder = new StringStreamEventDecoder();
        StreamEventDecoder.DecodeResult<LongWritable, String> result = new StreamEventDecoder.DecodeResult<>();
        result = decoder.decode(event, result);

        Assert.assertEquals(event.getTimestamp(), result.getKey().get());
        Assert.assertEquals(body, result.getValue());
    }

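    // The decoder class is inferred from the mapper's value type: BytesWritable, Text and
    // String map to their dedicated decoders, while StreamEvent and StreamEventData fall
    // back to the identity decoder.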
    @Test
    public void testStreamDecoderInference() {
        Configuration conf = new Configuration();
        StreamInputFormat.inferDecoderClass(conf, BytesWritable.class);
        Assert.assertEquals(BytesStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
        StreamInputFormat.inferDecoderClass(conf, Text.class);
        Assert.assertEquals(TextStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
        StreamInputFormat.inferDecoderClass(conf, String.class);
        Assert.assertEquals(StringStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
        StreamInputFormat.inferDecoderClass(conf, StreamEvent.class);
        Assert.assertEquals(IdentityStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
        StreamInputFormat.inferDecoderClass(conf, StreamEventData.class);
        Assert.assertEquals(IdentityStreamEventDecoder.class, StreamInputFormat.getDecoderClass(conf));
    }

    @Test
    public void testStreamRecordReader() throws Exception {
        File inputDir = tmpFolder.newFolder();
        File partition = new File(inputDir, "1.1000");
        partition.mkdirs();
        File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
        File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());

        // write 1 event
        StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
                Files.newOutputStreamSupplier(indexFile), 100L);
        writer.append(StreamFileTestUtils.createEvent(1000, "test"));
        writer.flush();

        // Get splits from the input format. Expect 2 splits: one covering offset 0 up to some
        // offset, and one covering that offset up to Long.MAX_VALUE.
        Configuration conf = new Configuration();
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        StreamInputFormat.setStreamPath(conf, inputDir.toURI());
        StreamInputFormat format = new StreamInputFormat();
        List<InputSplit> splits = format.getSplits(new JobContextImpl(new JobConf(conf), new JobID()));
        Assert.assertEquals(2, splits.size());

        // write another event so that the 2nd split has something to read
        writer.append(StreamFileTestUtils.createEvent(1001, "test"));
        writer.close();

        // create a record reader for the 2nd split
        StreamRecordReader<LongWritable, StreamEvent> recordReader = new StreamRecordReader<>(
                new IdentityStreamEventDecoder());
        recordReader.initialize(splits.get(1), context);

        // check that we read the 2nd stream event
        Assert.assertTrue(recordReader.nextKeyValue());
        StreamEvent output = recordReader.getCurrentValue();
        Assert.assertEquals(1001, output.getTimestamp());
        Assert.assertEquals("test", Bytes.toString(output.getBody()));
        // check that there is nothing more to read
        Assert.assertFalse(recordReader.nextKeyValue());
    }

    @Test
    public void testFormatStreamRecordReader() throws IOException, InterruptedException {
        File inputDir = tmpFolder.newFolder();
        File partition = new File(inputDir, "1.1000");
        partition.mkdirs();
        File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
        File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());

        // write 1 event
        StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
                Files.newOutputStreamSupplier(indexFile), 100L);

        StreamEvent streamEvent = new StreamEvent(ImmutableMap.of("header1", "value1", "header2", "value2"),
                Charsets.UTF_8.encode("hello world"), 1000);
        writer.append(streamEvent);
        writer.close();

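        // Describe the event body as a single string field named "body", parsed with TextRecordFormat.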
        FormatSpecification formatSpec = new FormatSpecification(TextRecordFormat.class.getName(),
                Schema.recordOf("event", Schema.Field.of("body", Schema.of(Schema.Type.STRING))),
                Collections.<String, String>emptyMap());
        Configuration conf = new Configuration();
        StreamInputFormat.setBodyFormatSpecification(conf, formatSpec);
        StreamInputFormat.setStreamPath(conf, inputDir.toURI());
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        StreamInputFormat format = new StreamInputFormat();

        // read all splits and store the results in the list
        List<GenericStreamEventData<StructuredRecord>> recordsRead = Lists.newArrayList();
        List<InputSplit> inputSplits = format.getSplits(context);
        for (InputSplit split : inputSplits) {
            RecordReader<LongWritable, GenericStreamEventData<StructuredRecord>> recordReader = format
                    .createRecordReader(split, context);
            recordReader.initialize(split, context);
            while (recordReader.nextKeyValue()) {
                recordsRead.add(recordReader.getCurrentValue());
            }
        }

        // should only have read 1 record
        Assert.assertEquals(1, recordsRead.size());
        GenericStreamEventData<StructuredRecord> eventData = recordsRead.get(0);
        Assert.assertEquals(streamEvent.getHeaders(), eventData.getHeaders());
        Assert.assertEquals("hello world", eventData.getBody().get("body"));
    }

    private void generateEvents(File inputDir, int numEvents, long startTime, long timeIncrement,
            GenerateEvent generator) throws IOException {
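        // Events go into a partition directory named after the start time (in seconds),
        // holding a single bucket's event and index files.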
        File partition = new File(inputDir, Long.toString(startTime / 1000) + ".1000");
        File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
        File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());

        partition.mkdirs();

        StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
                Files.newOutputStreamSupplier(indexFile), 100L);
        // Write numEvents events, advancing the timestamp by timeIncrement for each one
        for (int i = 0; i < numEvents; i++) {
            long timestamp = startTime + i * timeIncrement;
            writer.append(StreamFileTestUtils.createEvent(timestamp, generator.generate(i, timestamp)));
        }

        writer.close();
    }

    private void generateEvents(File inputDir) throws IOException {
        generateEvents(inputDir, 1000, 1000, 1, new GenerateEvent() {
            @Override
            public String generate(int index, long timestamp) {
                return "Testing " + (index % 10);
            }
        });
    }

    private void runMR(File inputDir, File outputDir, long startTime, long endTime, long splitSize, long ttl)
            throws Exception {

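        // Configure a word-count style job over the stream directory, restricted by the given
        // time range, TTL and maximum split size, and driven by TestStreamInputFormat.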
        Job job = Job.getInstance();
        Configuration conf = job.getConfiguration();

        StreamInputFormat.setTTL(conf, ttl);
        StreamInputFormat.setStreamPath(conf, inputDir.toURI());
        StreamInputFormat.setTimeRange(conf, startTime, endTime);
        StreamInputFormat.setMaxSplitSize(conf, splitSize);
        job.setInputFormatClass(TestStreamInputFormat.class);

        TextOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setJarByClass(StreamInputFormatTest.class);
        job.setMapperClass(TokenizeMapper.class);
        job.setReducerClass(AggregateReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }

    private Map<String, Integer> loadMRResult(File outputDir) throws IOException {
        Map<String, Integer> output = Maps.newTreeMap();
        BufferedReader reader = Files.newReader(new File(outputDir, "part-r-00000"), Charsets.UTF_8);
        try {
            String line = reader.readLine();
            while (line != null) {
                int idx = line.indexOf('\t');
                output.put(line.substring(0, idx), Integer.parseInt(line.substring(idx + 1)));
                line = reader.readLine();
            }
        } finally {
            reader.close();
        }
        return output;
    }

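    /**
     * Generates the body of the event at the given index and timestamp.
     */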
    private interface GenerateEvent {
        String generate(int index, long timestamp);
    }

    /**
     * StreamInputFormat for testing.
     */
    private static final class TestStreamInputFormat extends StreamInputFormat<LongWritable, Text> {

        @Override
        protected StreamEventDecoder<LongWritable, Text> createStreamEventDecoder(Configuration conf) {
            return new TextStreamEventDecoder();
        }

        @Override
        protected long getCurrentTime() {
            return CURRENT_TIME;
        }
    }

    /**
     * Mapper for testing.
     */
    public static final class TokenizeMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    /**
     * Reducer for testing.
     */
    public static final class AggregateReducer extends Reducer<Text, IntWritable, Text, LongWritable> {

        private final LongWritable result = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
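
The configuration calls exercised by runMR() above are the same ones an application job would use to read stream files directly. Below is a minimal sketch, not part of the original source, of a word-count job wired the same way; the stream and output paths are placeholders, and it assumes the base StreamInputFormat honours the decoder configured via inferDecoderClass (the test itself uses the TestStreamInputFormat subclass to supply a TextStreamEventDecoder).

// Minimal sketch; placed in the same package as the test purely for illustration so that
// StreamInputFormat and the test's public mapper/reducer classes resolve without extra imports.
package co.cask.cdap.data.stream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.net.URI;

public class StreamWordCountSketch {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        Configuration conf = job.getConfiguration();

        long now = System.currentTimeMillis();
        StreamInputFormat.setStreamPath(conf, URI.create("file:///tmp/streams/events")); // hypothetical stream dir
        StreamInputFormat.setTimeRange(conf, now - 3600 * 1000L, now);                   // read only the last hour
        StreamInputFormat.setTTL(conf, Long.MAX_VALUE);                                  // no TTL-based skipping
        StreamInputFormat.setMaxSplitSize(conf, 128L * 1024 * 1024);                     // cap split sizes
        StreamInputFormat.inferDecoderClass(conf, Text.class);                           // decode event bodies as Text

        job.setInputFormatClass(StreamInputFormat.class);
        job.setJarByClass(StreamWordCountSketch.class);
        job.setMapperClass(StreamInputFormatTest.TokenizeMapper.class);
        job.setReducerClass(StreamInputFormatTest.AggregateReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        TextOutputFormat.setOutputPath(job, new Path("/tmp/stream-wordcount")); // hypothetical output dir
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);
    }
}

As in the tests above, the TextOutputFormat output directory must not already exist when the job runs.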