Java tutorial: Schedoscope Kafka export MapReduce tests (KafkaExportMRTest)
/**
 * Copyright 2016 Otto (GmbH & Co KG)
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.schedoscope.export.kafka;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableSet;
import com.lambdanow.avro.schema.MemorySchemaRegistry;
import com.lambdanow.avro.schema.SchemaRegistry;
import com.lambdanow.avro.serde.FingerprintSerdeGeneric;
import kafka.utils.ZKStringSerializer$;
import org.I0Itec.zkclient.ZkClient;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.curator.test.TestingServer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.schedoscope.export.BaseExportJob;
import org.schedoscope.export.HiveUnitBaseTest;
import org.schedoscope.export.kafka.avro.HCatToAvroSchemaConverter;
import org.schedoscope.export.kafka.options.CleanupPolicy;
import org.schedoscope.export.kafka.options.CompressionCodec;
import org.schedoscope.export.kafka.options.OutputEncoding;
import org.schedoscope.export.kafka.options.ProducerType;
import org.schedoscope.export.kafka.outputformat.KafkaOutputFormat;
import org.schedoscope.export.testsupport.EmbeddedKafkaCluster;
import org.schedoscope.export.testsupport.SimpleTestKafkaConsumer;

import java.util.ArrayList;
import java.util.Properties;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * MapReduce integration tests for the Schedoscope Kafka export: data is read from an
 * embedded Hive/HCatalog table, written to an embedded Kafka broker via
 * {@link KafkaOutputFormat}, then consumed again and verified.
 */
public class KafkaExportMRTest extends HiveUnitBaseTest {

    protected EmbeddedKafkaCluster kafka;
    protected TestingServer zkServer;
    protected SimpleTestKafkaConsumer kafkaConsumer;
    protected ZkClient zkClient;

    private static final String TEST_DATABASE = "default";
    private static final String TEST_TABLE = "test_table";
    private static final int TEST_SIZE = 10;

    @Override
    @Before
    public void setUp() throws Exception {
        super.setUp();

        // Start an in-process ZooKeeper, the embedded Kafka broker and a consumer
        // subscribed to the topic the export writes to (<database>_<table>).
        zkServer = new TestingServer(2182);
        zkServer.start();
        Thread.sleep(1000);
        zkClient = new ZkClient(zkServer.getConnectString(), 30000, 30000, ZKStringSerializer$.MODULE$);
        startKafkaServer();
        kafkaConsumer = new SimpleTestKafkaConsumer(TEST_DATABASE + "_" + TEST_TABLE,
                zkServer.getConnectString(), 10);
    }

    @Override
    @After
    public void tearDown() throws Exception {
        super.tearDown();
        kafkaConsumer.shutdown();
        stopKafkaServer();
        zkServer.close();
    }

    @Test
    public void testKafkaMapExportString() throws Exception {
        // Export a Hive table with a map column to Kafka as JSON strings.
        setUpHiveServer("src/test/resources/test_map_data.txt",
                "src/test/resources/test_map.hql", "test_map");

        Job job = Job.getInstance(conf);

        HCatToAvroSchemaConverter schemaConverter = new HCatToAvroSchemaConverter();
        Schema schema = schemaConverter.convertSchema(hcatInputSchema, "MyTable");
        AvroJob.setMapOutputValueSchema(job, schema);

        KafkaOutputFormat.setOutput(job.getConfiguration(), "localhost:9092", zkServer.getConnectString(),
                ProducerType.sync, CleanupPolicy.delete, "id", TEST_TABLE, TEST_DATABASE, 1, 1,
                CompressionCodec.gzip, OutputEncoding.string);

        job.setMapperClass(KafkaExportMapper.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(1);
        job.setInputFormatClass(HCatInputFormat.class);
        job.setOutputFormatClass(KafkaOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AvroValue.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AvroValue.class);

        assertTrue(job.waitForCompletion(true));

        ObjectMapper objMapper = new ObjectMapper();

        int counter = 0;
        for (byte[] message : kafkaConsumer) {
            counter++;
            String record = new String(message, Charsets.UTF_8);
            JsonNode data = objMapper.readTree(record);
            assertEquals("value1", data.get("created_by").asText());
        }
        assertEquals(TEST_SIZE, counter);
    }

    @Test
    public void testKafkaMapExportAvro() throws Exception {
        // Same export, but Avro-encoded; messages are decoded via a fingerprint-based schema registry.
        setUpHiveServer("src/test/resources/test_map_data.txt",
                "src/test/resources/test_map.hql", "test_map");

        Job job = Job.getInstance(conf);

        HCatToAvroSchemaConverter schemaConverter = new HCatToAvroSchemaConverter();
        Schema schema = schemaConverter.convertSchema(hcatInputSchema, "MyTable");
        AvroJob.setMapOutputValueSchema(job, schema);

        KafkaOutputFormat.setOutput(job.getConfiguration(), "localhost:9092", zkServer.getConnectString(),
                ProducerType.sync, CleanupPolicy.delete, "id", TEST_TABLE, TEST_DATABASE, 1, 1,
                CompressionCodec.gzip, OutputEncoding.avro);

        job.setMapperClass(KafkaExportMapper.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(1);
        job.setInputFormatClass(HCatInputFormat.class);
        job.setOutputFormatClass(KafkaOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AvroValue.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AvroValue.class);

        assertTrue(job.waitForCompletion(true));

        ObjectMapper objMapper = new ObjectMapper();
        SchemaRegistry registry = new MemorySchemaRegistry();
        registry.register(schema);
        FingerprintSerdeGeneric serde = new FingerprintSerdeGeneric(registry);

        int counter = 0;
        for (byte[] message : kafkaConsumer) {
            counter++;
            GenericRecord record = serde.fromBytes(message);
            JsonNode data = objMapper.readTree(record.toString());
            assertEquals("value1", data.get("created_by").asText());
        }
        assertEquals(TEST_SIZE, counter);
    }

    @Test
    public void testKafkaArrayStructExportAvro() throws Exception {
        // Avro export of a table containing an array-of-struct column.
        setUpHiveServer("src/test/resources/test_arraystruct_data.txt",
                "src/test/resources/test_arraystruct.hql", "test_arraystruct");

        Job job = Job.getInstance(conf);

        HCatToAvroSchemaConverter schemaConverter = new HCatToAvroSchemaConverter();
        Schema schema = schemaConverter.convertSchema(hcatInputSchema, "MyTable");
        AvroJob.setMapOutputValueSchema(job, schema);

        KafkaOutputFormat.setOutput(job.getConfiguration(), "localhost:9092", zkServer.getConnectString(),
                ProducerType.sync, CleanupPolicy.delete, "id", TEST_TABLE, TEST_DATABASE, 1, 1,
                CompressionCodec.gzip, OutputEncoding.avro);

        job.setMapperClass(KafkaExportMapper.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(1);
        job.setInputFormatClass(HCatInputFormat.class);
        job.setOutputFormatClass(KafkaOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AvroValue.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AvroValue.class);

        assertTrue(job.waitForCompletion(true));

        ObjectMapper objMapper = new ObjectMapper();
        SchemaRegistry registry = new MemorySchemaRegistry();
        registry.register(schema);
        FingerprintSerdeGeneric serde = new FingerprintSerdeGeneric(registry);

        int counter = 0;
        for (byte[] message : kafkaConsumer) {
            counter++;
            GenericRecord record = serde.fromBytes(message);
            JsonNode data = objMapper.readTree(record.toString());
            assertEquals("value1", data.get("created_by").asText());
        }
        assertEquals(TEST_SIZE, counter);
    }

    @Test
    public void testKafkaMapAnonymizedExport() throws Exception {
        // Export with selected fields anonymized (salted hash) before they reach Kafka.
        setUpHiveServer("src/test/resources/test_array_data.txt",
                "src/test/resources/test_array.hql", "test_array");

        String[] anonFields = new String[] { "month_id", "numcol1" };
        conf.setStrings(BaseExportJob.EXPORT_ANON_FIELDS, anonFields);
        conf.set(BaseExportJob.EXPORT_ANON_SALT, "vD75MqvaasIlCf7H");

        Job job = Job.getInstance(conf);

        HCatToAvroSchemaConverter schemaConverter = new HCatToAvroSchemaConverter(ImmutableSet.copyOf(anonFields));
        Schema schema = schemaConverter.convertSchema(hcatInputSchema, "MyTable");
        AvroJob.setMapOutputValueSchema(job, schema);

        KafkaOutputFormat.setOutput(job.getConfiguration(), "localhost:9092", zkServer.getConnectString(),
                ProducerType.sync, CleanupPolicy.delete, "id", TEST_TABLE, TEST_DATABASE, 1, 1,
                CompressionCodec.gzip, OutputEncoding.string);

        job.setMapperClass(KafkaExportMapper.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(1);
        job.setInputFormatClass(HCatInputFormat.class);
        job.setOutputFormatClass(KafkaOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AvroValue.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AvroValue.class);

        assertTrue(job.waitForCompletion(true));

        ObjectMapper objMapper = new ObjectMapper();

        int counter = 0;
        for (byte[] message : kafkaConsumer) {
            counter++;
            String record = new String(message, Charsets.UTF_8);
            JsonNode data = objMapper.readTree(record);
            assertEquals("bc5a56c463b81d13bbf8bc519cc17eb2", data.get("numcol1").asText());
            assertEquals(13, data.get("numcol2").asInt());
        }
        assertEquals(TEST_SIZE, counter);
    }

    private void startKafkaServer() throws Exception {
        // Boot the embedded Kafka broker on port 9092, backed by the in-process ZooKeeper.
        ArrayList<Integer> ports = new ArrayList<Integer>();
        ports.add(9092);
        kafka = new EmbeddedKafkaCluster(zkServer.getConnectString(), new Properties(), ports);
        kafka.startup();
        Thread.sleep(2000);
    }

    private void stopKafkaServer() throws Exception {
        kafka.shutdown();
        Thread.sleep(1000);
    }
}
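For reference, below is a minimal sketch (not part of the original test class) of how the same KafkaOutputFormat.setOutput call could be configured outside the test harness. The class name, broker and ZooKeeper addresses are invented for illustration, and reading the two int arguments as partition count and replication factor is an assumption; check the KafkaOutputFormat source of your Schedoscope version before relying on it. The remaining job wiring (mapper, input/output formats, key and value classes) would follow the tests above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.schedoscope.export.kafka.options.CleanupPolicy;
import org.schedoscope.export.kafka.options.CompressionCodec;
import org.schedoscope.export.kafka.options.OutputEncoding;
import org.schedoscope.export.kafka.options.ProducerType;
import org.schedoscope.export.kafka.outputformat.KafkaOutputFormat;

public class KafkaExportJobSketch {

    public static Job configureExportJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf);

        KafkaOutputFormat.setOutput(job.getConfiguration(),
                "broker1:9092,broker2:9092",  // Kafka broker list (the tests use "localhost:9092")
                "zookeeper:2181",             // ZooKeeper connect string
                ProducerType.sync,            // synchronous producer, as in the tests
                CleanupPolicy.delete,         // topic cleanup policy
                "id",                         // key column of the exported table
                "my_table",                   // table name
                "my_database",                // database name; the test consumer reads topic "<database>_<table>"
                1,                            // assumed: number of topic partitions
                1,                            // assumed: replication factor
                CompressionCodec.gzip,        // message compression
                OutputEncoding.string);       // JSON strings; use OutputEncoding.avro for Avro messages

        // Mapper, reducer, input/output formats and key/value classes are set exactly as in the tests above.
        return job;
    }
}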