Java tutorial: TestConnectorUtil, unit tests for the Greenplum gphdfs connector (MiniDFSCluster, Avro, Parquet)
/*-------------------------------------------------------------------------
 *
 * TestConnectorUtil
 *
 * Copyright (c) 2011 EMC Corporation All Rights Reserved
 *
 * This software is protected, without limitation, by copyright law
 * and international treaties. Use of this software and the intellectual
 * property contained therein is expressly limited to the terms and
 * conditions of the License Agreement under which it is provided by
 * or on behalf of EMC.
 *
 *-------------------------------------------------------------------------
 */
package com.emc.greenplum.gpdb.hdfsconnector;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.conf.Configuration;

import static org.junit.Assert.*;

import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import com.emc.greenplum.gpdb.hadoop.formathandler.AvroFileReader;
import com.emc.greenplum.gpdb.hadoop.formathandler.AvroFileWriter;
import com.emc.greenplum.gpdb.hadoop.formathandler.GpdbParquetFileReader;
import com.emc.greenplum.gpdb.hadoop.formathandler.GpdbParquetFileWriter;
import com.emc.greenplum.gpdb.hadoop.io.GPDBWritable;
import com.emc.greenplum.gpdb.hadoop.io.GPDBWritable.TypeMismatchException;

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

public class TestConnectorUtil {
    private static MiniDFSCluster cluster;

    /*
     * setup the cluster and upload test files
     */
    @BeforeClass
    public static void setupBeforeClass() throws IllegalArgumentException, IOException {
        final Configuration conf = new Configuration();
        cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).format(true).build();
        cluster.getFileSystem().mkdirs(new Path("/tmp"));

        int hadoopPort = cluster.getNameNodePort();
        String tmpDir = "hdfs://127.0.0.1:" + hadoopPort + "/tmp";

        // decimal.pq : file generated by hive using parquet format, contains two
        // columns of decimal
        Path decimal = new Path(tmpDir + "/decimal.pq");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/decimal.pq"), decimal);

        // alertlog.avro : file contains unicode text
        Path alertlog = new Path(tmpDir + "/alertlog.avro");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/alertlog.avro"), alertlog);

        // short.avro : file contains only one short column
        Path shortAvro = new Path(tmpDir + "/short.avro");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/short.avro"), shortAvro);

        // short.parquet : file contains one line of (smallint, int, bigint)
        Path shortParquet = new Path(tmpDir + "/short.parquet");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/short.parquet"), shortParquet);
    }

    /*
     * shutdown the cluster
     */
    @AfterClass
    public static void teardownAfterClass() throws Exception {
        if (cluster != null) {
            cluster.shutdown();
        }
    }

    /*
     * test set for fs.defaultFS
     */
    @Test
    public void test_should_able_to_connect_to_hdfs() throws URISyntaxException, IOException {
        Configuration conf = new Configuration();
        URI inputURI = new URI("gphdfs://localhost:9000/test.txt");

        ConnectorUtil.setHadoopFSURI(conf, inputURI, "cdh4.1");

        assertEquals("hdfs://localhost:9000", conf.get("fs.defaultFS"));
    }

    /*
     * make sure all the test files are already in hadoop
     */
    @Test
    public void test_list_file() throws FileNotFoundException, IllegalArgumentException, IOException {
        ArrayList<DataNode> dns = cluster.getDataNodes();
        assertEquals(dns.size(), 1);

        int fileNum = 0;
        RemoteIterator<LocatedFileStatus> fsIterator = cluster.getFileSystem().listFiles(new Path("/"), true);
        while (fsIterator.hasNext()) {
            fileNum++;
            System.out.println(fsIterator.next().getPath());
        }

        assertEquals(fileNum, 4);
    }

    /*
     * test avro insert null (smallint, bytea)
     */
    @Test
    public void test_avro_null() {
        AvroFileWriter aWriter = new AvroFileWriter();
        GPDBWritable gWritable = new GPDBWritable(new int[] { GPDBWritable.SMALLINT, GPDBWritable.BYTEA });

        try {
            String avroSchema = "{\"type\":\"record\",\"name\":\"test\",\"fields\":["
                    + "{\"name\":\"c1\",\"type\":[\"int\",\"null\"]},"
                    + "{\"name\":\"c2\",\"type\":[\"bytes\",\"null\"]}]}";
            Schema schema = new Schema.Parser().parse(avroSchema);
            GenericRecord record = new GenericData.Record(schema);

            gWritable.setShort(0, null);
            gWritable.setBytes(1, null);

            aWriter.fillRecord(record, gWritable, schema);

            assertEquals(record.get(0), null);
            assertEquals(record.get(1), null);
        } catch (TypeMismatchException e) {
            fail(e.getMessage());
        } catch (IOException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test parquet insert null
     */
    @Test
    public void test_parquet_null() {
        MessageType schema = MessageTypeParser.parseMessageType("message test { "
                + "required int32 i32; "
                + "required boolean boo; "
                + "} ");
        GpdbParquetFileWriter gWriter = new GpdbParquetFileWriter();
        SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
        Group pqGroup = groupFactory.newGroup();

        GPDBWritable gWritable = new GPDBWritable(new int[] { GPDBWritable.INTEGER, GPDBWritable.BOOLEAN });

        try {
            gWritable.setInt(0, null);
            gWritable.setBoolean(1, true);

            gWriter.fillRecord(pqGroup, gWritable, schema);

            assertEquals(pqGroup.getFieldRepetitionCount(0), 0);
            assertEquals(pqGroup.getBoolean(1, 0), true);
        } catch (TypeMismatchException e) {
            fail(e.getMessage());
        } catch (IOException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test unicode support
     */
    @Test
    public void test_unicode() {
        int hadoopPort = cluster.getNameNodePort();

        Configuration conf = new Configuration();
        conf.addResource("hdfs-site.xml");

        try {
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/alertlog.avro");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");

            ByteArrayOutputStream bout = new ByteArrayOutputStream();
            AvroFileReader aReader = new AvroFileReader(conf, 0, 1, uri.getPath(), null, null, false, false, bout);
            aReader.readAvroFormat();

            byte[] barray = bout.toByteArray();

            int line = 0;
            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            while (din.available() != 0) {
                GPDBWritable writable = new GPDBWritable();
                writable.readFields(din);
                line++;
            }

            assertEquals(line, 1943);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test parquet short support
     * schema for short.parquet
     * message parquet5w
     * {
     *   optional int32 c1 (INT_16);
     *   optional int32 c2;
     *   optional int64 c3;
     * }
     */
    @Test
    public void test_parquet_short() {
        int hadoopPort = cluster.getNameNodePort();

        Configuration conf = new Configuration();
        conf.addResource("hdfs-site.xml");

        try {
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/short.parquet");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");

            ByteArrayOutputStream bout = new ByteArrayOutputStream();

            List<ColumnSchema> columns = new ArrayList<ColumnSchema>();
            columns.add(new ColumnSchema("c1", GPDBWritable.SMALLINT, 1, 0, ','));
            columns.add(new ColumnSchema("c2", GPDBWritable.SMALLINT, 1, 0, ','));
            columns.add(new ColumnSchema("c3", GPDBWritable.BIGINT, 1, 0, ','));

            GpdbParquetFileReader pReader = new GpdbParquetFileReader(conf, 0, 1, uri.getPath(), columns, false, false, bout);
            pReader.readParquetFormat();

            byte[] barray = bout.toByteArray();

            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            GPDBWritable writable = new GPDBWritable();
            writable.readFields(din);

            short c1 = writable.getShort(0);
            assertEquals(c1, 123);

            short c2 = writable.getShort(1);
            assertEquals(c2, 1234);

            long c3 = writable.getLong(2);
            assertEquals(c3, 12345);

            assertEquals(din.available(), 0);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test avro short support
     */
    @Test
    public void test_avro_short() {
        int hadoopPort = cluster.getNameNodePort();

        Configuration conf = new Configuration();
        conf.addResource("hdfs-site.xml");

        try {
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/short.avro");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");

            ByteArrayOutputStream bout = new ByteArrayOutputStream();

            List<ColumnSchema> columns = new ArrayList<ColumnSchema>();
            columns.add(new ColumnSchema("c1", GPDBWritable.SMALLINT, 1, 1, ','));

            AvroFileReader aReader = new AvroFileReader(conf, 0, 1, uri.getPath(), columns, null, false, false, bout);
            aReader.readAvroFormat();

            byte[] barray = bout.toByteArray();

            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            GPDBWritable writable = new GPDBWritable();
            writable.readFields(din);

            short c1 = writable.getShort(0);
            assertEquals(c1, 123);

            assertEquals(din.available(), 0);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test support for decimal in parquet file generated by hive
     */
    @Test
    public void test_hive_parquet_decimal() {
        Configuration conf = new Configuration();

        try {
            int hadoopPort = cluster.getNameNodePort();
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/decimal.pq");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");
            String inputPath = uri.getPath();

            ByteArrayOutputStream bout = new ByteArrayOutputStream();
            GpdbParquetFileReader pReader = new GpdbParquetFileReader(conf, 0, 1, inputPath, null, false, false, bout);
            pReader.readParquetFormat();

            byte[] barray = bout.toByteArray();

            // this is the expected byte stream that gphdfs writes to gpdb
            byte[] expect = { 0, 0, 0, 32, 0, 1, 0, 2, 7, 7, 0, 0, 0, 0, 0, 7,
                    49, 50, 51, 46, 50, 49, 0, 0, 0, 0, 0, 4, 51, 46, 49, 0 };

            for (int i = 0; i < barray.length; i++) {
                assertEquals(barray[i], expect[i]);
            }

            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            GPDBWritable writable = new GPDBWritable();
            writable.readFields(din);

            String c1 = writable.getString(0);
            String c2 = writable.getString(1);

            assertEquals(c1, "123.21");
            assertEquals(c2, "3.1");

            assertEquals(din.available(), 0);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    //@Test
    public void test_should_able_to_connect_to_hdfs_with_ha() throws URISyntaxException {
        Configuration conf = new Configuration();
        URI inputURI = new URI("gphdfs://nameservice1/test.txt");

        ConnectorUtil.setHadoopFSURI(conf, inputURI, "cdh4.1");

        assertEquals("hdfs://nameservice1", conf.get("fs.defaultFS"));
    }
}
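Most of the tests above share one pattern: a file reader serializes GPDBWritable rows into a ByteArrayOutputStream, and the assertions deserialize them back with readFields(DataInput). The standalone sketch below isolates that round trip. It is not part of the original suite and assumes GPDBWritable follows Hadoop's Writable contract, i.e. that it exposes a write(DataOutput) counterpart to the readFields call used in the tests; the setters and getters are the same ones the tests already exercise.

// Minimal round-trip sketch (assumption: GPDBWritable implements Hadoop's Writable,
// so write(DataOutput) is available alongside readFields(DataInput)).
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import com.emc.greenplum.gpdb.hadoop.io.GPDBWritable;

public class GPDBWritableRoundTripSketch {
    public static void main(String[] args) throws Exception {
        // build a one-column SMALLINT row, matching the column used in test_avro_short
        GPDBWritable row = new GPDBWritable(new int[] { GPDBWritable.SMALLINT });
        row.setShort(0, (short) 123);

        // serialize into an in-memory stream, mirroring the bytes the file readers
        // emit into their ByteArrayOutputStream
        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        row.write(new DataOutputStream(bout));   // assumed Writable.write

        // deserialize the same way the test assertions do
        DataInputStream din = new DataInputStream(new ByteArrayInputStream(bout.toByteArray()));
        GPDBWritable copy = new GPDBWritable();
        copy.readFields(din);

        System.out.println(copy.getShort(0));    // expected: 123
    }
}

If write(DataOutput) is not accessible in your version of the connector, the equivalent check is exactly what the existing tests do: let an AvroFileReader or GpdbParquetFileReader produce the byte stream and only deserialize it with readFields.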