Java tutorial: TestConnectorUtil, unit tests for the Greenplum gphdfs connector (MiniDFSCluster, Avro, Parquet)
/*-------------------------------------------------------------------------
 *
 * TestConnectorUtil
 *
 * Copyright (c) 2011 EMC Corporation All Rights Reserved
 *
 * This software is protected, without limitation, by copyright law
 * and international treaties. Use of this software and the intellectual
 * property contained therein is expressly limited to the terms and
 * conditions of the License Agreement under which it is provided by
 * or on behalf of EMC.
 *
 *-------------------------------------------------------------------------
 */
package com.emc.greenplum.gpdb.hdfsconnector;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.conf.Configuration;

import static org.junit.Assert.*;

import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import com.emc.greenplum.gpdb.hadoop.formathandler.AvroFileReader;
import com.emc.greenplum.gpdb.hadoop.formathandler.AvroFileWriter;
import com.emc.greenplum.gpdb.hadoop.formathandler.GpdbParquetFileReader;
import com.emc.greenplum.gpdb.hadoop.formathandler.GpdbParquetFileWriter;
import com.emc.greenplum.gpdb.hadoop.io.GPDBWritable;
import com.emc.greenplum.gpdb.hadoop.io.GPDBWritable.TypeMismatchException;

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

public class TestConnectorUtil {
    private static MiniDFSCluster cluster;

    /*
     * setup the cluster and upload test files
     */
    @BeforeClass
    public static void setupBeforeClass() throws IllegalArgumentException, IOException {
        final Configuration conf = new Configuration();
        cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).format(true).build();
        cluster.getFileSystem().mkdirs(new Path("/tmp"));

        int hadoopPort = cluster.getNameNodePort();
        String tmpDir = "hdfs://127.0.0.1:" + hadoopPort + "/tmp";

        // decimal.pq : file generated by hive using parquet format, contains two
        // columns of decimal
        Path decimal = new Path(tmpDir + "/decimal.pq");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/decimal.pq"), decimal);

        // alertlog.avro : file contains unicode text
        Path alertlog = new Path(tmpDir + "/alertlog.avro");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/alertlog.avro"), alertlog);

        // short.avro : file contains only one short column
        Path shortAvro = new Path(tmpDir + "/short.avro");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/short.avro"), shortAvro);

        // short.parquet : file contains one line of (smallint, int, bigint)
        Path shortParquet = new Path(tmpDir + "/short.parquet");
        cluster.getFileSystem().copyFromLocalFile(
                new Path((new File("")).getAbsolutePath() + "/src/test/data/short.parquet"), shortParquet);
    }

    /*
     * shutdown the cluster
     */
    @AfterClass
    public static void teardownAfterClass() throws Exception {
        if (cluster != null) {
            cluster.shutdown();
        }
    }

    /*
     * test set for fs.defaultFS
     */
    @Test
    public void test_should_able_to_connect_to_hdfs() throws URISyntaxException, IOException {
        Configuration conf = new Configuration();
        URI inputURI = new URI("gphdfs://localhost:9000/test.txt");

        ConnectorUtil.setHadoopFSURI(conf, inputURI, "cdh4.1");

        assertEquals("hdfs://localhost:9000", conf.get("fs.defaultFS"));
    }

    /*
     * make sure all the test files are already in hadoop
     */
    @Test
    public void test_list_file() throws FileNotFoundException, IllegalArgumentException, IOException {
        ArrayList<DataNode> dns = cluster.getDataNodes();
        assertEquals(dns.size(), 1);

        int fileNum = 0;
        RemoteIterator<LocatedFileStatus> fsIterator = cluster.getFileSystem().listFiles(new Path("/"), true);
        while (fsIterator.hasNext()) {
            fileNum++;
            System.out.println(fsIterator.next().getPath());
        }

        assertEquals(fileNum, 4);
    }

    /*
     * test avro insert null (smallint, bytea)
     */
    @Test
    public void test_avro_null() {
        AvroFileWriter aWriter = new AvroFileWriter();
        GPDBWritable gWritable = new GPDBWritable(new int[] { GPDBWritable.SMALLINT, GPDBWritable.BYTEA });

        try {
            String avroSchema = "{\"type\":\"record\",\"name\":\"test\",\"fields\":["
                    + "{\"name\":\"c1\",\"type\":[\"int\",\"null\"]},"
                    + "{\"name\":\"c2\",\"type\":[\"bytes\",\"null\"]}]}";
            Schema schema = new Schema.Parser().parse(avroSchema);
            GenericRecord record = new GenericData.Record(schema);

            gWritable.setShort(0, null);
            gWritable.setBytes(1, null);

            aWriter.fillRecord(record, gWritable, schema);

            assertEquals(record.get(0), null);
            assertEquals(record.get(1), null);
        } catch (TypeMismatchException e) {
            fail(e.getMessage());
        } catch (IOException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test parquet insert null
     */
    @Test
    public void test_parquet_null() {
        MessageType schema = MessageTypeParser.parseMessageType("message test { "
                + "required int32 i32; "
                + "required boolean boo; "
                + "} ");
        GpdbParquetFileWriter gWriter = new GpdbParquetFileWriter();
        SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
        Group pqGroup = groupFactory.newGroup();

        GPDBWritable gWritable = new GPDBWritable(new int[] { GPDBWritable.INTEGER, GPDBWritable.BOOLEAN });

        try {
            gWritable.setInt(0, null);
            gWritable.setBoolean(1, true);

            gWriter.fillRecord(pqGroup, gWritable, schema);

            assertEquals(pqGroup.getFieldRepetitionCount(0), 0);
            assertEquals(pqGroup.getBoolean(1, 0), true);
        } catch (TypeMismatchException e) {
            fail(e.getMessage());
        } catch (IOException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test unicode support
     */
    @Test
    public void test_unicode() {
        int hadoopPort = cluster.getNameNodePort();

        Configuration conf = new Configuration();
        conf.addResource("hdfs-site.xml");

        try {
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/alertlog.avro");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");

            ByteArrayOutputStream bout = new ByteArrayOutputStream();
            AvroFileReader aReader = new AvroFileReader(conf, 0, 1, uri.getPath(), null, null, false, false, bout);
            aReader.readAvroFormat();

            byte[] barray = bout.toByteArray();

            int line = 0;
            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            while (din.available() != 0) {
                GPDBWritable writable = new GPDBWritable();
                writable.readFields(din);
                line++;
            }

            assertEquals(line, 1943);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test parquet short support
     * schema for short.parquet
     * message parquet5w
     * {
     *   optional int32 c1 (INT_16);
     *   optional int32 c2;
     *   optional int64 c3;
     * }
     */
    @Test
    public void test_parquet_short() {
        int hadoopPort = cluster.getNameNodePort();

        Configuration conf = new Configuration();
        conf.addResource("hdfs-site.xml");

        try {
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/short.parquet");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");

            ByteArrayOutputStream bout = new ByteArrayOutputStream();

            List<ColumnSchema> columns = new ArrayList<ColumnSchema>();
            columns.add(new ColumnSchema("c1", GPDBWritable.SMALLINT, 1, 0, ','));
            columns.add(new ColumnSchema("c2", GPDBWritable.SMALLINT, 1, 0, ','));
            columns.add(new ColumnSchema("c3", GPDBWritable.BIGINT, 1, 0, ','));

            GpdbParquetFileReader pReader = new GpdbParquetFileReader(conf, 0, 1, uri.getPath(), columns, false, false, bout);
            pReader.readParquetFormat();

            byte[] barray = bout.toByteArray();

            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            GPDBWritable writable = new GPDBWritable();
            writable.readFields(din);

            short c1 = writable.getShort(0);
            assertEquals(c1, 123);

            short c2 = writable.getShort(1);
            assertEquals(c2, 1234);

            long c3 = writable.getLong(2);
            assertEquals(c3, 12345);

            assertEquals(din.available(), 0);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test avro short support
     */
    @Test
    public void test_avro_short() {
        int hadoopPort = cluster.getNameNodePort();

        Configuration conf = new Configuration();
        conf.addResource("hdfs-site.xml");

        try {
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/short.avro");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");

            ByteArrayOutputStream bout = new ByteArrayOutputStream();

            List<ColumnSchema> columns = new ArrayList<ColumnSchema>();
            columns.add(new ColumnSchema("c1", GPDBWritable.SMALLINT, 1, 1, ','));

            AvroFileReader aReader = new AvroFileReader(conf, 0, 1, uri.getPath(), columns, null, false, false, bout);
            aReader.readAvroFormat();

            byte[] barray = bout.toByteArray();

            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            GPDBWritable writable = new GPDBWritable();
            writable.readFields(din);

            short c1 = writable.getShort(0);
            assertEquals(c1, 123);

            assertEquals(din.available(), 0);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    /*
     * test support for decimal in parquet file generated by hive
     */
    @Test
    public void test_hive_parquet_decimal() {
        Configuration conf = new Configuration();

        try {
            int hadoopPort = cluster.getNameNodePort();
            URI uri = new URI("gphdfs://localhost:" + hadoopPort + "/tmp/decimal.pq");
            ConnectorUtil.setHadoopFSURI(conf, uri, "gphdfs");
            String inputPath = uri.getPath();

            ByteArrayOutputStream bout = new ByteArrayOutputStream();
            GpdbParquetFileReader pReader = new GpdbParquetFileReader(conf, 0, 1, inputPath, null, false, false, bout);
            pReader.readParquetFormat();

            byte[] barray = bout.toByteArray();

            // this is the expected byte stream that gphdfs writes to gpdb
            byte[] expect = { 0, 0, 0, 32, 0, 1, 0, 2, 7, 7, 0, 0, 0, 0, 0, 7,
                    49, 50, 51, 46, 50, 49, 0, 0, 0, 0, 0, 4, 51, 46, 49, 0 };

            for (int i = 0; i < barray.length; i++) {
                assertEquals(barray[i], expect[i]);
            }

            DataInputStream din = new DataInputStream(new ByteArrayInputStream(barray));
            GPDBWritable writable = new GPDBWritable();
            writable.readFields(din);

            String c1 = writable.getString(0);
            String c2 = writable.getString(1);

            assertEquals(c1, "123.21");
            assertEquals(c2, "3.1");

            assertEquals(din.available(), 0);
        } catch (IOException e) {
            fail(e.getMessage());
        } catch (URISyntaxException e) {
            fail(e.getMessage());
        }
    }

    //@Test
    public void test_should_able_to_connect_to_hdfs_with_ha() throws URISyntaxException {
        Configuration conf = new Configuration();
        URI inputURI = new URI("gphdfs://nameservice1/test.txt");

        ConnectorUtil.setHadoopFSURI(conf, inputURI, "cdh4.1");

        assertEquals("hdfs://nameservice1", conf.get("fs.defaultFS"));
    }
}
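Most of the tests above share one pattern: a file reader serializes GPDBWritable rows into a ByteArrayOutputStream, and the assertions deserialize them back with readFields(DataInput). The standalone sketch below isolates that round trip. It is not part of the original suite and assumes GPDBWritable follows Hadoop's Writable contract, i.e. that it exposes a write(DataOutput) counterpart to the readFields call used in the tests; the setters and getters are the same ones the tests already exercise.

// Minimal round-trip sketch (assumption: GPDBWritable implements Hadoop's Writable,
// so write(DataOutput) is available alongside readFields(DataInput)).
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import com.emc.greenplum.gpdb.hadoop.io.GPDBWritable;

public class GPDBWritableRoundTripSketch {
    public static void main(String[] args) throws Exception {
        // build a one-column SMALLINT row, matching the column used in test_avro_short
        GPDBWritable row = new GPDBWritable(new int[] { GPDBWritable.SMALLINT });
        row.setShort(0, (short) 123);

        // serialize into an in-memory stream, mirroring the bytes the file readers
        // emit into their ByteArrayOutputStream
        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        row.write(new DataOutputStream(bout));   // assumed Writable.write

        // deserialize the same way the test assertions do
        DataInputStream din = new DataInputStream(new ByteArrayInputStream(bout.toByteArray()));
        GPDBWritable copy = new GPDBWritable();
        copy.readFields(din);

        System.out.println(copy.getShort(0));    // expected: 123
    }
}

If write(DataOutput) is not accessible in your version of the connector, the equivalent check is exactly what the existing tests do: let an AvroFileReader or GpdbParquetFileReader produce the byte stream and only deserialize it with readFields.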