gobblin.writer.AvroToParquetHdfsTimePartitionedWriterTest.java Source code

Introduction

Here is the source code for gobblin.writer.AvroToParquetHdfsTimePartitionedWriterTest.java
Source

/*
 * Copyright (C) 2014-2015 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package gobblin.writer;

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;

/**
 * Tests for the time-partitioned Parquet writer built by
 * {@link AvroToParquetTimePartitionedWriterBuilder}.
 *
 * <p>The test writes three Avro records whose partition-column timestamps are exactly one day
 * apart and checks that each record ends up in its own {@code year/month/day} directory
 * underneath the configured output directory.</p>
 */
public class AvroToParquetHdfsTimePartitionedWriterTest {

    private static final String SIMPLE_CLASS_NAME = AvroToParquetHdfsTimePartitionedWriterTest.class
            .getSimpleName();

    private static final String TEST_ROOT_DIR = SIMPLE_CLASS_NAME + "-test";
    private static final String STAGING_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "staging";
    private static final String OUTPUT_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "output";
    private static final String BASE_FILE_PATH = "base";
    private static final String FILE_NAME = SIMPLE_CLASS_NAME + "-name.parquet";
    private static final String PARTITION_COLUMN_NAME = "timestamp";
    private static final String WRITER_ID = "writer-1";

    // Single-field record schema: the only field is the long-valued partition column.
    private static final String AVRO_SCHEMA = "{" + "\"type\" : \"record\"," + "\"name\" : \"User\","
            + "\"namespace\" : \"example.avro\"," + "\"fields\" : [ {" + "\"name\" : \"" + PARTITION_COLUMN_NAME
            + "\"," + "\"type\" : \"long\"" + "} ]" + "}";

    private static Schema schema;
    private static DataWriter<GenericRecord> writer;

    /**
     * Prepares empty staging/output directories, parses the test schema and builds the writer
     * shared by the tests in this class.
     *
     * @throws IOException if the directories cannot be reset or the writer cannot be built
     */
    @BeforeClass
    public static void setUp() throws IOException {
        // Always start from directories that exist and are empty, regardless of whether a
        // previous (possibly aborted) run left anything behind.
        resetDirectory(new File(STAGING_DIR));
        resetDirectory(new File(OUTPUT_DIR));

        schema = new Schema.Parser().parse(AVRO_SCHEMA);

        State properties = new State();
        properties.setProp(ConfigurationKeys.WRITER_PARTITION_COLUMN_NAME, PARTITION_COLUMN_NAME);
        properties.setProp(ConfigurationKeys.WRITER_BUFFER_SIZE, ConfigurationKeys.DEFAULT_BUFFER_SIZE);
        properties.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI);
        properties.setProp(ConfigurationKeys.WRITER_STAGING_DIR, STAGING_DIR);
        properties.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, OUTPUT_DIR);
        properties.setProp(ConfigurationKeys.WRITER_FILE_PATH, BASE_FILE_PATH);
        properties.setProp(ConfigurationKeys.WRITER_FILE_NAME, FILE_NAME);

        // Build a writer to write test records
        writer = new AvroToParquetTimePartitionedWriterBuilder()
                .writeTo(Destination.of(Destination.DestinationType.HDFS, properties))
                .writeInFormat(WriterOutputFormat.PARQUET).withWriterId(WRITER_ID).withSchema(schema)
                .withBranches(1).forBranch(0).build();
    }

    /**
     * Writes three records dated on consecutive days and verifies the reported record count,
     * the number of Parquet files produced and the location of each file.
     *
     * @throws IOException if writing, closing or committing fails
     */
    @Test
    public void testWriter() throws IOException {

        // Write three records, each should be written to a different file.
        // Each timestamp below is 08:00 UTC on the stated day.
        // NOTE(review): the expected day directories depend on the time zone the writer uses
        // for partitioning, which is not configured in this test - confirm the default.
        GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(schema);

        // This timestamp corresponds to 2015/01/01
        genericRecordBuilder.set(PARTITION_COLUMN_NAME, 1420099200000L);
        writer.write(genericRecordBuilder.build());

        // This timestamp corresponds to 2015/01/02
        genericRecordBuilder.set(PARTITION_COLUMN_NAME, 1420185600000L);
        writer.write(genericRecordBuilder.build());

        // This timestamp corresponds to 2015/01/03
        genericRecordBuilder.set(PARTITION_COLUMN_NAME, 1420272000000L);
        writer.write(genericRecordBuilder.build());

        // Check that the writer reports that 3 records have been written
        // (JUnit's argument order is expected first, actual second).
        Assert.assertEquals(3L, writer.recordsWritten());

        writer.close();
        writer.commit();

        // Check that 3 files were created
        Assert.assertEquals(3,
                FileUtils.listFiles(new File(TEST_ROOT_DIR), new String[] { "parquet" }, true).size());

        // Check if each file exists, and in the correct location
        File baseOutputDir = new File(OUTPUT_DIR,
                BASE_FILE_PATH + Path.SEPARATOR + ConfigurationKeys.DEFAULT_WRITER_PARTITION_LEVEL);
        Assert.assertTrue("Missing base output directory " + baseOutputDir, baseOutputDir.exists());

        File file20150101 = partitionedFile(baseOutputDir, "2015", "01", "01");
        Assert.assertTrue("Missing output file " + file20150101, file20150101.exists());

        File file20150102 = partitionedFile(baseOutputDir, "2015", "01", "02");
        Assert.assertTrue("Missing output file " + file20150102, file20150102.exists());

        File file20150103 = partitionedFile(baseOutputDir, "2015", "01", "03");
        Assert.assertTrue("Missing output file " + file20150103, file20150103.exists());
    }

    /**
     * Removes everything the test created on disk.
     *
     * @throws IOException if the test root directory cannot be deleted
     */
    @AfterClass
    public static void tearDown() throws IOException {
        FileUtils.deleteDirectory(new File(TEST_ROOT_DIR));
    }

    /** Deletes {@code dir} if it exists and then (re)creates it, leaving an empty directory. */
    private static void resetDirectory(File dir) throws IOException {
        if (dir.exists()) {
            FileUtils.deleteDirectory(dir);
        }
        FileUtils.forceMkdir(dir);
    }

    /** Returns the expected output file under {@code baseDir/year/month/day}. */
    private static File partitionedFile(File baseDir, String year, String month, String day) {
        return new File(baseDir,
                year + Path.SEPARATOR + month + Path.SEPARATOR + day + Path.SEPARATOR + FILE_NAME);
    }
}