gobblin.writer.SimpleDataWriterTest.java Source code

Java tutorial

Introduction

Here is the source code for gobblin.writer.SimpleDataWriterTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.writer;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Collections;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;

import com.google.common.collect.ImmutableList;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.crypto.EncryptionConfigParser;
import gobblin.crypto.EncryptionFactory;
import gobblin.metadata.types.GlobalMetadata;

/**
 * Unit tests for {@link SimpleDataWriter}.
 *
 * @author akshay@nerdwallet.com
 */
@Test(groups = { "gobblin.writer" })
public class SimpleDataWriterTest {
    private String filePath;
    private final String schema = "";
    private final int newLine = "\n".getBytes()[0];
    private State properties;
    private static final String ENCRYPT_PREFIX = "writer.encrypt.";

    @BeforeMethod
    public void setUp() throws Exception {
        properties = new State();

        // Making the staging and/or output dirs if necessary
        File stagingDir = new File(TestConstants.TEST_STAGING_DIR);
        File outputDir = new File(TestConstants.TEST_OUTPUT_DIR);
        if (!stagingDir.exists()) {
            stagingDir.mkdirs();
        }
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }

        this.filePath = TestConstants.TEST_EXTRACT_NAMESPACE.replaceAll("\\.", "/") + "/"
                + TestConstants.TEST_EXTRACT_TABLE + "/" + TestConstants.TEST_EXTRACT_ID + "_"
                + TestConstants.TEST_EXTRACT_PULL_TYPE;

        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "\n");
        properties.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, TestConstants.TEST_FS_URI);
        properties.setProp(ConfigurationKeys.WRITER_STAGING_DIR, TestConstants.TEST_STAGING_DIR);
        properties.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, TestConstants.TEST_OUTPUT_DIR);
        properties.setProp(ConfigurationKeys.WRITER_FILE_PATH, this.filePath);
        properties.setProp(ConfigurationKeys.WRITER_FILE_NAME, TestConstants.TEST_FILE_NAME);
        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_PREPEND_SIZE, false);
    }

    /**
     * Test writing records without a delimiter and make sure it works.
     * @throws IOException
     */
    @Test
    public void testWriteBytesNoDelim() throws IOException {
        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");
        // Build a writer to write test records
        SimpleDataWriter writer = buildSimpleDataWriter();
        byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
        byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
        byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };

        writer.write(rec1);
        writer.write(rec2);
        writer.write(rec3);

        writer.close();
        writer.commit();

        Assert.assertEquals(writer.recordsWritten(), 3);
        Assert.assertEquals(writer.bytesWritten(), rec1.length + rec2.length + rec3.length);

        File outputFile = new File(writer.getOutputFilePath());
        InputStream is = new FileInputStream(outputFile);
        int c, resNum = 0, resi = 0;
        byte[][] records = { rec1, rec2, rec3 };
        while ((c = is.read()) != -1) {
            if (resi >= records[resNum].length) {
                resNum++;
                resi = 0;
            }
            Assert.assertEquals(c, records[resNum][resi]);
            resi++;
        }
    }

    /**
     * Prepend the size to each record without delimiting the record. Each record
     * should be prepended by the size of that record and the bytes written should
     * include the prepended bytes.
     */
    @Test
    public void testPrependSizeWithoutDelimiter() throws IOException {
        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_PREPEND_SIZE, true);
        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");
        SimpleDataWriter writer = buildSimpleDataWriter();
        byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
        byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
        byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };
        byte[][] records = { rec1, rec2, rec3 };

        writer.write(rec1);
        writer.write(rec2);
        writer.write(rec3);

        writer.close();
        writer.commit();

        Assert.assertEquals(writer.recordsWritten(), 3);
        Assert.assertEquals(writer.bytesWritten(), rec1.length + rec2.length + rec3.length + (Long.SIZE / 8 * 3));

        File outputFile = new File(writer.getOutputFilePath());
        DataInputStream dis = new DataInputStream(new FileInputStream(outputFile));
        for (int i = 0; i < 3; i++) {
            long size = dis.readLong();
            Assert.assertEquals(size, records[i].length);
            for (int j = 0; j < size; j++) {
                Assert.assertEquals(dis.readByte(), records[i][j]);
            }
        }
    }

    /**
     * Use the simple data writer to write random bytes to a file and ensure
     * they are the same when read back.
     *
     * @throws IOException
     */
    @Test
    public void testWriteRandomBytes() throws IOException {
        // Build a writer to write test records
        SimpleDataWriter writer = buildSimpleDataWriter();
        byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
        byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
        byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };

        writer.write(rec1);
        writer.write(rec2);
        writer.write(rec3);

        writer.close();
        writer.commit();

        Assert.assertEquals(writer.recordsWritten(), 3);
        Assert.assertEquals(writer.bytesWritten(), rec1.length + rec2.length + rec3.length + 3); // 3 bytes for newline character

        File outputFile = new File(writer.getOutputFilePath());
        InputStream is = new FileInputStream(outputFile);
        int c, resNum = 0, resi = 0;
        byte[][] records = { rec1, rec2, rec3 };
        while ((c = is.read()) != -1) {
            if (c != newLine) {
                Assert.assertEquals(c, records[resNum][resi]);
                resi++;
            } else {
                resNum++;
                resi = 0;
            }
        }
    }

    /**
     * Prepend the size to each record and delimit the record. Each record
     * should be prepended by the size of that record and the bytes written should
     * include the prepended bytes.
     */
    @Test
    public void testPrependSizeWithDelimiter() throws IOException {
        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_PREPEND_SIZE, true);
        SimpleDataWriter writer = buildSimpleDataWriter();
        byte[] rec1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
        byte[] rec2 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 };
        byte[] rec3 = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 };
        byte[][] records = { rec1, rec2, rec3 };

        writer.write(rec1);
        writer.write(rec2);
        writer.write(rec3);

        writer.close();
        writer.commit();

        Assert.assertEquals(writer.recordsWritten(), 3);
        Assert.assertEquals(writer.bytesWritten(),
                rec1.length + rec2.length + rec3.length + (Long.SIZE / 8 * 3) + 3);

        File outputFile = new File(writer.getOutputFilePath());
        DataInputStream dis = new DataInputStream(new FileInputStream(outputFile));
        for (int i = 0; i < 3; i++) {
            long size = dis.readLong();
            Assert.assertEquals(size, records[i].length + 1);
            for (int j = 0; j < size - 1; j++) {
                Assert.assertEquals(dis.readByte(), records[i][j]);
            }
            Assert.assertEquals(dis.readByte(), '\n');
        }
    }

    @Test
    public void testSupportsGzip() throws IOException {
        properties.setProp(ConfigurationKeys.WRITER_CODEC_TYPE, "gzip");
        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");

        byte[] toWrite = new byte[] { 'a', 'b', 'c', 'd' };

        SimpleDataWriter writer = buildSimpleDataWriter();

        writer.write(toWrite);
        writer.close();
        writer.commit();

        File outputFile = new File(writer.getOutputFilePath());
        InputStream in = new GZIPInputStream(new FileInputStream(outputFile));

        byte[] contents = IOUtils.toByteArray(in);
        Assert.assertEquals(contents, toWrite, "Expected gzip'd content to be written out");
        Assert.assertTrue(outputFile.getName().endsWith(".gzip"), "Expected gzip'd file to end in .gzip");
    }

    @Test
    public void testSupportsGzipAndEncryption() throws IOException {
        final String ENCRYPTION_TYPE = "insecure_shift";
        final String COMPRESSION_TYPE = "gzip";

        properties.setProp(ConfigurationKeys.WRITER_CODEC_TYPE, COMPRESSION_TYPE);
        properties.setProp(ENCRYPT_PREFIX + EncryptionConfigParser.ENCRYPTION_ALGORITHM_KEY, ENCRYPTION_TYPE);
        properties.setProp(ConfigurationKeys.SIMPLE_WRITER_DELIMITER, "");

        byte[] toWrite = new byte[] { 'a', 'b', 'c', 'd' };

        SimpleDataWriter writer = buildSimpleDataWriter();

        writer.write(toWrite);
        writer.close();
        writer.commit();

        File outputFile = new File(writer.getOutputFilePath());
        Assert.assertTrue(outputFile.getName().endsWith("." + COMPRESSION_TYPE + "." + ENCRYPTION_TYPE),
                "Expected compression & encryption in file name!");

        InputStream decryptedFile = EncryptionFactory
                .buildStreamCryptoProvider(ENCRYPTION_TYPE, Collections.<String, Object>emptyMap())
                .decodeInputStream(new FileInputStream(outputFile));
        InputStream uncompressedFile = new GZIPInputStream(decryptedFile);

        byte[] contents = IOUtils.toByteArray(uncompressedFile);
        Assert.assertEquals(contents, toWrite, "expected to decode same contents");
    }

    /**
     * Use the simple writer to write json entries to a file and ensure that
     * they are the same when read back.
     *
     * @throws IOException
     */
    @Test
    public void testWrite() throws IOException {
        SimpleDataWriter writer = buildSimpleDataWriter();
        int totalBytes = 3; // 3 extra bytes for the newline character
        // Write all test records
        for (String record : TestConstants.JSON_RECORDS) {
            byte[] toWrite = record.getBytes();
            Assert.assertEquals(toWrite.length, record.length()); // ensure null byte does not get added to end
            writer.write(toWrite);
            totalBytes += toWrite.length;
        }

        writer.close();
        writer.commit();

        Assert.assertEquals(writer.recordsWritten(), 3);
        Assert.assertEquals(writer.bytesWritten(), totalBytes);

        File outputFile = new File(writer.getOutputFilePath());
        BufferedReader br = new BufferedReader(new FileReader(outputFile));
        String line;
        int lineNumber = 0;
        while ((line = br.readLine()) != null) {
            Assert.assertEquals(line, TestConstants.JSON_RECORDS[lineNumber]);
            lineNumber++;
        }
        br.close();
        Assert.assertEquals(lineNumber, 3);
    }

    private SimpleDataWriter buildSimpleDataWriter() throws IOException {
        SimpleDataWriterBuilder b = (SimpleDataWriterBuilder) new SimpleDataWriterBuilder()
                .writeTo(Destination.of(Destination.DestinationType.HDFS, properties))
                .writeInFormat(WriterOutputFormat.AVRO).withWriterId(TestConstants.TEST_WRITER_ID)
                .withSchema(this.schema).forBranch(0);

        return new SimpleDataWriter(b, properties);
    }

    /**
     * If the staging file exists, the simple data writer should overwrite its contents.
     *
     * @throws IOException
     */
    @Test
    public void testOverwriteExistingStagingFile() throws IOException {
        byte[] randomBytesStage = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 0 };
        byte[] randomBytesWrite = { 11, 12, 13, 14, 15 };
        Path stagingFile = new Path(TestConstants.TEST_STAGING_DIR + Path.SEPARATOR + this.filePath,
                TestConstants.TEST_FILE_NAME + "." + TestConstants.TEST_WRITER_ID + "." + "tmp");
        Configuration conf = new Configuration();
        // Add all job configuration properties so they are picked up by Hadoop
        for (String key : properties.getPropertyNames()) {
            conf.set(key, properties.getProp(key));
        }
        FileSystem fs = FileSystem.get(URI.create(TestConstants.TEST_FS_URI), conf);

        OutputStream os = fs.create(stagingFile);
        os.write(randomBytesStage);
        os.flush();
        os.close();

        SimpleDataWriter writer = buildSimpleDataWriter();

        writer.write(randomBytesWrite);
        writer.close();
        writer.commit();

        Assert.assertEquals(writer.recordsWritten(), 1);
        Assert.assertEquals(writer.bytesWritten(), randomBytesWrite.length + 1);

        File writeFile = new File(writer.getOutputFilePath());
        int c, i = 0;
        InputStream is = new FileInputStream(writeFile);
        while ((c = is.read()) != -1) {
            if (i == 5) {
                Assert.assertEquals(c, (byte) newLine); // the last byte should be newline
                i++;
                continue;
            }
            Assert.assertEquals(randomBytesWrite[i], c);
            i++;
        }
    }

    @AfterMethod
    public void tearDown() throws IOException {
        // Clean up the staging and/or output directories if necessary
        File testRootDir = new File(TestConstants.TEST_ROOT_DIR);
        if (testRootDir.exists()) {
            FileUtil.fullyDelete(testRootDir);
        }
    }
}