com.google.cloud.dataflow.sdk.io.CompressedSourceTest.java Source code

Introduction

Here is the source code for com.google.cloud.dataflow.sdk.io.CompressedSourceTest.java
Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import static com.google.cloud.dataflow.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static com.google.cloud.dataflow.sdk.transforms.display.DisplayDataMatchers.includesDisplayDataFrom;

import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.not;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
import com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader;
import com.google.cloud.dataflow.sdk.io.CompressedSource.CompressedReader;
import com.google.cloud.dataflow.sdk.io.CompressedSource.CompressionMode;
import com.google.cloud.dataflow.sdk.io.CompressedSource.DecompressingChannelFactory;
import com.google.cloud.dataflow.sdk.io.FileBasedSource.FileBasedReader;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
import com.google.cloud.dataflow.sdk.testing.SourceTestUtils;
import com.google.cloud.dataflow.sdk.testing.TestPipeline;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.common.io.Files;
import com.google.common.primitives.Bytes;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.hamcrest.Matchers;
import org.junit.Rule;
import org.junit.Test;
import org.junit.internal.matchers.ThrowableMessageMatcher;
import org.junit.rules.ExpectedException;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.zip.GZIPOutputStream;

import javax.annotation.Nullable;

/**
 * Tests for CompressedSource.
 */
@RunWith(JUnit4.class)
public class CompressedSourceTest {
    // JUnit rule providing the temporary folder in which the tests create their input files.
    @Rule
    public TemporaryFolder tmpFolder = new TemporaryFolder();

    // JUnit rule for declaring expected exceptions; initialised to expect none.
    @Rule
    public ExpectedException thrown = ExpectedException.none();

    /**
     * Writes 5000 generated bytes as a gzip file and checks that a GZIP-mode read returns them.
     */
    @Test
    public void testReadGzip() throws Exception {
        runReadTest(generateInput(5000), CompressionMode.GZIP);
    }

    /**
     * Checks {@code isSplittable()} for sources created without an explicit decompression mode,
     * where only the file name is available to decide.
     */
    @Test
    public void testAutoSplittable() throws Exception {
        // GZip and BZ2 file names (either case) are not splittable.
        String[] compressedNames = {"input.gz", "input.GZ", "input.bz2", "input.BZ2"};
        for (String name : compressedNames) {
            CompressedSource<Byte> source = CompressedSource.from(new ByteSource(name, 1));
            assertFalse(source.isSplittable());
        }

        // Other extensions are assumed to be splittable.
        String[] plainNames = {"input.txt", "input.csv"};
        for (String name : plainNames) {
            CompressedSource<Byte> source = CompressedSource.from(new ByteSource(name, 1));
            assertTrue(source.isSplittable());
        }
    }

    /**
     * Checks that a source with explicit {@link CompressionMode#GZIP} decompression is never
     * reported as splittable, whatever the file extension.
     */
    @Test
    public void testGzipSplittable() throws Exception {
        // The first two names carry a gzip extension; the last two do not. None may be splittable.
        String[] names = {"input.gz", "input.GZ", "input.txt", "input.csv"};
        for (String name : names) {
            CompressedSource<Byte> source =
                    CompressedSource.from(new ByteSource(name, 1)).withDecompression(CompressionMode.GZIP);
            assertFalse(source.isSplittable());
        }
    }

    /**
     * Writes 5000 generated bytes as a bzip2 file and checks that a BZIP2-mode read returns them.
     */
    @Test
    public void testReadBzip2() throws Exception {
        runReadTest(generateInput(5000), CompressionMode.BZIP2);
    }

    /**
     * Writes zero bytes as a gzip file and checks that a GZIP-mode read yields no records.
     */
    @Test
    public void testEmptyReadGzip() throws Exception {
        runReadTest(generateInput(0), CompressionMode.GZIP);
    }

    /**
     * Gzip-compresses the given bytes in memory.
     *
     * @param input the bytes to compress
     * @return the gzip stream produced for {@code input}
     * @throws IOException if writing to the gzip stream fails
     */
    private static byte[] compressGzip(byte[] input) throws IOException {
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        try (OutputStream gz = new GZIPOutputStream(compressed)) {
            gz.write(input, 0, input.length);
        }
        // The gzip stream is closed by now, so the buffer holds the complete output.
        return compressed.toByteArray();
    }

    /**
     * Returns a new array holding the bytes of {@code first} followed by those of {@code second}.
     *
     * @param first the leading bytes
     * @param second the trailing bytes
     * @return a freshly allocated array of length {@code first.length + second.length}
     */
    private static byte[] concat(byte[] first, byte[] second) {
        ByteArrayOutputStream joined = new ByteArrayOutputStream(first.length + second.length);
        joined.write(first, 0, first.length);
        joined.write(second, 0, second.length);
        return joined.toByteArray();
    }

    /**
     * Test that a file made of two gzip streams written back to back decompresses to the
     * concatenation of the two original payloads.
     *
     * <p>A concatenation of gzip files as one file is a valid gzip file and should decompress
     * to be the concatenation of those individual files.
     */
    @Test
    public void testReadConcatenatedGzip() throws IOException {
        byte[] header = "a,b,c\n".getBytes(StandardCharsets.UTF_8);
        byte[] body = "1,2,3\n4,5,6\n7,8,9\n".getBytes(StandardCharsets.UTF_8);

        // Each part is compressed on its own; the two gzip streams are then joined byte-wise.
        byte[] concatenatedGz = concat(compressGzip(header), compressGzip(body));
        File gzFile = tmpFolder.newFile();
        try (FileOutputStream out = new FileOutputStream(gzFile)) {
            out.write(concatenatedGz);
        }

        Pipeline pipeline = TestPipeline.create();

        CompressedSource<Byte> source = CompressedSource.from(new ByteSource(gzFile.getAbsolutePath(), 1))
                .withDecompression(CompressionMode.GZIP);
        PCollection<Byte> decompressed = pipeline.apply(Read.from(source));

        DataflowAssert.that(decompressed).containsInAnyOrder(Bytes.asList(concat(header, body)));
        pipeline.run();
    }

    /**
     * Writes zero bytes as a bzip2 file and checks that a BZIP2-mode read yields no records.
     */
    @Test
    public void testCompressedReadBzip2() throws Exception {
        runReadTest(generateInput(0), CompressionMode.BZIP2);
    }

    /**
     * Test reading a gzipped file named {@code test.gz} without specifying a decompression
     * factory, so that decompression is chosen from the file name.
     */
    @Test
    public void testCompressedAccordingToFilepatternGzip() throws Exception {
        byte[] contents = generateInput(100);
        File gzFile = tmpFolder.newFile("test.gz");
        writeFile(gzFile, contents, CompressionMode.GZIP);

        // A null factory leaves the source on its default (auto) decompression.
        verifyReadContents(contents, gzFile, null);
    }

    /**
     * Test reading a bzip2-compressed file named {@code test.bz2} without specifying a
     * decompression factory, so that decompression is chosen from the file name.
     */
    @Test
    public void testCompressedAccordingToFilepatternBzip2() throws Exception {
        byte[] contents = generateInput(100);
        File bz2File = tmpFolder.newFile("test.bz2");
        writeFile(bz2File, contents, CompressionMode.BZIP2);

        // A null factory leaves the source on its default (auto) decompression.
        verifyReadContents(contents, bz2File, null);
    }

    /**
     * Test reading multiple files with different compression.
     *
     * <p>Writes one uncompressed, one gzip and one bzip2 file sharing a base name, reads them all
     * through a single file pattern with default (auto) decompression, and expects the union of
     * the bytes written.
     */
    @Test
    public void testHeterogeneousCompression() throws Exception {
        String baseName = "test-input";

        // Expected data: everything written to the three files below.
        List<Byte> expected = new ArrayList<>();

        // Every sort of compression
        File uncompressedFile = tmpFolder.newFile(baseName + ".bin");
        byte[] generated = generateInput(1000);
        Files.write(generated, uncompressedFile);
        expected.addAll(Bytes.asList(generated));

        File gzipFile = tmpFolder.newFile(baseName + ".gz");
        generated = generateInput(1000);
        writeFile(gzipFile, generated, CompressionMode.GZIP);
        expected.addAll(Bytes.asList(generated));

        File bzip2File = tmpFolder.newFile(baseName + ".bz2");
        generated = generateInput(1000);
        // Write the same array that is recorded as expected, rather than a second generated copy.
        writeFile(bzip2File, generated, CompressionMode.BZIP2);
        expected.addAll(Bytes.asList(generated));

        String filePattern = new File(tmpFolder.getRoot().toString(), baseName + ".*").toString();

        Pipeline p = TestPipeline.create();

        CompressedSource<Byte> source = CompressedSource.from(new ByteSource(filePattern, 1));
        PCollection<Byte> output = p.apply(Read.from(source));

        DataflowAssert.that(output).containsInAnyOrder(expected);
        p.run();
    }

    /**
     * Checks that an uncompressed {@code .bin} file read with default decompression is reported
     * as splittable, then runs the exhaustive split-at-fraction check on it.
     */
    @Test
    public void testUncompressedFileIsSplittable() throws Exception {
        File plainFile = tmpFolder.newFile("test-input" + ".bin");
        Files.write(generateInput(10), plainFile);

        ByteSource byteSource = new ByteSource(plainFile.getPath(), 1);
        CompressedSource<Byte> source = CompressedSource.from(byteSource);
        assertTrue(source.isSplittable());
        SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
    }

    /**
     * Checks that a gzip-compressed {@code .gz} file read with default decompression is not
     * reported as splittable.
     */
    @Test
    public void testGzipFileIsNotSplittable() throws Exception {
        File gzFile = tmpFolder.newFile("test-input" + ".gz");
        writeFile(gzFile, generateInput(10), CompressionMode.GZIP);

        ByteSource byteSource = new ByteSource(gzFile.getPath(), 1);
        CompressedSource<Byte> source = CompressedSource.from(byteSource);
        assertFalse(source.isSplittable());
    }

    /**
     * Checks that a bzip2-compressed {@code .bz2} file read with default decompression is not
     * reported as splittable.
     */
    @Test
    public void testBzip2FileIsNotSplittable() throws Exception {
        File bz2File = tmpFolder.newFile("test-input" + ".bz2");
        writeFile(bz2File, generateInput(10), CompressionMode.BZIP2);

        ByteSource byteSource = new ByteSource(bz2File.getPath(), 1);
        CompressedSource<Byte> source = CompressedSource.from(byteSource);
        assertFalse(source.isSplittable());
    }

    /**
     * Test reading an uncompressed file with {@link CompressionMode#GZIP}, since we must support
     * this due to properties of services that we read from. The file carries a {@code .gz} name
     * but holds raw, uncompressed bytes.
     */
    @Test
    public void testFalseGzipStream() throws Exception {
        File rawFile = tmpFolder.newFile("test.gz");
        byte[] contents = generateInput(1000);
        // Written directly, without any gzip framing.
        Files.write(contents, rawFile);
        verifyReadContents(contents, rawFile, CompressionMode.GZIP);
    }

    /**
     * Test reading an uncompressed file with {@link CompressionMode#BZIP2}, and show that
     * we fail: the read is expected to surface a cause that is an {@link IOException} whose
     * message mentions that the stream is not in the BZip2 format.
     */
    @Test
    public void testFalseBzip2Stream() throws Exception {
        File rawFile = tmpFolder.newFile("test.bz2");
        byte[] contents = generateInput(1000);
        // Written directly, without any bzip2 framing.
        Files.write(contents, rawFile);

        // The expectation must be registered before the read that triggers the failure.
        thrown.expectCause(Matchers.allOf(instanceOf(IOException.class),
                ThrowableMessageMatcher.hasMessage(containsString("Stream is not in the BZip2 format"))));
        verifyReadContents(contents, rawFile, CompressionMode.BZIP2);
    }

    /**
     * Test reading a zero-byte file with {@link CompressionMode#GZIP}; it must be interpreted as
     * uncompressed because the gzip header is two bytes.
     */
    @Test
    public void testEmptyReadGzipUncompressed() throws Exception {
        File emptyFile = tmpFolder.newFile("test.gz");
        byte[] contents = generateInput(0);
        Files.write(contents, emptyFile);
        verifyReadContents(contents, emptyFile, CompressionMode.GZIP);
    }

    /**
     * Test reading a one-byte file with {@link CompressionMode#GZIP}; it must be interpreted as
     * uncompressed because the gzip header is two bytes.
     */
    @Test
    public void testOneByteReadGzipUncompressed() throws Exception {
        File oneByteFile = tmpFolder.newFile("test.gz");
        byte[] contents = generateInput(1);
        Files.write(contents, oneByteFile);
        verifyReadContents(contents, oneByteFile, CompressionMode.GZIP);
    }

    /**
     * Test reading multiple gzip files through a single file pattern with explicit
     * {@link CompressionMode#GZIP} decompression.
     */
    @Test
    public void testCompressedReadMultipleFiles() throws Exception {
        final int numFiles = 10;
        final String baseName = "test_input-";

        // Write the files, accumulating the bytes each one contributes to the expected output.
        List<Byte> expected = new ArrayList<>();
        for (int fileIndex = 0; fileIndex < numFiles; fileIndex++) {
            byte[] contents = generateInput(1000);
            writeFile(tmpFolder.newFile(baseName + fileIndex), contents, CompressionMode.GZIP);
            expected.addAll(Bytes.asList(contents));
        }

        String filePattern = new File(tmpFolder.getRoot().toString(), baseName + "*").toString();

        Pipeline pipeline = TestPipeline.create();

        CompressedSource<Byte> source = CompressedSource.from(new ByteSource(filePattern, 1))
                .withDecompression(CompressionMode.GZIP);
        PCollection<Byte> decompressed = pipeline.apply(Read.from(source));

        DataflowAssert.that(decompressed).containsInAnyOrder(expected);
        pipeline.run();
    }

    /**
     * Checks the display data of a {@link CompressedSource}: it has a {@code compressionMode}
     * item, a {@code source} item naming the wrapped source's class, and includes the display
     * data of the wrapped source.
     */
    @Test
    public void testDisplayData() {
        // Wrapped source that contributes one display item of its own.
        ByteSource wrapped = new ByteSource("foobar.txt", 1) {
            @Override
            public void populateDisplayData(DisplayData.Builder builder) {
                builder.add(DisplayData.item("foo", "bar"));
            }
        };

        CompressedSource<?> autoSource = CompressedSource.from(wrapped);
        CompressedSource<?> gzipSource = autoSource.withDecompression(CompressionMode.GZIP);

        DisplayData autoDisplayData = DisplayData.from(autoSource);
        DisplayData gzipDisplayData = DisplayData.from(gzipSource);

        assertThat(autoDisplayData, hasDisplayItem("compressionMode"));
        assertThat(gzipDisplayData, hasDisplayItem("compressionMode", CompressionMode.GZIP.toString()));
        assertThat(autoDisplayData, hasDisplayItem("source", wrapped.getClass()));
        assertThat(autoDisplayData, includesDisplayDataFrom(wrapped));
    }

    /**
     * Generates a byte array of the given size, filled from a fixed-seed random generator.
     *
     * @param size the number of bytes to generate
     * @return a new array of {@code size} bytes
     */
    private byte[] generateInput(int size) {
        byte[] bytes = new byte[size];
        // Arbitrary but fixed seed
        new Random(285930).nextBytes(bytes);
        return bytes;
    }

    /**
     * Get a compressing stream for a given compression mode.
     *
     * @param mode the compression to apply; only GZIP and BZIP2 are handled
     * @param stream the underlying stream that receives the compressed bytes
     * @return a compressing stream wrapping {@code stream}
     * @throws IOException if the compressing stream cannot be created
     * @throws IllegalArgumentException if {@code mode} is not one of the handled modes
     */
    private OutputStream getOutputStreamForMode(CompressionMode mode, OutputStream stream) throws IOException {
        switch (mode) {
        case GZIP:
            return new GzipCompressorOutputStream(stream);
        case BZIP2:
            return new BZip2CompressorOutputStream(stream);
        default:
            // Name the offending mode so a failure is diagnosable from the message alone.
            throw new IllegalArgumentException("Unexpected compression mode: " + mode);
        }
    }

    /**
     * Writes a single output file containing {@code input} compressed with {@code mode}.
     *
     * @param file the file to write
     * @param input the uncompressed bytes to store
     * @param mode the compression to apply
     * @throws IOException if the file cannot be opened or written
     */
    private void writeFile(File file, byte[] input, CompressionMode mode) throws IOException {
        // The file stream is its own resource so that it is closed even if creating the
        // compressing wrapper throws.
        try (FileOutputStream fileStream = new FileOutputStream(file);
                OutputStream os = getOutputStreamForMode(mode, fileStream)) {
            os.write(input);
        }
    }

    /**
     * Run a single read test: write {@code input} to a new temporary file using
     * {@code inputCompressionMode}, then verify that reading it back yields {@code input}.
     *
     * @param input the uncompressed bytes
     * @param inputCompressionMode the compression used when writing the file
     * @param decompressionFactory the factory to read with, or null to keep the source's default
     */
    private void runReadTest(byte[] input, CompressionMode inputCompressionMode,
            @Nullable DecompressingChannelFactory decompressionFactory) throws IOException {
        File compressedFile = tmpFolder.newFile();
        writeFile(compressedFile, input, inputCompressionMode);
        verifyReadContents(input, compressedFile, decompressionFactory);
    }

    /**
     * Reads {@code inputFile} in a test pipeline and asserts that the bytes produced match
     * {@code expected}, in any order.
     *
     * @param expected the bytes the read should produce
     * @param inputFile the file to read
     * @param decompressionFactory the factory to read with, or null to keep the source's default
     */
    private void verifyReadContents(byte[] expected, File inputFile,
            @Nullable DecompressingChannelFactory decompressionFactory) {
        Pipeline pipeline = TestPipeline.create();

        CompressedSource<Byte> defaultSource =
                CompressedSource.from(new ByteSource(inputFile.toPath().toString(), 1));
        CompressedSource<Byte> source = (decompressionFactory == null)
                ? defaultSource
                : defaultSource.withDecompression(decompressionFactory);

        PCollection<Byte> decompressed = pipeline.apply(Read.from(source));
        DataflowAssert.that(decompressed).containsInAnyOrder(Bytes.asList(expected));
        pipeline.run();
    }

    /**
     * Convenience overload that writes and reads back {@code input} using the same compression
     * mode on both sides.
     *
     * @param input the uncompressed bytes
     * @param mode the mode used both to compress the file and to decompress it
     */
    private void runReadTest(byte[] input, CompressionMode mode) throws IOException {
        // The mode is passed twice: once as the write-side compression, once as the read-side factory.
        runReadTest(input, mode, mode);
    }

    /**
     * Dummy source for use in tests: a {@link FileBasedSource} whose reader emits every byte of
     * its input channel as a separate record.
     */
    private static class ByteSource extends FileBasedSource<Byte> {
        public ByteSource(String fileOrPatternSpec, long minBundleSize) {
            super(fileOrPatternSpec, minBundleSize);
        }

        public ByteSource(String fileName, long minBundleSize, long startOffset, long endOffset) {
            super(fileName, minBundleSize, startOffset, endOffset);
        }

        @Override
        protected FileBasedSource<Byte> createForSubrangeOfFile(String fileName, long start, long end) {
            return new ByteSource(fileName, getMinBundleSize(), start, end);
        }

        @Override
        protected FileBasedReader<Byte> createSingleFileReader(PipelineOptions options) {
            return new ByteReader(this);
        }

        @Override
        public boolean producesSortedKeys(PipelineOptions options) throws Exception {
            return false;
        }

        @Override
        public Coder<Byte> getDefaultOutputCoder() {
            return SerializableCoder.of(Byte.class);
        }

        /**
         * Reader that returns one byte per record and treats every record as a split point.
         */
        private static class ByteReader extends FileBasedReader<Byte> {
            // Single-byte buffer reused for every read.
            private final ByteBuffer buff = ByteBuffer.allocate(1);
            // The most recently read record, or null when there is no current record.
            private Byte current;
            // Offset of the current record; starts one before the source's start offset.
            private long offset;
            private ReadableByteChannel channel;

            public ByteReader(ByteSource source) {
                super(source);
                offset = source.getStartOffset() - 1;
            }

            /**
             * Returns the record produced by the last successful read.
             *
             * @throws NoSuchElementException if no record has been read yet or the last read
             *     found no more data
             */
            @Override
            public Byte getCurrent() throws NoSuchElementException {
                if (current == null) {
                    throw new NoSuchElementException();
                }
                return current;
            }

            @Override
            protected boolean isAtSplitPoint() {
                return true;
            }

            @Override
            protected void startReading(ReadableByteChannel channel) throws IOException {
                this.channel = channel;
            }

            @Override
            protected boolean readNextRecord() throws IOException {
                buff.clear();
                if (channel.read(buff) != 1) {
                    // No byte was read: drop the stale record so getCurrent() reports its absence.
                    current = null;
                    return false;
                }
                current = buff.get(0);
                offset += 1;
                return true;
            }

            @Override
            protected long getCurrentOffset() {
                return offset;
            }
        }
    }

    /**
     * Checks the progress values reported by a {@link CompressedReader} over a gzip file with
     * empty contents, before reading and after the read finds no records.
     */
    @Test
    public void testEmptyGzipProgress() throws IOException {
        File emptyGz = tmpFolder.newFile("empty.gz");
        writeFile(emptyGz, new byte[0], CompressionMode.GZIP);
        String path = emptyGz.toPath().toString();

        CompressedSource<Byte> source = CompressedSource.from(new ByteSource(path, 1));
        PipelineOptions options = PipelineOptionsFactory.create();
        try (BoundedReader<Byte> rawReader = source.createReader(options)) {
            assertThat(rawReader, instanceOf(CompressedReader.class));
            CompressedReader<Byte> reader = (CompressedReader<Byte>) rawReader;

            // before starting
            assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(1, reader.getSplitPointsRemaining());

            // confirm empty
            assertFalse(reader.start());

            // after reading empty source
            assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(0, reader.getSplitPointsRemaining());
        }
    }

    /**
     * Checks the progress values reported by a {@link CompressedReader} over a gzip file holding
     * three records: before reading, after each record, and once the input is exhausted.
     */
    @Test
    public void testGzipProgress() throws IOException {
        final int numRecords = 3;
        File gzFile = tmpFolder.newFile("nonempty.gz");
        writeFile(gzFile, new byte[numRecords], CompressionMode.GZIP);
        String path = gzFile.toPath().toString();

        CompressedSource<Byte> source = CompressedSource.from(new ByteSource(path, 1));
        PipelineOptions options = PipelineOptionsFactory.create();
        try (BoundedReader<Byte> rawReader = source.createReader(options)) {
            assertThat(rawReader, instanceOf(CompressedReader.class));
            CompressedReader<Byte> reader = (CompressedReader<Byte>) rawReader;

            // before starting
            assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(1, reader.getSplitPointsRemaining());

            // confirm has three records: the first comes from start(), the rest from advance()
            assertTrue(reader.start());
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(1, reader.getSplitPointsRemaining());
            for (int record = 1; record < numRecords; record++) {
                assertTrue(reader.advance());
                assertEquals(0, reader.getSplitPointsConsumed());
                assertEquals(1, reader.getSplitPointsRemaining());
            }
            assertFalse(reader.advance());

            // after reading the whole (non-empty) source
            assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(1, reader.getSplitPointsConsumed());
            assertEquals(0, reader.getSplitPointsRemaining());
        }
    }

    /**
     * Checks the progress values reported when reading an uncompressed two-byte {@code .txt}
     * file: the reader is a plain {@link FileBasedReader}, not a {@link CompressedReader}, and
     * split points are counted per record.
     */
    @Test
    public void testSplittableProgress() throws IOException {
        File plainFile = tmpFolder.newFile("nonempty.txt");
        Files.write(new byte[2], plainFile);
        String path = plainFile.toPath().toString();

        CompressedSource<Byte> source = CompressedSource.from(new ByteSource(path, 1));
        PipelineOptions options = PipelineOptionsFactory.create();
        try (BoundedReader<Byte> rawReader = source.createReader(options)) {
            assertThat(rawReader, not(instanceOf(CompressedReader.class)));
            assertThat(rawReader, instanceOf(FileBasedReader.class));
            FileBasedReader<Byte> reader = (FileBasedReader<Byte>) rawReader;

            // Check preconditions before starting
            assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // First record: none consumed, unknown remaining.
            assertTrue(reader.start());
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // Second record: 1 consumed, know that we're on the last record.
            assertTrue(reader.advance());
            assertEquals(1, reader.getSplitPointsConsumed());
            assertEquals(1, reader.getSplitPointsRemaining());

            // Confirm empty and check post-conditions
            assertFalse(reader.advance());
            assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(2, reader.getSplitPointsConsumed());
            assertEquals(0, reader.getSplitPointsRemaining());
        }
    }
}