org.apache.beam.sdk.io.TextIOTest.java Source code

Introduction

Here is the source code for org.apache.beam.sdk.io.TextIOTest.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.MoreObjects.firstNonNull;
import static org.apache.beam.sdk.TestUtils.LINES2_ARRAY;
import static org.apache.beam.sdk.TestUtils.LINES_ARRAY;
import static org.apache.beam.sdk.TestUtils.NO_LINES_ARRAY;
import static org.apache.beam.sdk.io.TextIO.CompressionType.AUTO;
import static org.apache.beam.sdk.io.TextIO.CompressionType.BZIP2;
import static org.apache.beam.sdk.io.TextIO.CompressionType.DEFLATE;
import static org.apache.beam.sdk.io.TextIO.CompressionType.GZIP;
import static org.apache.beam.sdk.io.TextIO.CompressionType.UNCOMPRESSED;
import static org.apache.beam.sdk.io.TextIO.CompressionType.ZIP;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasValue;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.startsWith;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.annotation.Nullable;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
import org.apache.beam.sdk.io.FileBasedSink.WritableByteChannelFactory;
import org.apache.beam.sdk.io.TextIO.CompressionType;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.testing.NeedsRunner;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.DisplayDataEvaluator;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.values.PCollection;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorOutputStream;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.ExpectedException;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * Tests for {@link TextIO} {@link TextIO.Read} and {@link TextIO.Write} transforms.
 */
// TODO: Change the tests to use ValidatesRunner instead of NeedsRunner
@RunWith(JUnit4.class)
@SuppressWarnings("unchecked")
public class TextIOTest {
    // Header and footer lines that the write tests pass to runTestWrite.
    private static final String MY_HEADER = "myHeader";
    private static final String MY_FOOTER = "myFooter";
    // Line fixtures of three sizes: no lines, three lines, and 1000 generated "word<i>" lines.
    private static final String[] EMPTY = new String[] {};
    private static final String[] TINY = new String[] { "Irritable eagle", "Optimistic jay", "Fanciful hawk" };
    private static final String[] LARGE = makeLines(1000);

    // Temp directory that holds every fixture file; created in setupClass, removed in teardownClass.
    private static Path tempFolder;
    // One file per (fixture size, compression) pair; all of them are written once in setupClass.
    private static File emptyTxt;
    private static File tinyTxt;
    private static File largeTxt;
    private static File emptyGz;
    private static File tinyGz;
    private static File largeGz;
    private static File emptyBzip2;
    private static File tinyBzip2;
    private static File largeBzip2;
    private static File emptyZip;
    private static File tinyZip;
    private static File largeZip;
    private static File emptyDeflate;
    private static File tinyDeflate;
    private static File largeDeflate;

    // Pipeline that the tests apply transforms to and run.
    @Rule
    public TestPipeline p = TestPipeline.create();

    // NOTE(review): not referenced by any test visible in this part of the file.
    @Rule
    public ExpectedException expectedException = ExpectedException.none();

    /**
     * Writes {@code lines} to a file called {@code filename} inside {@code tempFolder}, wrapping the
     * file stream in the output stream that matches {@code compression}.
     *
     * <p>For {@code ZIP} a single entry named {@code "entry"} is opened before the lines are written.
     *
     * @param lines the lines to write
     * @param filename name of the file, resolved against {@code tempFolder}
     * @param compression how the file content is compressed
     * @return the file that was written
     * @throws IOException if the file cannot be opened or the compressor cannot be set up
     * @throws UnsupportedOperationException if {@code compression} has no case in the switch below
     */
    private static File writeToFile(String[] lines, String filename, CompressionType compression)
            throws IOException {
        File file = tempFolder.resolve(filename).toFile();
        OutputStream output = new FileOutputStream(file);
        try {
            switch (compression) {
            case UNCOMPRESSED:
                break;
            case GZIP:
                output = new GZIPOutputStream(output);
                break;
            case BZIP2:
                output = new BZip2CompressorOutputStream(output);
                break;
            case ZIP:
                ZipOutputStream zipOutput = new ZipOutputStream(output);
                zipOutput.putNextEntry(new ZipEntry("entry"));
                output = zipOutput;
                break;
            case DEFLATE:
                output = new DeflateCompressorOutputStream(output);
                break;
            default:
                throw new UnsupportedOperationException(compression.toString());
            }
        } catch (IOException | RuntimeException e) {
            // Setting up the compressor failed, so writeToStreamAndClose will never run; close the
            // stream here, otherwise the open file handle leaks.
            try {
                output.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        writeToStreamAndClose(lines, output);
        return file;
    }

    /**
     * Creates the shared temp directory and writes every fixture file: the EMPTY, TINY and LARGE
     * line sets, each in uncompressed, gzip, bzip2, zip and deflate form.
     */
    @BeforeClass
    public static void setupClass() throws IOException {
        tempFolder = Files.createTempDirectory("TextIOTest");

        // Uncompressed text.
        emptyTxt = writeToFile(EMPTY, "empty.txt", UNCOMPRESSED);
        tinyTxt = writeToFile(TINY, "tiny.txt", UNCOMPRESSED);
        largeTxt = writeToFile(LARGE, "large.txt", UNCOMPRESSED);

        // gzip.
        emptyGz = writeToFile(EMPTY, "empty.gz", GZIP);
        tinyGz = writeToFile(TINY, "tiny.gz", GZIP);
        largeGz = writeToFile(LARGE, "large.gz", GZIP);

        // bzip2.
        emptyBzip2 = writeToFile(EMPTY, "empty.bz2", BZIP2);
        tinyBzip2 = writeToFile(TINY, "tiny.bz2", BZIP2);
        largeBzip2 = writeToFile(LARGE, "large.bz2", BZIP2);

        // zip.
        emptyZip = writeToFile(EMPTY, "empty.zip", ZIP);
        tinyZip = writeToFile(TINY, "tiny.zip", ZIP);
        largeZip = writeToFile(LARGE, "large.zip", ZIP);

        // deflate.
        emptyDeflate = writeToFile(EMPTY, "empty.deflate", DEFLATE);
        tinyDeflate = writeToFile(TINY, "tiny.deflate", DEFLATE);
        largeDeflate = writeToFile(LARGE, "large.deflate", DEFLATE);
    }

    /**
     * Recursively deletes the shared temp directory and everything the tests wrote into it.
     *
     * @throws IOException if a file or directory cannot be listed or deleted
     */
    @AfterClass
    public static void teardownClass() throws IOException {
        if (tempFolder == null) {
            // setupClass failed before the directory existed; there is nothing to clean up, and
            // walking a null path would only add a NullPointerException on top of the real failure.
            return;
        }
        Files.walkFileTree(tempFolder, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
                if (exc != null) {
                    // Iterating the directory failed; report that instead of a confusing
                    // "directory not empty" error from the delete below.
                    throw exc;
                }
                Files.delete(dir);
                return FileVisitResult.CONTINUE;
            }
        });
    }

    /**
     * Writes {@code expected} to a fresh temp file, one element per line, reads the file back with
     * {@link TextIO.Read} and asserts that the pipeline output contains exactly those elements.
     *
     * <p>The bytes produced by {@link StringUtf8Coder} are decoded, and the file is written, as
     * UTF-8 explicitly, so the fixture does not depend on the platform default charset.
     *
     * @param expected the lines to write and to expect back, in any order
     * @throws Exception if the file cannot be written or an element cannot be encoded
     */
    private void runTestRead(String[] expected) throws Exception {
        File tmpFile = Files.createTempFile(tempFolder, "file", "txt").toFile();
        String filename = tmpFile.getPath();

        try (PrintStream writer =
                new PrintStream(new FileOutputStream(tmpFile), false, StandardCharsets.UTF_8.name())) {
            for (String elem : expected) {
                byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem);
                String line = new String(encodedElem, StandardCharsets.UTF_8);
                writer.println(line);
            }
        }

        TextIO.Read read = TextIO.read().from(filename);

        PCollection<String> output = p.apply(read);

        PAssert.that(output).containsInAnyOrder(expected);
        p.run();
    }

    /** Round-trips {@code LINES_ARRAY} through a temp file and {@link TextIO.Read}. */
    @Test
    @Category(NeedsRunner.class)
    public void testReadStrings() throws Exception {
        final String[] lines = LINES_ARRAY;
        runTestRead(lines);
    }

    /** Round-trips {@code NO_LINES_ARRAY} through a temp file and {@link TextIO.Read}. */
    @Test
    @Category(NeedsRunner.class)
    public void testReadEmptyStrings() throws Exception {
        final String[] noLines = NO_LINES_ARRAY;
        runTestRead(noLines);
    }

    /**
     * Checks the name of the read's output collection, both with the default transform name and
     * with an explicit one. The pipeline is never run, so abandoned-node enforcement is disabled.
     */
    @Test
    public void testReadNamed() throws Exception {
        p.enableAbandonedNodeEnforcement(false);

        PCollection<String> defaultNamed = p.apply(TextIO.read().from("somefile"));
        assertEquals("TextIO.Read/Read.out", defaultNamed.getName());

        PCollection<String> explicitlyNamed = p.apply("MyRead", TextIO.read().from(emptyTxt.getPath()));
        assertEquals("MyRead/Read.out", explicitlyNamed.getName());
    }

    /** Verifies that a read exposes its file pattern and compression type as display data. */
    @Test
    public void testReadDisplayData() {
        DisplayData displayData =
                DisplayData.from(TextIO.read().from("foo.*").withCompressionType(BZIP2));

        String expectedCompression = BZIP2.toString();
        assertThat(displayData, hasDisplayItem("filePattern", "foo.*"));
        assertThat(displayData, hasDisplayItem("compressionType", expectedCompression));
    }

    /**
     * Verifies that the display data of the primitive source transforms behind a read contains a
     * value starting with the file prefix.
     */
    @Test
    @Category(ValidatesRunner.class)
    public void testPrimitiveReadDisplayData() {
        TextIO.Read read = TextIO.read().from("foobar");

        Set<DisplayData> displayData =
                DisplayDataEvaluator.create().displayDataForPrimitiveSourceTransforms(read);

        String reason = "TextIO.Read should include the file prefix in its primitive display data";
        assertThat(reason, displayData, hasItem(hasDisplayItem(hasValue(startsWith("foobar")))));
    }

    /** Writes {@code elems} with one shard and neither header nor footer, then checks the output. */
    private void runTestWrite(String[] elems) throws Exception {
        final String noHeader = null;
        final String noFooter = null;
        runTestWrite(elems, noHeader, noFooter, 1);
    }

    /** Writes {@code elems} with {@code numShards} shards and neither header nor footer. */
    private void runTestWrite(String[] elems, int numShards) throws Exception {
        final String noHeader = null;
        final String noFooter = null;
        runTestWrite(elems, noHeader, noFooter, numShards);
    }

    /** Writes {@code elems} with one shard and the given header and footer (either may be null). */
    private void runTestWrite(String[] elems, String header, String footer) throws Exception {
        final int singleShard = 1;
        runTestWrite(elems, header, footer, singleShard);
    }

    /**
     * Writes {@code elems} with {@link TextIO.Write} into a fresh directory under the shared temp
     * folder, runs the pipeline and verifies the files produced.
     *
     * <p>{@code numShards == 1} writes without sharding, a larger value requests that many shards
     * named with {@code ShardNameTemplate.INDEX_OF_MAX}, and any other value leaves the sharding of
     * the write untouched.
     */
    private void runTestWrite(String[] elems, String header, String footer, int numShards) throws Exception {
        String outputName = "file.txt";
        Path baseDir = Files.createTempDirectory(tempFolder, "testwrite");

        TextIO.Write write = TextIO.write().to(baseDir.resolve(outputName).toString())
                .withHeader(header).withFooter(footer);
        if (numShards > 1) {
            write = write.withNumShards(numShards).withShardNameTemplate(ShardNameTemplate.INDEX_OF_MAX);
        } else if (numShards == 1) {
            write = write.withoutSharding();
        }

        PCollection<String> lines = p.apply(Create.of(Arrays.asList(elems)).withCoder(StringUtf8Coder.of()));
        lines.apply(write);
        p.run();

        String shardTemplate =
                firstNonNull(write.getShardTemplate(), DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE);
        assertOutputFiles(elems, header, footer, numShards, baseDir, outputName, shardTemplate);
    }

    /**
     * Asserts that the files produced by a write hold exactly {@code elems} (in any order, across
     * all files) and that every file starts with {@code header} and ends with {@code footer} when
     * those are non-null.
     *
     * <p>Files are read, and the expected elements decoded, as UTF-8 explicitly rather than with the
     * platform default charset. A file that is not valid UTF-8 therefore fails with an exception
     * instead of being decoded with replacement characters.
     *
     * @param elems the elements that were written
     * @param header expected first line of every file, or null if no header was written
     * @param footer expected last line of every file, or null if no footer was written
     * @param numShards number of shard files to expect; 0 means "match whatever files start with
     *     {@code outputName} under {@code rootLocation}"
     * @param rootLocation directory the files were written to
     * @param outputName file name prefix handed to the write
     * @param shardNameTemplate template used to build the expected shard file names
     * @throws Exception if a file cannot be matched or read, or an element cannot be encoded
     */
    public static void assertOutputFiles(String[] elems, final String header, final String footer, int numShards,
            Path rootLocation, String outputName, String shardNameTemplate) throws Exception {
        List<File> expectedFiles = new ArrayList<>();
        if (numShards == 0) {
            // The shard count is not known up front, so discover the files by glob.
            String pattern = rootLocation.toAbsolutePath().resolve(outputName + "*").toString();
            List<MatchResult> matches = FileSystems.match(Collections.singletonList(pattern));
            for (Metadata expectedFile : Iterables.getOnlyElement(matches).metadata()) {
                expectedFiles.add(new File(expectedFile.resourceId().toString()));
            }
        } else {
            for (int i = 0; i < numShards; i++) {
                expectedFiles.add(new File(rootLocation.toString(),
                        DefaultFilenamePolicy.constructName(outputName, shardNameTemplate, "", i, numShards)));
            }
        }

        // Lines of each file, kept per file so header and footer can be checked file by file.
        List<List<String>> actual = new ArrayList<>();

        for (File tmpFile : expectedFiles) {
            try (BufferedReader reader = Files.newBufferedReader(tmpFile.toPath(), StandardCharsets.UTF_8)) {
                List<String> currentFile = new ArrayList<>();
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    currentFile.add(line);
                }
                actual.add(currentFile);
            }
        }

        List<String> expectedElements = new ArrayList<>(elems.length);
        for (String elem : elems) {
            byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem);
            String line = new String(encodedElem, StandardCharsets.UTF_8);
            expectedElements.add(line);
        }

        List<String> actualElements = Lists.newArrayList(Iterables
                .concat(FluentIterable.from(actual).transform(removeHeaderAndFooter(header, footer)).toList()));

        assertThat(actualElements, containsInAnyOrder(expectedElements.toArray()));

        assertTrue(Iterables.all(actual, haveProperHeaderAndFooter(header, footer)));
    }

    /**
     * Returns a function that copies a file's lines and drops the first line when {@code header} is
     * non-null and the last line when {@code footer} is non-null. The input list is not modified.
     */
    private static Function<List<String>, List<String>> removeHeaderAndFooter(final String header,
            final String footer) {
        final boolean dropFirst = header != null;
        final boolean dropLast = footer != null;
        return new Function<List<String>, List<String>>() {
            @Nullable
            @Override
            public List<String> apply(List<String> lines) {
                ArrayList<String> body = new ArrayList<>(lines);
                if (dropFirst) {
                    body.remove(0);
                }
                if (dropLast) {
                    body.remove(body.size() - 1);
                }
                return body;
            }
        };
    }

    /**
     * Returns a predicate that accepts a file's lines when the first line equals {@code header} and
     * the last line equals {@code footer}; a null header or footer is not checked.
     */
    private static Predicate<List<String>> haveProperHeaderAndFooter(final String header, final String footer) {
        return new Predicate<List<String>>() {
            @Override
            public boolean apply(List<String> fileLines) {
                final int lastIndex = fileLines.size() - 1;
                if (header != null && !fileLines.get(0).equals(header)) {
                    return false;
                }
                if (footer == null) {
                    return true;
                }
                return fileLines.get(lastIndex).equals(footer);
            }
        };
    }

    /** Writes {@code LINES_ARRAY} to a single file and verifies the output. */
    @Test
    @Category(NeedsRunner.class)
    public void testWriteStrings() throws Exception {
        final String[] lines = LINES_ARRAY;
        runTestWrite(lines);
    }

    /** Writes no lines with a shard count of 0, i.e. without configuring sharding on the write. */
    @Test
    @Category(NeedsRunner.class)
    public void testWriteEmptyStringsNoSharding() throws Exception {
        final int unspecifiedShards = 0;
        runTestWrite(NO_LINES_ARRAY, unspecifiedShards);
    }

    /** Writes no lines to a single file and verifies the output. */
    @Test
    @Category(NeedsRunner.class)
    public void testWriteEmptyStrings() throws Exception {
        final String[] noLines = NO_LINES_ARRAY;
        runTestWrite(noLines);
    }

    /** Writes {@code LINES_ARRAY} across five shards and verifies the output. */
    @Test
    @Category(NeedsRunner.class)
    public void testShardedWrite() throws Exception {
        final int shardCount = 5;
        runTestWrite(LINES_ARRAY, shardCount);
    }

    /** Writes {@code LINES_ARRAY} with a header but no footer and verifies the output. */
    @Test
    @Category(NeedsRunner.class)
    public void testWriteWithHeader() throws Exception {
        final String noFooter = null;
        runTestWrite(LINES_ARRAY, MY_HEADER, noFooter);
    }

    /** Writes {@code LINES_ARRAY} with a footer but no header and verifies the output. */
    @Test
    @Category(NeedsRunner.class)
    public void testWriteWithFooter() throws Exception {
        final String noHeader = null;
        runTestWrite(LINES_ARRAY, noHeader, MY_FOOTER);
    }

    /** Writes {@code LINES_ARRAY} with both a header and a footer and verifies the output. */
    @Test
    @Category(NeedsRunner.class)
    public void testWriteWithHeaderAndFooter() throws Exception {
        final String header = MY_HEADER;
        final String footer = MY_FOOTER;
        runTestWrite(LINES_ARRAY, header, footer);
    }

    /**
     * Writes {@code LINES2_ARRAY} through a {@code DrunkWritableByteChannelFactory} and expects
     * every input line to appear twice in the single output file, whose name carries the factory's
     * filename suffix.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testWriteWithWritableByteChannelFactory() throws Exception {
        String outputName = "file.txt";
        Path baseDir = Files.createTempDirectory(tempFolder, "testwrite");

        Coder<String> coder = StringUtf8Coder.of();
        PCollection<String> input = p.apply(Create.of(Arrays.asList(LINES2_ARRAY)).withCoder(coder));

        final WritableByteChannelFactory drunkFactory = new DrunkWritableByteChannelFactory();
        String outputPrefix = baseDir.resolve(outputName).toString();
        TextIO.Write write =
                TextIO.write().to(outputPrefix).withoutSharding().withWritableByteChannelFactory(drunkFactory);
        assertThat(DisplayData.from(write), hasDisplayItem("writableByteChannelFactory", "DRUNK"));

        input.apply(write);
        p.run();

        // Expected content: each input line, twice in a row.
        String[] doubled = new String[LINES2_ARRAY.length * 2];
        for (int i = 0; i < LINES2_ARRAY.length; i++) {
            doubled[2 * i] = LINES2_ARRAY[i];
            doubled[2 * i + 1] = LINES2_ARRAY[i];
        }
        String suffixedName = outputName + drunkFactory.getFilenameSuffix();
        assertOutputFiles(doubled, null, null, 1, baseDir, suffixedName, write.getShardTemplate());
    }

    /**
     * Verifies that a fully configured write exposes prefix, suffix, header, footer, shard template,
     * shard count and channel factory as display data.
     */
    @Test
    public void testWriteDisplayData() {
        DisplayData displayData = DisplayData.from(
                TextIO.write().to("/foo").withSuffix("bar").withShardNameTemplate("-SS-of-NN-")
                        .withNumShards(100).withFooter("myFooter").withHeader("myHeader"));

        // File naming.
        assertThat(displayData, hasDisplayItem("filePrefix", "/foo"));
        assertThat(displayData, hasDisplayItem("fileSuffix", "bar"));
        // File content decoration.
        assertThat(displayData, hasDisplayItem("fileHeader", "myHeader"));
        assertThat(displayData, hasDisplayItem("fileFooter", "myFooter"));
        // Sharding.
        assertThat(displayData, hasDisplayItem("shardNameTemplate", "-SS-of-NN-"));
        assertThat(displayData, hasDisplayItem("numShards", 100));
        // No channel factory was configured, so the write reports UNCOMPRESSED.
        assertThat(displayData, hasDisplayItem("writableByteChannelFactory", "UNCOMPRESSED"));
    }

    /** Verifies that a header set on a write shows up as the {@code fileHeader} display item. */
    @Test
    public void testWriteDisplayDataValidateThenHeader() {
        DisplayData displayData = DisplayData.from(TextIO.write().to("foo").withHeader("myHeader"));

        assertThat(displayData, hasDisplayItem("fileHeader", "myHeader"));
    }

    /** Verifies that a footer set on a write shows up as the {@code fileFooter} display item. */
    @Test
    public void testWriteDisplayDataValidateThenFooter() {
        DisplayData displayData = DisplayData.from(TextIO.write().to("foo").withFooter("myFooter"));

        assertThat(displayData, hasDisplayItem("fileFooter", "myFooter"));
    }

    /**
     * Options for testing: an input and an output location, each exposed as a
     * {@link ValueProvider}. Used by {@code testRuntimeOptionsNotCalledInApply}, which passes them
     * to {@code TextIO.read().from(...)} and {@code TextIO.write().to(...)}.
     */
    public interface RuntimeTestOptions extends PipelineOptions {
        /** Returns the provider of the location to read from. */
        ValueProvider<String> getInput();

        /** Sets the provider of the location to read from. */
        void setInput(ValueProvider<String> value);

        /** Returns the provider of the location to write to. */
        ValueProvider<String> getOutput();

        /** Sets the provider of the location to write to. */
        void setOutput(ValueProvider<String> value);
    }

    /**
     * Builds a read-then-write pipeline from the providers of a freshly created options object, on
     * which no input or output has been set. The test passes if constructing the pipeline does not
     * throw; the pipeline is never run.
     */
    @Test
    public void testRuntimeOptionsNotCalledInApply() throws Exception {
        p.enableAbandonedNodeEnforcement(false);

        RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class);

        PCollection<String> lines = p.apply(TextIO.read().from(options.getInput()));
        lines.apply(TextIO.write().to(options.getOutput()));
    }

    /** Verifies that a read defaults to AUTO compression and keeps an explicitly set type. */
    @Test
    public void testCompressionTypeIsSet() throws Exception {
        TextIO.Read defaultRead = TextIO.read().from("/tmp/test");
        assertEquals(AUTO, defaultRead.getCompressionType());

        TextIO.Read gzipRead = TextIO.read().from("/tmp/test").withCompressionType(GZIP);
        assertEquals(GZIP, gzipRead.getCompressionType());
    }

    /**
     * Helper that writes the given lines to a stream, terminating each one with the platform line
     * separator, then closes the stream.
     *
     * <p>Text is encoded as UTF-8 explicitly instead of with the platform default charset, so the
     * bytes written are the same on every machine.
     *
     * @param lines the lines to write, in order
     * @param outputStream the destination; it is closed when this method exits
     * @throws IllegalStateException if the stream reported a write failure
     */
    private static void writeToStreamAndClose(String[] lines, OutputStream outputStream) {
        try (PrintStream writer = new PrintStream(outputStream, false, StandardCharsets.UTF_8.name())) {
            for (String line : lines) {
                writer.println(line);
            }
            // PrintStream swallows IOExceptions; ask for the error flag so that a truncated
            // fixture fails loudly here instead of surfacing later as a confusing test failure.
            if (writer.checkError()) {
                throw new IllegalStateException("Failed to write all lines to the output stream");
            }
        } catch (IOException e) {
            // Only the charset lookup in the constructor can raise this, and UTF-8 is a charset
            // every Java platform is required to support.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }

    /**
     * Applies a {@link TextIO.Read} of {@code file} with the given compression type to the test
     * pipeline, asserts that the lines read equal {@code expected} in any order, and runs the
     * pipeline.
     */
    private void assertReadingCompressedFileMatchesExpected(File file, CompressionType compressionType,
            String[] expected) {
        TextIO.Read source = TextIO.read().from(file.getPath()).withCompressionType(compressionType);
        // The step name embeds file and mode, presumably so that repeated calls within one test
        // do not collide on the transform name.
        String stepName = "Read_" + file + "_" + compressionType.toString();

        PCollection<String> linesRead = p.apply(stepName, source);
        PAssert.that(linesRead).containsInAnyOrder(expected);
        p.run();
    }

    /**
     * Builds {@code n} compressible lines: element {@code i} is the text {@code "word"} followed by
     * {@code i}, for {@code i} from 0 up to {@code n - 1}.
     */
    private static String[] makeLines(int n) {
        String[] lines = new String[n];
        int index = 0;
        while (index < lines.length) {
            lines[index] = "word" + index;
            index++;
        }
        return lines;
    }

    /**
     * Reads a small gzipped file whose name has no {@code .gz} extension, with GZIP compression set
     * explicitly.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testSmallCompressedGzipReadNoExtension() throws Exception {
        final String name = "tiny_gz_no_extension";
        File gzippedWithoutExtension = writeToFile(TINY, name, GZIP);

        assertReadingCompressedFileMatchesExpected(gzippedWithoutExtension, GZIP, TINY);
    }

    /**
     * Reads a small file that carries a {@code .gz} extension but is not actually compressed. This
     * must work in both GZIP and AUTO modes, because some network file systems / HTTP clients
     * transparently decompress gzipped content.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testSmallCompressedGzipReadActuallyUncompressed() throws Exception {
        File plainWithGzName = writeToFile(TINY, "tiny_uncompressed.gz", UNCOMPRESSED);

        // Explicit GZIP first, then AUTO; both must read the plain content.
        for (CompressionType mode : Arrays.asList(GZIP, AUTO)) {
            assertReadingCompressedFileMatchesExpected(plainWithGzName, mode, TINY);
        }
    }

    /**
     * Reads a small bzip2 file whose name has no {@code .bz2} extension, with BZIP2 compression set
     * explicitly.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testSmallCompressedBzip2ReadNoExtension() throws Exception {
        final String name = "tiny_bz2_no_extension";
        File bzippedWithoutExtension = writeToFile(TINY, name, BZIP2);

        assertReadingCompressedFileMatchesExpected(bzippedWithoutExtension, BZIP2, TINY);
    }

    /**
     * Creates a zip file in the shared temp folder with one zip entry per element of
     * {@code fieldsEntries}. Entries are named by their index ("0", "1", ...) and hold one line per
     * field.
     *
     * @param expected Receives every field written, in write order; callers use it as the expected
     *     read result.
     * @param filename Name of the zip file, resolved against the temp folder. Must not be null.
     * @param fieldsEntries Fields to write in zip entries; an empty array produces an empty entry,
     *     and no arrays at all produce a zip file without entries.
     * @return The path of the zip file.
     * @throws Exception In case of a failure during zip file creation.
     */
    private String createZipFile(List<String> expected, String filename, String[]... fieldsEntries)
            throws Exception {
        File tmpFile = tempFolder.resolve(filename).toFile();

        // try-with-resources closes both streams even when writing an entry fails part-way.
        try (ZipOutputStream out = new ZipOutputStream(new FileOutputStream(tmpFile));
                PrintStream writer = new PrintStream(out, true /* auto-flush on write */)) {
            int index = 0;
            for (String[] entry : fieldsEntries) {
                out.putNextEntry(new ZipEntry(Integer.toString(index)));
                for (String field : entry) {
                    writer.println(field);
                    expected.add(field);
                }
                out.closeEntry();
                index++;
            }
        }

        return tmpFile.getPath();
    }

    /** Reads the uncompressed fixtures in both AUTO and UNCOMPRESSED modes. */
    @Test
    @Category(NeedsRunner.class)
    public void testTxtRead() throws Exception {
        File[] files = { emptyTxt, tinyTxt, largeTxt };
        String[][] contents = { EMPTY, TINY, LARGE };

        // Files with non-compressed extensions should work in AUTO and UNCOMPRESSED modes.
        for (CompressionType type : Arrays.asList(AUTO, UNCOMPRESSED)) {
            for (int i = 0; i < files.length; i++) {
                assertReadingCompressedFileMatchesExpected(files[i], type, contents[i]);
            }
        }
    }

    /**
     * Reads the gzip fixtures in AUTO and GZIP modes, checks the large gzip file really is smaller
     * than its plain counterpart, and reads a gzip file without the {@code .gz} extension.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testGzipCompressedRead() throws Exception {
        File[] files = { emptyGz, tinyGz, largeGz };
        String[][] contents = { EMPTY, TINY, LARGE };

        // Files with the right extensions should work in AUTO and GZIP modes.
        for (CompressionType type : Arrays.asList(AUTO, GZIP)) {
            for (int i = 0; i < files.length; i++) {
                assertReadingCompressedFileMatchesExpected(files[i], type, contents[i]);
            }
        }

        // Sanity check that we're properly testing compression.
        long plainSize = largeTxt.length();
        long compressedSize = largeGz.length();
        assertThat(plainSize, greaterThan(compressedSize));

        // GZIP files with non-gz extension should work in GZIP mode.
        File withoutExtension = writeToFile(TINY, "tiny_gz_no_extension", GZIP);
        assertReadingCompressedFileMatchesExpected(withoutExtension, GZIP, TINY);
    }

    /**
     * Reads the bzip2 fixtures in AUTO and BZIP2 modes, checks the large bzip2 file really is
     * smaller than its plain counterpart, and reads a bzip2 file without the {@code .bz2} extension.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testBzip2CompressedRead() throws Exception {
        File[] files = { emptyBzip2, tinyBzip2, largeBzip2 };
        String[][] contents = { EMPTY, TINY, LARGE };

        // Files with the right extensions should work in AUTO and BZIP2 modes.
        for (CompressionType type : Arrays.asList(AUTO, BZIP2)) {
            for (int i = 0; i < files.length; i++) {
                assertReadingCompressedFileMatchesExpected(files[i], type, contents[i]);
            }
        }

        // Sanity check that we're properly testing compression.
        long plainSize = largeTxt.length();
        long compressedSize = largeBzip2.length();
        assertThat(plainSize, greaterThan(compressedSize));

        // BZ2 files with non-bz2 extension should work in BZIP2 mode.
        File withoutExtension = writeToFile(TINY, "tiny_bz2_no_extension", BZIP2);
        assertReadingCompressedFileMatchesExpected(withoutExtension, BZIP2, TINY);
    }

    /**
     * Reads the zip fixtures in AUTO and ZIP modes, checks the large zip file really is smaller than
     * its plain counterpart, and reads a zip file without the {@code .zip} extension.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testZipCompressedRead() throws Exception {
        File[] files = { emptyZip, tinyZip, largeZip };
        String[][] contents = { EMPTY, TINY, LARGE };

        // Files with the right extensions should work in AUTO and ZIP modes.
        for (CompressionType type : Arrays.asList(AUTO, ZIP)) {
            for (int i = 0; i < files.length; i++) {
                assertReadingCompressedFileMatchesExpected(files[i], type, contents[i]);
            }
        }

        // Sanity check that we're properly testing compression.
        long plainSize = largeTxt.length();
        long compressedSize = largeZip.length();
        assertThat(plainSize, greaterThan(compressedSize));

        // Zip files with non-zip extension should work in ZIP mode.
        File withoutExtension = writeToFile(TINY, "tiny_zip_no_extension", ZIP);
        assertReadingCompressedFileMatchesExpected(withoutExtension, ZIP, TINY);
    }

    /**
     * Reads the deflate fixtures in AUTO and DEFLATE modes, checks the large deflate file really is
     * smaller than its plain counterpart, and reads a deflate file without the {@code .deflate}
     * extension.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testDeflateCompressedRead() throws Exception {
        File[] files = { emptyDeflate, tinyDeflate, largeDeflate };
        String[][] contents = { EMPTY, TINY, LARGE };

        // Files with the right extensions should work in AUTO and DEFLATE modes.
        for (CompressionType type : Arrays.asList(AUTO, DEFLATE)) {
            for (int i = 0; i < files.length; i++) {
                assertReadingCompressedFileMatchesExpected(files[i], type, contents[i]);
            }
        }

        // Sanity check that we're properly testing compression.
        long plainSize = largeTxt.length();
        long compressedSize = largeDeflate.length();
        assertThat(plainSize, greaterThan(compressedSize));

        // Deflate files with non-deflate extension should work in DEFLATE mode.
        File withoutExtension = writeToFile(TINY, "tiny_deflate_no_extension", DEFLATE);
        assertReadingCompressedFileMatchesExpected(withoutExtension, DEFLATE, TINY);
    }

    /**
     * Reads a zip file that contains no entries at all and expects no lines. The default zip
     * fixtures always have a single entry, so this corner case is not covered by them.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testZipCompressedReadWithNoEntries() throws Exception {
        List<String> ignoredExpected = new ArrayList<String>();
        File zipWithoutEntries = new File(createZipFile(ignoredExpected, "empty zip file"));

        assertReadingCompressedFileMatchesExpected(zipWithoutEntries, ZIP, EMPTY);
    }

    /**
     * Reads a zip file with three entries of three lines each and expects all nine lines. The
     * default zip fixtures always have a single entry, so this corner case is not covered by them.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testZipCompressedReadWithMultiEntriesFile() throws Exception {
        // createZipFile fills this list with every line it writes.
        List<String> expected = new ArrayList<>();

        String filename = createZipFile(expected, "multiple entries",
                new String[] { "first", "second", "three" },
                new String[] { "four", "five", "six" },
                new String[] { "seven", "eight", "nine" });

        String[] expectedLines = expected.toArray(new String[expected.size()]);
        assertReadingCompressedFileMatchesExpected(new File(filename), ZIP, expectedLines);
    }

    /**
     * Reads a zip file made of an entry with data, two empty entries, and another entry with data.
     * Only the data lines are expected back.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testZipCompressedReadWithComplexEmptyAndPresentEntries() throws Exception {
        String[] noFields = new String[] {};
        String[] catEntry = new String[] { "cat" };
        String[] dogEntry = new String[] { "dog" };

        String filename = createZipFile(new ArrayList<String>(), "complex empty and present entries",
                catEntry, noFields, new String[] {}, dogEntry);

        String[] expectedLines = new String[] { "cat", "dog" };
        assertReadingCompressedFileMatchesExpected(new File(filename), ZIP, expectedLines);
    }

    /** Verifies the default names of the read and write transforms and the read's string form. */
    @Test
    public void testTextIOGetName() {
        String readName = TextIO.read().from("somefile").getName();
        assertEquals("TextIO.Read", readName);

        String writeName = TextIO.write().to("somefile").getName();
        assertEquals("TextIO.Write", writeName);

        String readAsString = TextIO.read().from("somefile").toString();
        assertEquals("TextIO.Read", readAsString);
    }

    /**
     * Checks the progress reporting of a reader over a source built by {@code prepareSource} from
     * zero bytes: fraction consumed goes from 0.0 to 1.0, no split point is ever consumed, and the
     * remaining split points go from unknown to 0 once {@code start()} has returned false.
     */
    @Test
    public void testProgressEmptyFile() throws IOException {
        try (BoundedReader<String> reader = prepareSource(new byte[0])
                .createReader(PipelineOptionsFactory.create())) {
            // Check preconditions before starting.
            assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // Assert empty: there is no first record to start on.
            assertFalse(reader.start());

            // Check postconditions after finishing
            assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(0, reader.getSplitPointsRemaining());
        }
    }

    /**
     * Checks the progress reporting of a reader over a three-line source, record by record. The
     * assertions expect the consumed count to trail the current record by one while reading, the
     * remaining count to stay unknown until the last record (where it is 1), and both to be final
     * (3 consumed, 0 remaining, fraction 1.0) once {@code advance()} has returned false.
     */
    @Test
    public void testProgressTextFile() throws IOException {
        String file = "line1\nline2\nline3";
        try (BoundedReader<String> reader = prepareSource(file.getBytes())
                .createReader(PipelineOptionsFactory.create())) {
            // Check preconditions before starting
            assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // Line 1: being positioned on the first record does not count as consumed yet.
            assertTrue(reader.start());
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // Line 2
            assertTrue(reader.advance());
            assertEquals(1, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // Line 3: on the last record the remaining count is expected to be known (1).
            assertTrue(reader.advance());
            assertEquals(2, reader.getSplitPointsConsumed());
            assertEquals(1, reader.getSplitPointsRemaining());

            // Check postconditions after finishing
            assertFalse(reader.advance());
            assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(3, reader.getSplitPointsConsumed());
            assertEquals(0, reader.getSplitPointsRemaining());
        }
    }

    /**
     * Splits a three-line source at fraction 0.1 after the first record has been read, then
     * checks the progress values reported by the original reader (primary part) and by a fresh
     * reader over the remainder returned from the split.
     */
    @Test
    public void testProgressAfterSplitting() throws IOException {
        String file = "line1\nline2\nline3";
        // Encode explicitly as UTF-8 (as the other read tests in this class do) so the bytes
        // written to disk do not depend on the platform default charset.
        BoundedSource<String> source = prepareSource(file.getBytes(StandardCharsets.UTF_8));
        BoundedSource<String> remainder;

        // Create the remainder, verifying properties pre- and post-splitting.
        try (BoundedReader<String> readerOrig = source.createReader(PipelineOptionsFactory.create())) {
            // Preconditions.
            assertEquals(0.0, readerOrig.getFractionConsumed(), 1e-6);
            assertEquals(0, readerOrig.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());

            // First record, before splitting.
            assertTrue(readerOrig.start());
            assertEquals(0, readerOrig.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());

            // Split. 0.1 is in line1, so should now be able to detect last record.
            remainder = readerOrig.splitAtFraction(0.1);
            assertNotNull(remainder);

            // First record, after splitting.
            assertEquals(0, readerOrig.getSplitPointsConsumed());
            assertEquals(1, readerOrig.getSplitPointsRemaining());

            // Finish and postconditions.
            assertFalse(readerOrig.advance());
            assertEquals(1.0, readerOrig.getFractionConsumed(), 1e-6);
            assertEquals(1, readerOrig.getSplitPointsConsumed());
            assertEquals(0, readerOrig.getSplitPointsRemaining());
        }

        // Check the properties of the remainder.
        try (BoundedReader<String> reader = remainder.createReader(PipelineOptionsFactory.create())) {
            // Preconditions.
            assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // First record should be line 2.
            assertTrue(reader.start());
            assertEquals(0, reader.getSplitPointsConsumed());
            assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

            // Second record is line 3
            assertTrue(reader.advance());
            assertEquals(1, reader.getSplitPointsConsumed());
            assertEquals(1, reader.getSplitPointsRemaining());

            // Check postconditions after finishing
            assertFalse(reader.advance());
            assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
            assertEquals(2, reader.getSplitPointsConsumed());
            assertEquals(0, reader.getSplitPointsRemaining());
        }
    }

    /** Expects input made of three bare line feeds to be read back as three empty strings. */
    @Test
    public void testReadEmptyLines() throws Exception {
        byte[] onlyLineFeeds = "\n\n\n".getBytes(StandardCharsets.UTF_8);
        List<String> threeEmptyLines = ImmutableList.of("", "", "");
        runTestReadWithData(onlyLineFeeds, threeEmptyLines);
    }

    /** Expects three records when every line, including the last, ends with {@code \n}. */
    @Test
    public void testReadFileWithLineFeedDelimiter() throws Exception {
        byte[] input = "asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /** Expects three records when every line, including the last, ends with {@code \r}. */
    @Test
    public void testReadFileWithCarriageReturnDelimiter() throws Exception {
        byte[] input = "asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /** Expects three records when every line, including the last, ends with {@code \r\n}. */
    @Test
    public void testReadFileWithCarriageReturnAndLineFeedDelimiter() throws Exception {
        byte[] input = "asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /**
     * Expects three records when the lines end with {@code \r}, {@code \r\n} and {@code \n}
     * respectively.
     */
    @Test
    public void testReadFileWithMixedDelimiters() throws Exception {
        byte[] input = "asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /**
     * Expects three records from {@code \n}-separated input whose final line has no trailing
     * delimiter.
     */
    @Test
    public void testReadFileWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
        byte[] input = "asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /**
     * Expects three records from {@code \r}-separated input whose final line has no trailing
     * delimiter.
     */
    @Test
    public void testReadFileWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd() throws Exception {
        byte[] input = "asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /**
     * Expects three records from {@code \r\n}-separated input whose final line has no trailing
     * delimiter.
     */
    @Test
    public void testReadFileWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
        byte[] input = "asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /**
     * Expects three records from input that separates lines with {@code \r} and then
     * {@code \r\n}, and whose final line has no trailing delimiter.
     */
    @Test
    public void testReadFileWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception {
        byte[] input = "asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
        List<String> expectedLines = ImmutableList.of("asdf", "hjkl", "xyz");
        runTestReadWithData(input, expectedLines);
    }

    /**
     * Writes {@code data} to a temporary file via {@code prepareSource}, reads it back through
     * {@link SourceTestUtils#readFromSource}, and asserts that the records read match
     * {@code expectedResults}. The comparison uses {@code containsInAnyOrder}, so it checks the
     * set of records and their multiplicity but not their order.
     *
     * @param data raw file contents to read
     * @param expectedResults the records the source is expected to produce
     */
    private void runTestReadWithData(byte[] data, List<String> expectedResults) throws Exception {
        TextSource textSource = prepareSource(data);
        PipelineOptions readOptions = PipelineOptionsFactory.create();
        List<String> linesRead = SourceTestUtils.readFromSource(textSource, readOptions);

        String[] expectedLines = expectedResults.toArray(new String[0]);
        assertThat(linesRead, containsInAnyOrder(expectedLines));
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a source whose contents are
     * three bare line feeds.
     */
    @Test
    public void testSplittingSourceWithEmptyLines() throws Exception {
        byte[] contents = "\n\n\n".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a source whose lines all
     * end with {@code \n}.
     */
    @Test
    public void testSplittingSourceWithLineFeedDelimiter() throws Exception {
        byte[] contents = "asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a source whose lines all
     * end with {@code \r}.
     */
    @Test
    public void testSplittingSourceWithCarriageReturnDelimiter() throws Exception {
        byte[] contents = "asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a source whose lines all
     * end with {@code \r\n}.
     */
    @Test
    public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiter() throws Exception {
        byte[] contents = "asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a source whose lines end
     * with {@code \r}, {@code \r\n} and {@code \n} respectively.
     */
    @Test
    public void testSplittingSourceWithMixedDelimiters() throws Exception {
        byte[] contents = "asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a {@code \n}-separated
     * source whose final line has no trailing delimiter.
     */
    @Test
    public void testSplittingSourceWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
        byte[] contents = "asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a {@code \r}-separated
     * source whose final line has no trailing delimiter.
     */
    @Test
    public void testSplittingSourceWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd() throws Exception {
        byte[] contents = "asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a {@code \r\n}-separated
     * source whose final line has no trailing delimiter.
     */
    @Test
    public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
        byte[] contents = "asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Runs {@code SourceTestUtils.assertSplitAtFractionExhaustive} on a source that separates
     * lines with {@code \r} and then {@code \r\n}, and whose final line has no trailing delimiter.
     */
    @Test
    public void testSplittingSourceWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception {
        byte[] contents = "asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8);
        TextSource sourceUnderTest = prepareSource(contents);
        PipelineOptions splitOptions = PipelineOptionsFactory.create();
        SourceTestUtils.assertSplitAtFractionExhaustive(sourceUnderTest, splitOptions);
    }

    /**
     * Creates a new temporary file under {@code tempFolder}, writes {@code data} into it, and
     * returns a {@link TextSource} configured with that file's path.
     *
     * @param data the exact bytes to store in the file
     * @return a source pointing at the newly written file
     * @throws IOException if the file cannot be created or written
     */
    private TextSource prepareSource(byte[] data) throws IOException {
        // Prefix "tempfile" and suffix "ext" are passed to createTempFile exactly as given.
        Path dataFile = Files.createTempFile(tempFolder, "tempfile", "ext");
        Files.write(dataFile, data);

        String fileLocation = dataFile.toString();
        return new TextSource(ValueProvider.StaticValueProvider.of(fileLocation));
    }

    /**
     * Splits a read of {@code largeTxt} configured without an explicit compression type and
     * expects more than one split whose combined contents match the unsplit source.
     */
    @Test
    public void testInitialSplitAutoModeTxt() throws Exception {
        long bundleSizeBytes = 1000;
        PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();

        // Guard: the fixture must be longer than two bundles for the split count to be meaningful.
        long minimumFileLength = 2 * bundleSizeBytes;
        assertThat(largeTxt.length(), greaterThan(minimumFileLength));

        FileBasedSource<String> wholeFile = TextIO.read().from(largeTxt.getPath()).getSource();
        List<? extends FileBasedSource<String>> pieces =
                wholeFile.split(bundleSizeBytes, pipelineOptions);

        // More than one piece, and together they read the same data as the whole file.
        assertThat(pieces, hasSize(greaterThan(1)));
        SourceTestUtils.assertSourcesEqualReferenceSource(wholeFile, pieces, pipelineOptions);
    }

    /**
     * Splits a read of {@code largeGz} configured without an explicit compression type and
     * expects exactly one split whose contents match the unsplit source.
     */
    @Test
    public void testInitialSplitAutoModeGz() throws Exception {
        PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();
        long bundleSizeBytes = 1000;

        // Guard: the fixture must be longer than two bundles for the split count to be meaningful.
        long minimumFileLength = 2 * bundleSizeBytes;
        assertThat(largeGz.length(), greaterThan(minimumFileLength));

        FileBasedSource<String> wholeFile = TextIO.read().from(largeGz.getPath()).getSource();
        List<? extends FileBasedSource<String>> pieces =
                wholeFile.split(bundleSizeBytes, pipelineOptions);

        // A gzip file is expected to stay in a single piece even without an explicit GZIP setting.
        assertThat(pieces, hasSize(equalTo(1)));
        SourceTestUtils.assertSourcesEqualReferenceSource(wholeFile, pieces, pipelineOptions);
    }

    /**
     * Splits a read of {@code largeTxt} configured with compression type {@code GZIP} and
     * expects exactly one split whose contents match the unsplit source.
     */
    @Test
    public void testInitialSplitGzipModeTxt() throws Exception {
        long bundleSizeBytes = 1000;
        PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();

        // Guard: the fixture must be longer than two bundles for the split count to be meaningful.
        long minimumFileLength = 2 * bundleSizeBytes;
        assertThat(largeTxt.length(), greaterThan(minimumFileLength));

        FileBasedSource<String> wholeFile =
                TextIO.read().from(largeTxt.getPath()).withCompressionType(GZIP).getSource();
        List<? extends FileBasedSource<String>> pieces =
                wholeFile.split(bundleSizeBytes, pipelineOptions);

        // GZIP mode is expected to yield a single piece even though the file is plain text.
        assertThat(pieces, hasSize(equalTo(1)));
        SourceTestUtils.assertSourcesEqualReferenceSource(wholeFile, pieces, pipelineOptions);
    }

    /**
     * Splits a read of {@code largeGz} configured with compression type {@code GZIP} and expects
     * exactly one split whose contents match the unsplit source.
     */
    @Test
    public void testInitialSplitGzipModeGz() throws Exception {
        long bundleSizeBytes = 1000;
        PipelineOptions pipelineOptions = TestPipeline.testingPipelineOptions();

        // Guard: the fixture must be longer than two bundles for the split count to be meaningful.
        long minimumFileLength = 2 * bundleSizeBytes;
        assertThat(largeGz.length(), greaterThan(minimumFileLength));

        FileBasedSource<String> wholeFile =
                TextIO.read().from(largeGz.getPath()).withCompressionType(GZIP).getSource();
        List<? extends FileBasedSource<String>> pieces =
                wholeFile.split(bundleSizeBytes, pipelineOptions);

        // Both the .gz extension and the explicit GZIP mode point to a single piece.
        assertThat(pieces, hasSize(equalTo(1)));
        SourceTestUtils.assertSourcesEqualReferenceSource(wholeFile, pieces, pipelineOptions);
    }

}