Java tutorial: TextIO Read and Write tests from the Google Cloud Dataflow SDK (TextIOTest.java)
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import static com.google.cloud.dataflow.sdk.TestUtils.INTS_ARRAY;
import static com.google.cloud.dataflow.sdk.TestUtils.LINES_ARRAY;
import static com.google.cloud.dataflow.sdk.TestUtils.NO_INTS_ARRAY;
import static com.google.cloud.dataflow.sdk.TestUtils.NO_LINES_ARRAY;
import static com.google.cloud.dataflow.sdk.io.TextIO.CompressionType.AUTO;
import static com.google.cloud.dataflow.sdk.io.TextIO.CompressionType.BZIP2;
import static com.google.cloud.dataflow.sdk.io.TextIO.CompressionType.GZIP;
import static com.google.cloud.dataflow.sdk.io.TextIO.CompressionType.UNCOMPRESSED;
import static com.google.cloud.dataflow.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static com.google.cloud.dataflow.sdk.transforms.display.DisplayDataMatchers.hasValue;
import static com.google.cloud.dataflow.sdk.util.IOChannelUtils.resolve;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.startsWith;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.coders.TextualIntegerCoder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader;
import com.google.cloud.dataflow.sdk.io.TextIO.CompressionType;
import com.google.cloud.dataflow.sdk.io.TextIO.TextSource;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
import com.google.cloud.dataflow.sdk.testing.SourceTestUtils;
import com.google.cloud.dataflow.sdk.testing.TestDataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.testing.TestPipeline;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.display.DataflowDisplayDataEvaluator;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayDataEvaluator;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayDataMatchers;
import com.google.cloud.dataflow.sdk.util.CoderUtils;
import com.google.cloud.dataflow.sdk.util.GcsUtil;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.TestCredential;
import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.channels.FileChannel;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPOutputStream;

import javax.annotation.Nullable;

/**
 * Tests for TextIO Read and Write transforms.
 */
@RunWith(JUnit4.class)
@SuppressWarnings("unchecked")
public class TextIOTest {

  private static final String MY_HEADER = "myHeader";
  private static final String MY_FOOTER = "myFooter";
  private static final String[] EMPTY = new String[] {};
  private static final String[] TINY =
      new String[] {"Irritable eagle", "Optimistic jay", "Fanciful hawk"};
  private static final String[] LARGE = makeLines(5000);

  private static Path tempFolder;
  private static File emptyTxt;
  private static File tinyTxt;
  private static File largeTxt;
  private static File emptyGz;
  private static File tinyGz;
  private static File largeGz;
  private static File emptyBzip2;
  private static File tinyBzip2;
  private static File largeBzip2;

  @Rule
  public ExpectedException expectedException = ExpectedException.none();

  private GcsUtil buildMockGcsUtil() throws IOException {
    GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);

    // Any request to open gets a new bogus channel
    Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class)))
        .then(new Answer<SeekableByteChannel>() {
          @Override
          public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
            return FileChannel.open(
                Files.createTempFile("channel-", ".tmp"),
                StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE);
          }
        });

    // Any request for expansion returns a list containing the original GcsPath
    // This is required to pass validation that occurs in TextIO during apply()
    Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class)))
        .then(new Answer<List<GcsPath>>() {
          @Override
          public List<GcsPath> answer(InvocationOnMock invocation) throws Throwable {
            return ImmutableList.of((GcsPath) invocation.getArguments()[0]);
          }
        });

    return mockGcsUtil;
  }

  private TestDataflowPipelineOptions buildTestPipelineOptions() {
    TestDataflowPipelineOptions options =
        PipelineOptionsFactory.as(TestDataflowPipelineOptions.class);
    options.setGcpCredential(new TestCredential());
    return options;
  }

  private static File writeToFile(String[] lines, String filename, CompressionType compression)
      throws IOException {
    File file = tempFolder.resolve(filename).toFile();
    OutputStream output = new FileOutputStream(file);
    switch (compression) {
      case UNCOMPRESSED:
        break;
      case GZIP:
        output = new GZIPOutputStream(output);
        break;
      case BZIP2:
        output = new BZip2CompressorOutputStream(output);
        break;
      default:
        throw new UnsupportedOperationException(compression.toString());
    }
    writeToStreamAndClose(lines, output);
    return file;
  }

  @BeforeClass
  public static void setupClass() throws IOException {
    IOChannelUtils.registerStandardIOFactories(TestPipeline.testingPipelineOptions());
    tempFolder = Files.createTempDirectory("TextIOTest");
    // empty files
    emptyTxt = writeToFile(EMPTY, "empty.txt", UNCOMPRESSED);
    emptyGz = writeToFile(EMPTY, "empty.gz", GZIP);
    emptyBzip2 = writeToFile(EMPTY, "empty.bz2", BZIP2);
    // tiny files
    tinyTxt = writeToFile(TINY, "tiny.txt", UNCOMPRESSED);
    tinyGz = writeToFile(TINY, "tiny.gz", GZIP);
    tinyBzip2 = writeToFile(TINY, "tiny.bz2", BZIP2);
    // large files
    largeTxt = writeToFile(LARGE, "large.txt", UNCOMPRESSED);
    largeGz = writeToFile(LARGE, "large.gz", GZIP);
    largeBzip2 = writeToFile(LARGE, "large.bz2", BZIP2);
  }

  @AfterClass
  public static void testdownClass() throws IOException {
    Files.walkFileTree(tempFolder, new SimpleFileVisitor<Path>() {
      @Override
      public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
        Files.delete(file);
        return FileVisitResult.CONTINUE;
      }

      @Override
      public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
        Files.delete(dir);
        return FileVisitResult.CONTINUE;
      }
    });
  }

  private <T> void runTestRead(T[] expected, Coder<T> coder) throws Exception {
    File tmpFile = Files.createTempFile(tempFolder, "file", "txt").toFile();
    String filename = tmpFile.getPath();
    try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) {
      for (T elem : expected) {
        byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem);
        String line = new String(encodedElem);
        writer.println(line);
      }
    }

    Pipeline p = TestPipeline.create();

    TextIO.Read.Bound<T> read;
    if (coder.equals(StringUtf8Coder.of())) {
      TextIO.Read.Bound<String> readStrings = TextIO.Read.from(filename);
      // T==String
      read = (TextIO.Read.Bound<T>) readStrings;
    } else {
      read = TextIO.Read.from(filename).withCoder(coder);
    }

    PCollection<T> output = p.apply(read);

    DataflowAssert.that(output).containsInAnyOrder(expected);
    p.run();
  }

  @Test
  public void testReadStrings() throws Exception {
    runTestRead(LINES_ARRAY, StringUtf8Coder.of());
  }

  @Test
  public void testReadEmptyStrings() throws Exception {
    runTestRead(NO_LINES_ARRAY, StringUtf8Coder.of());
  }

  @Test
  public void testReadInts() throws Exception {
    runTestRead(INTS_ARRAY, TextualIntegerCoder.of());
  }

  @Test
  public void testReadEmptyInts() throws Exception {
    runTestRead(NO_INTS_ARRAY, TextualIntegerCoder.of());
  }

  @Test
  public void testReadNulls() throws Exception {
    runTestRead(new Void[] {null, null, null}, VoidCoder.of());
  }

  @Test
  public void testReadNamed() throws Exception {
    Pipeline p = TestPipeline.create();

    assertEquals(
        "TextIO.Read/Read.out",
        p.apply(TextIO.Read.withoutValidation().from("somefile")).getName());
    assertEquals(
        "MyRead/Read.out",
        p.apply("MyRead", TextIO.Read.withoutValidation().from("somefile")).getName());
    assertEquals(
        "HerRead/Read.out",
        p.apply(TextIO.Read.withoutValidation().named("HerRead").from("somefile")).getName());
  }

  @Test
  public void testReadDisplayData() {
    TextIO.Read.Bound<?> read =
        TextIO.Read.from("foo.*").withCompressionType(BZIP2).withoutValidation();

    DisplayData displayData = DisplayData.from(read);

    assertThat(displayData, hasDisplayItem("filePattern", "foo.*"));
    assertThat(displayData, hasDisplayItem("compressionType", BZIP2.toString()));
    assertThat(displayData, hasDisplayItem("validation", false));
  }

  private <T> void runTestWrite(T[] elems, Coder<T> coder) throws Exception {
    runTestWrite(elems, null, null, coder, 1);
  }

  private <T> void runTestWrite(T[] elems, Coder<T> coder, int numShards) throws Exception {
    runTestWrite(elems, null, null, coder, numShards);
  }

  private <T> void runTestWrite(T[] elems, Coder<T> coder, String header, String footer)
      throws Exception {
    runTestWrite(elems, header, footer, coder, 1);
  }

  private <T> void runTestWrite(
      T[] elems, String header, String footer, Coder<T> coder, int numShards) throws Exception {
    String outputName = "file.txt";
    Path baseDir = Files.createTempDirectory(tempFolder, "testwrite");
    String baseFilename = baseDir.resolve(outputName).toString();

    Pipeline p = TestPipeline.create();

    PCollection<T> input = p.apply(Create.of(Arrays.asList(elems)).withCoder(coder));

    TextIO.Write.Bound<T> write;
    if (coder.equals(StringUtf8Coder.of())) {
      TextIO.Write.Bound<String> writeStrings = TextIO.Write.to(baseFilename);
      // T==String
      write = (TextIO.Write.Bound<T>) writeStrings;
    } else {
      write = TextIO.Write.to(baseFilename).withCoder(coder);
    }
    if (numShards == 1) {
      write = write.withoutSharding();
    } else {
      write = write.withNumShards(numShards).withShardNameTemplate(ShardNameTemplate.INDEX_OF_MAX);
    }
    write = write.withHeader(header).withFooter(footer);

    input.apply(write);

    p.run();

    assertOutputFiles(
        elems, header, footer, coder, numShards, baseDir, outputName,
        write.getShardNameTemplate());
  }

  public static <T> void assertOutputFiles(
      T[] elems,
      final String header,
      final String footer,
      Coder<T> coder,
      int numShards,
      Path rootLocation,
      String outputName,
      String shardNameTemplate)
      throws Exception {
    List<File> expectedFiles = new ArrayList<>();
    if (numShards == 0) {
      String pattern = resolve(rootLocation.toAbsolutePath().toString(), outputName + "*");
      for (String expected : IOChannelUtils.getFactory(pattern).match(pattern)) {
        expectedFiles.add(new File(expected));
      }
    } else {
      for (int i = 0; i < numShards; i++) {
        expectedFiles.add(
            new File(
                rootLocation.toString(),
                IOChannelUtils.constructName(outputName, shardNameTemplate, "", i, numShards)));
      }
    }

    List<List<String>> actual = new ArrayList<>();

    for (File tmpFile : expectedFiles) {
      try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) {
        List<String> currentFile = new ArrayList<>();
        for (;;) {
          String line = reader.readLine();
          if (line == null) {
            break;
          }
          currentFile.add(line);
        }
        actual.add(currentFile);
      }
    }

    List<String> expectedElements = new ArrayList<>(elems.length);
    for (T elem : elems) {
      byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem);
      String line = new String(encodedElem);
      expectedElements.add(line);
    }

    List<String> actualElements =
        Lists.newArrayList(
            Iterables.concat(
                FluentIterable.from(actual)
                    .transform(removeHeaderAndFooter(header, footer))
                    .toList()));

    assertThat(actualElements, containsInAnyOrder(expectedElements.toArray()));
    assertTrue(Iterables.all(actual, haveProperHeaderAndFooter(header, footer)));
  }

  private static Function<List<String>, List<String>> removeHeaderAndFooter(
      final String header, final String footer) {
    return new Function<List<String>, List<String>>() {
      @Nullable
      @Override
      public List<String> apply(List<String> lines) {
        ArrayList<String> newLines = Lists.newArrayList(lines);
        if (header != null) {
          newLines.remove(0);
        }
        if (footer != null) {
          int last = newLines.size() - 1;
          newLines.remove(last);
        }
        return newLines;
      }
    };
  }

  private static Predicate<List<String>> haveProperHeaderAndFooter(
      final String header, final String footer) {
    return new Predicate<List<String>>() {
      @Override
      public boolean apply(List<String> fileLines) {
        int last = fileLines.size() - 1;
        return (header == null || fileLines.get(0).equals(header))
            && (footer == null || fileLines.get(last).equals(footer));
      }
    };
  }

  @Test
  public void testWriteStrings() throws Exception {
    runTestWrite(LINES_ARRAY, StringUtf8Coder.of());
  }

  @Test
  public void testWriteEmptyStrings() throws Exception {
    runTestWrite(NO_LINES_ARRAY, StringUtf8Coder.of());
  }

  @Test
  public void testWriteInts() throws Exception {
    runTestWrite(INTS_ARRAY, TextualIntegerCoder.of());
  }

  @Test
  public void testWriteEmptyInts() throws Exception {
    runTestWrite(NO_INTS_ARRAY, TextualIntegerCoder.of());
  }

  @Test
  public void testWriteNamed() {
    {
      PTransform<PCollection<String>, PDone> transform1 = TextIO.Write.to("/tmp/file.txt");
      assertEquals("TextIO.Write", transform1.getName());
    }
    {
      PTransform<PCollection<String>, PDone> transform2 =
          TextIO.Write.named("MyWrite").to("/tmp/file.txt");
      assertEquals("MyWrite", transform2.getName());
    }
    {
      PTransform<PCollection<String>, PDone> transform3 =
          TextIO.Write.to("/tmp/file.txt").named("HerWrite");
      assertEquals("HerWrite", transform3.getName());
    }
  }

  @Test
  public void testShardedWrite() throws Exception {
    runTestWrite(LINES_ARRAY, StringUtf8Coder.of(), 5);
  }

  @Test
  public void testWriteWithHeader() throws Exception {
    runTestWrite(LINES_ARRAY, StringUtf8Coder.of(), MY_HEADER, null);
  }

  @Test
  public void testWriteWithFooter() throws Exception {
    runTestWrite(LINES_ARRAY, StringUtf8Coder.of(), null, MY_FOOTER);
  }

  @Test
  public void testWriteWithHeaderAndFooter() throws Exception {
    runTestWrite(LINES_ARRAY, StringUtf8Coder.of(), MY_HEADER, MY_FOOTER);
  }

  @Test
  public void testWriteDisplayData() {
    TextIO.Write.Bound<?> write = TextIO.Write.to("foo")
        .withSuffix("bar")
        .withShardNameTemplate("-SS-of-NN-")
        .withNumShards(100)
        .withFooter("myFooter")
        .withHeader("myHeader")
        .withoutValidation();

    DisplayData displayData = DisplayData.from(write);

    assertThat(displayData, hasDisplayItem("filePrefix", "foo"));
    assertThat(displayData, hasDisplayItem("fileSuffix", "bar"));
    assertThat(displayData, hasDisplayItem("fileHeader", "myHeader"));
    assertThat(displayData, hasDisplayItem("fileFooter", "myFooter"));
    assertThat(displayData, hasDisplayItem("shardNameTemplate", "-SS-of-NN-"));
    assertThat(displayData, hasDisplayItem("numShards", 100));
    assertThat(displayData, hasDisplayItem("validation", false));
  }

  @Test
  public void testPrimitiveWriteDisplayData() {
    DisplayDataEvaluator evaluator = DataflowDisplayDataEvaluator.create();

    TextIO.Write.Bound<?> write = TextIO.Write.to("foobar");

    Set<DisplayData> displayData = evaluator.displayDataForPrimitiveTransforms(write);
    assertThat("TextIO.Write should include the file prefix in its primitive display data",
        displayData, hasItem(hasDisplayItem(hasValue(startsWith("foobar")))));
  }

  @Test
  public void testPrimitiveReadDisplayData() {
    DisplayDataEvaluator evaluator = DataflowDisplayDataEvaluator.create();

    TextIO.Read.Bound<String> read = TextIO.Read.from("foobar").withoutValidation();

    Set<DisplayData> displayData = evaluator.displayDataForPrimitiveTransforms(read);
    assertThat("TextIO.Read should include the file prefix in its primitive display data",
        displayData,
        hasItem(hasDisplayItem(DisplayDataMatchers.hasValue(startsWith("foobar")))));
  }

  @Test
  public void testUnsupportedFilePattern() throws IOException {
    // Windows doesn't like resolving paths with * in them.
    String filename = tempFolder.resolve("output@5").toString();

    Pipeline p = TestPipeline.create();

    PCollection<String> input =
        p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Output name components are not allowed to contain");
    input.apply(TextIO.Write.to(filename));
  }

  /**
   * This tests a few corner cases that should not crash.
   */
  @Test
  public void testGoodWildcards() throws Exception {
    TestDataflowPipelineOptions options = buildTestPipelineOptions();
    options.setGcsUtil(buildMockGcsUtil());

    Pipeline pipeline = Pipeline.create(options);

    applyRead(pipeline, "gs://bucket/foo");
    applyRead(pipeline, "gs://bucket/foo/");
    applyRead(pipeline, "gs://bucket/foo/*");
    applyRead(pipeline, "gs://bucket/foo/?");
    applyRead(pipeline, "gs://bucket/foo/[0-9]");
    applyRead(pipeline, "gs://bucket/foo/*baz*");
    applyRead(pipeline, "gs://bucket/foo/*baz?");
    applyRead(pipeline, "gs://bucket/foo/[0-9]baz?");
    applyRead(pipeline, "gs://bucket/foo/baz/*");
    applyRead(pipeline, "gs://bucket/foo/baz/*wonka*");
    applyRead(pipeline, "gs://bucket/foo/*baz/wonka*");
    applyRead(pipeline, "gs://bucket/foo*/baz");
    applyRead(pipeline, "gs://bucket/foo?/baz");
    applyRead(pipeline, "gs://bucket/foo[0-9]/baz");

    // Check that running doesn't fail.
    pipeline.run();
  }

  private void applyRead(Pipeline pipeline, String path) {
    pipeline.apply("Read(" + path + ")", TextIO.Read.from(path));
  }

  /**
   * Recursive wildcards are not supported.
   * This tests "**".
   */
  @Test
  public void testBadWildcardRecursive() throws Exception {
    Pipeline pipeline = TestPipeline.create();

    // Check that applying does fail.
    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("wildcard");
    pipeline.apply(TextIO.Read.from("gs://bucket/foo**/baz"));
  }

  @Test
  public void testReadWithoutValidationFlag() throws Exception {
    TextIO.Read.Bound<String> read = TextIO.Read.from("gs://bucket/foo*/baz");
    assertTrue(read.needsValidation());
    assertFalse(read.withoutValidation().needsValidation());
  }

  @Test
  public void testWriteWithoutValidationFlag() throws Exception {
    TextIO.Write.Bound<String> write = TextIO.Write.to("gs://bucket/foo/baz");
    assertTrue(write.needsValidation());
    assertFalse(write.withoutValidation().needsValidation());
  }

  @Test
  public void testCompressionTypeIsSet() throws Exception {
    TextIO.Read.Bound<String> read = TextIO.Read.from("gs://bucket/test");
    assertEquals(AUTO, read.getCompressionType());
    read = TextIO.Read.from("gs://bucket/test").withCompressionType(GZIP);
    assertEquals(GZIP, read.getCompressionType());
  }

  /**
   * Helper that writes the given lines (adding a newline in between) to a stream, then closes the
   * stream.
   */
  private static void writeToStreamAndClose(String[] lines, OutputStream outputStream) {
    try (PrintStream writer = new PrintStream(outputStream)) {
      for (String line : lines) {
        writer.println(line);
      }
    }
  }

  /**
   * Helper method that runs TextIO.Read.from(filename).withCompressionType(compressionType)
   * and asserts that the results match the given expected output.
   */
  private static void assertReadingCompressedFileMatchesExpected(
      File file, CompressionType compressionType, String[] expected) {
    Pipeline p = TestPipeline.create();
    TextIO.Read.Bound<String> read =
        TextIO.Read.from(file.getPath()).withCompressionType(compressionType);
    PCollection<String> output = p.apply(read);

    DataflowAssert.that(output).containsInAnyOrder(expected);
    p.run();
  }

  /** Helper to make an array of compressible strings. Returns ["word"+i] for i in range(0,n). */
  private static String[] makeLines(int n) {
    String[] ret = new String[n];
    for (int i = 0; i < n; ++i) {
      ret[i] = "word" + i;
    }
    return ret;
  }

  /** Tests reading from a small, gzipped file with no .gz extension but GZIP compression set. */
  @Test
  public void testSmallCompressedGzipReadNoExtension() throws Exception {
    File smallGzNoExtension = writeToFile(TINY, "tiny_gz_no_extension", GZIP);
    assertReadingCompressedFileMatchesExpected(smallGzNoExtension, GZIP, TINY);
  }

  /**
   * Tests reading from a small, uncompressed file with .gz extension.
   * This must work in AUTO or GZIP modes. This is needed because some network file systems / HTTP
   * clients will transparently decompress gzipped content.
   */
  @Test
  public void testSmallCompressedGzipReadActuallyUncompressed() throws Exception {
    File smallGzNotCompressed = writeToFile(TINY, "tiny_uncompressed.gz", UNCOMPRESSED);
    // Should work with GZIP compression set.
    assertReadingCompressedFileMatchesExpected(smallGzNotCompressed, GZIP, TINY);
    // Should also work with AUTO mode set.
    assertReadingCompressedFileMatchesExpected(smallGzNotCompressed, AUTO, TINY);
  }

  /** Tests reading from a small, bzip2ed file with no .bz2 extension but BZIP2 compression set. */
  @Test
  public void testSmallCompressedBzip2ReadNoExtension() throws Exception {
    File smallBz2NoExtension = writeToFile(TINY, "tiny_bz2_no_extension", BZIP2);
    assertReadingCompressedFileMatchesExpected(smallBz2NoExtension, BZIP2, TINY);
  }

  @Test
  public void testTxtRead() throws Exception {
    // Files with non-compressed extensions should work in AUTO and UNCOMPRESSED modes.
    for (CompressionType type : new CompressionType[] {AUTO, UNCOMPRESSED}) {
      assertReadingCompressedFileMatchesExpected(emptyTxt, type, EMPTY);
      assertReadingCompressedFileMatchesExpected(tinyTxt, type, TINY);
      assertReadingCompressedFileMatchesExpected(largeTxt, type, LARGE);
    }
  }

  @Test
  public void testGzipCompressedRead() throws Exception {
    // Files with the right extensions should work in AUTO and GZIP modes.
    for (CompressionType type : new CompressionType[] {AUTO, GZIP}) {
      assertReadingCompressedFileMatchesExpected(emptyGz, type, EMPTY);
      assertReadingCompressedFileMatchesExpected(tinyGz, type, TINY);
      assertReadingCompressedFileMatchesExpected(largeGz, type, LARGE);
    }

    // Sanity check that we're properly testing compression.
    assertThat(largeTxt.length(), greaterThan(largeGz.length()));

    // GZIP files with non-gz extension should work in GZIP mode.
    File gzFile = writeToFile(TINY, "tiny_gz_no_extension", GZIP);
    assertReadingCompressedFileMatchesExpected(gzFile, GZIP, TINY);
  }

  @Test
  public void testBzip2CompressedRead() throws Exception {
    // Files with the right extensions should work in AUTO and BZIP2 modes.
    for (CompressionType type : new CompressionType[] {AUTO, BZIP2}) {
      assertReadingCompressedFileMatchesExpected(emptyBzip2, type, EMPTY);
      assertReadingCompressedFileMatchesExpected(tinyBzip2, type, TINY);
      assertReadingCompressedFileMatchesExpected(largeBzip2, type, LARGE);
    }

    // Sanity check that we're properly testing compression.
    assertThat(largeTxt.length(), greaterThan(largeBzip2.length()));

    // BZ2 files with non-bz2 extension should work in BZIP2 mode.
    File bz2File = writeToFile(TINY, "tiny_bz2_no_extension", BZIP2);
    assertReadingCompressedFileMatchesExpected(bz2File, BZIP2, TINY);
  }

  @Test
  public void testTextIOGetName() {
    assertEquals("TextIO.Read", TextIO.Read.from("somefile").getName());
    assertEquals("TextIO.Write", TextIO.Write.to("somefile").getName());
    assertEquals("ReadMyFile", TextIO.Read.named("ReadMyFile").from("somefile").getName());
    assertEquals("WriteMyFile", TextIO.Write.named("WriteMyFile").to("somefile").getName());

    assertEquals("TextIO.Read", TextIO.Read.from("somefile").toString());
    assertEquals(
        "ReadMyFile [TextIO.Read]", TextIO.Read.named("ReadMyFile").from("somefile").toString());
  }

  @Test
  public void testProgressEmptyFile() throws IOException {
    try (BoundedReader<String> reader =
        prepareSource(new byte[0]).createReader(PipelineOptionsFactory.create())) {
      // Check preconditions before starting.
      assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
      assertEquals(0, reader.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

      // Assert empty
      assertFalse(reader.start());

      // Check postconditions after finishing
      assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
      assertEquals(0, reader.getSplitPointsConsumed());
      assertEquals(0, reader.getSplitPointsRemaining());
    }
  }

  @Test
  public void testProgressTextFile() throws IOException {
    String file = "line1\nline2\nline3";
    try (BoundedReader<String> reader =
        prepareSource(file.getBytes()).createReader(PipelineOptionsFactory.create())) {
      // Check preconditions before starting
      assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
      assertEquals(0, reader.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

      // Line 1
      assertTrue(reader.start());
      assertEquals(0, reader.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

      // Line 2
      assertTrue(reader.advance());
      assertEquals(1, reader.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

      // Line 3
      assertTrue(reader.advance());
      assertEquals(2, reader.getSplitPointsConsumed());
      assertEquals(1, reader.getSplitPointsRemaining());

      // Check postconditions after finishing
      assertFalse(reader.advance());
      assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
      assertEquals(3, reader.getSplitPointsConsumed());
      assertEquals(0, reader.getSplitPointsRemaining());
    }
  }

  @Test
  public void testProgressAfterSplitting() throws IOException {
    String file = "line1\nline2\nline3";
    BoundedSource source = prepareSource(file.getBytes());
    BoundedSource remainder;

    // Create the remainder, verifying properties pre- and post-splitting.
    try (BoundedReader<String> readerOrig = source.createReader(PipelineOptionsFactory.create())) {
      // Preconditions.
      assertEquals(0.0, readerOrig.getFractionConsumed(), 1e-6);
      assertEquals(0, readerOrig.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());

      // First record, before splitting.
      assertTrue(readerOrig.start());
      assertEquals(0, readerOrig.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining());

      // Split. 0.1 is in line1, so should now be able to detect last record.
      remainder = readerOrig.splitAtFraction(0.1);
      System.err.println(readerOrig.getCurrentSource());
      assertNotNull(remainder);

      // First record, after splitting.
      assertEquals(0, readerOrig.getSplitPointsConsumed());
      assertEquals(1, readerOrig.getSplitPointsRemaining());

      // Finish and postconditions.
      assertFalse(readerOrig.advance());
      assertEquals(1.0, readerOrig.getFractionConsumed(), 1e-6);
      assertEquals(1, readerOrig.getSplitPointsConsumed());
      assertEquals(0, readerOrig.getSplitPointsRemaining());
    }

    // Check the properties of the remainder.
    try (BoundedReader<String> reader = remainder.createReader(PipelineOptionsFactory.create())) {
      // Preconditions.
      assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
      assertEquals(0, reader.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

      // First record should be line 2.
      assertTrue(reader.start());
      assertEquals(0, reader.getSplitPointsConsumed());
      assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

      // Second record is line 3
      assertTrue(reader.advance());
      assertEquals(1, reader.getSplitPointsConsumed());
      assertEquals(1, reader.getSplitPointsRemaining());

      // Check postconditions after finishing
      assertFalse(reader.advance());
      assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
      assertEquals(2, reader.getSplitPointsConsumed());
      assertEquals(0, reader.getSplitPointsRemaining());
    }
  }

  @Test
  public void testReadEmptyLines() throws Exception {
    runTestReadWithData("\n\n\n".getBytes(StandardCharsets.UTF_8), ImmutableList.of("", "", ""));
  }

  @Test
  public void testReadFileWithLineFeedDelimiter() throws Exception {
    runTestReadWithData("asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  @Test
  public void testReadFileWithCarriageReturnDelimiter() throws Exception {
    runTestReadWithData("asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  @Test
  public void testReadFileWithCarriageReturnAndLineFeedDelimiter() throws Exception {
    runTestReadWithData("asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  @Test
  public void testReadFileWithMixedDelimiters() throws Exception {
    runTestReadWithData("asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  @Test
  public void testReadFileWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
    runTestReadWithData("asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  @Test
  public void testReadFileWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd() throws Exception {
    runTestReadWithData("asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  @Test
  public void testReadFileWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd()
      throws Exception {
    runTestReadWithData("asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  @Test
  public void testReadFileWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception {
    runTestReadWithData("asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8),
        ImmutableList.of("asdf", "hjkl", "xyz"));
  }

  private void runTestReadWithData(byte[] data, List<String> expectedResults) throws Exception {
    TextSource<String> source = prepareSource(data);
    List<String> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create());
    assertThat(
        actual,
        containsInAnyOrder(new ArrayList<>(expectedResults).toArray(new String[0])));
  }

  @Test
  public void testSplittingSourceWithEmptyLines() throws Exception {
    TextSource<String> source = prepareSource("\n\n\n".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithLineFeedDelimiter() throws Exception {
    TextSource<String> source =
        prepareSource("asdf\nhjkl\nxyz\n".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithCarriageReturnDelimiter() throws Exception {
    TextSource<String> source =
        prepareSource("asdf\rhjkl\rxyz\r".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiter() throws Exception {
    TextSource<String> source =
        prepareSource("asdf\r\nhjkl\r\nxyz\r\n".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithMixedDelimiters() throws Exception {
    TextSource<String> source =
        prepareSource("asdf\rhjkl\r\nxyz\n".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithLineFeedDelimiterAndNonEmptyBytesAtEnd() throws Exception {
    TextSource<String> source = prepareSource("asdf\nhjkl\nxyz".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithCarriageReturnDelimiterAndNonEmptyBytesAtEnd()
      throws Exception {
    TextSource<String> source = prepareSource("asdf\rhjkl\rxyz".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithCarriageReturnAndLineFeedDelimiterAndNonEmptyBytesAtEnd()
      throws Exception {
    TextSource<String> source =
        prepareSource("asdf\r\nhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  @Test
  public void testSplittingSourceWithMixedDelimitersAndNonEmptyBytesAtEnd() throws Exception {
    TextSource<String> source =
        prepareSource("asdf\rhjkl\r\nxyz".getBytes(StandardCharsets.UTF_8));
    SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
  }

  private TextSource<String> prepareSource(byte[] data) throws IOException {
    Path path = Files.createTempFile(tempFolder, "tempfile", "ext");
    Files.write(path, data);
    return new TextSource<>(path.toString(), StringUtf8Coder.of());
  }

  @Test
  public void testInitialSplitIntoBundlesAutoModeTxt() throws Exception {
    PipelineOptions options = TestPipeline.testingPipelineOptions();
    long desiredBundleSize = 1000;

    // Sanity check: file is at least 2 bundles long.
    assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize));

    FileBasedSource<String> source = TextIO.Read.from(largeTxt.getPath()).getSource();
    List<? extends FileBasedSource<String>> splits =
        source.splitIntoBundles(desiredBundleSize, options);

    // At least 2 splits and they are equal to reading the whole file.
    assertThat(splits, hasSize(greaterThan(1)));
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  }

  @Test
  public void testInitialSplitIntoBundlesAutoModeGz() throws Exception {
    long desiredBundleSize = 1000;
    PipelineOptions options = TestPipeline.testingPipelineOptions();

    // Sanity check: file is at least 2 bundles long.
    assertThat(largeGz.length(), greaterThan(2 * desiredBundleSize));

    FileBasedSource<String> source = TextIO.Read.from(largeGz.getPath()).getSource();
    List<? extends FileBasedSource<String>> splits =
        source.splitIntoBundles(desiredBundleSize, options);

    // Exactly 1 split, even in AUTO mode, since it is a gzip file.
    assertThat(splits, hasSize(equalTo(1)));
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  }

  @Test
  public void testInitialSplitIntoBundlesGzipModeTxt() throws Exception {
    PipelineOptions options = TestPipeline.testingPipelineOptions();
    long desiredBundleSize = 1000;

    // Sanity check: file is at least 2 bundles long.
    assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize));

    FileBasedSource<String> source =
        TextIO.Read.from(largeTxt.getPath()).withCompressionType(GZIP).getSource();
    List<? extends FileBasedSource<String>> splits =
        source.splitIntoBundles(desiredBundleSize, options);

    // Exactly 1 split, even though splittable text file, since using GZIP mode.
    assertThat(splits, hasSize(equalTo(1)));
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  }

  @Test
  public void testInitialSplitIntoBundlesGzipModeGz() throws Exception {
    PipelineOptions options = TestPipeline.testingPipelineOptions();
    long desiredBundleSize = 1000;

    // Sanity check: file is at least 2 bundles long.
    assertThat(largeGz.length(), greaterThan(2 * desiredBundleSize));

    FileBasedSource<String> source =
        TextIO.Read.from(largeGz.getPath()).withCompressionType(GZIP).getSource();
    List<? extends FileBasedSource<String>> splits =
        source.splitIntoBundles(desiredBundleSize, options);

    // Exactly 1 split using .gz extension and using GZIP mode.
    assertThat(splits, hasSize(equalTo(1)));
    SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  }
}
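
The test class above exercises TextIO through JUnit and the SDK's test harness (TestPipeline, DataflowAssert). For orientation, here is a minimal standalone sketch of how the same TextIO.Read and TextIO.Write transforms are applied in an ordinary pipeline, using only calls that appear in the tests above. The ExampleTextCopy class name and the /tmp file paths are placeholders for this sketch, not part of the SDK or of TextIOTest.

// Hypothetical sketch (not part of TextIOTest): copy lines from one text file
// to another using the TextIO transforms exercised by the tests above.
// The input/output paths are placeholders; point them at real files before running.
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class ExampleTextCopy {
  public static void main(String[] args) {
    // Default options run the pipeline locally.
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    // Read every line of the input file as a PCollection<String>.
    PCollection<String> lines = p.apply(TextIO.Read.from("/tmp/input.txt"));

    // Write the lines back out as a single shard, as testWriteStrings does.
    lines.apply(TextIO.Write.to("/tmp/output.txt").withoutSharding());

    p.run();
  }
}

Swapping in write options such as withNumShards(n), withHeader(...), or withFooter(...), or read options such as withCompressionType(GZIP), mirrors the sharding, header/footer, and compression behaviour the tests verify.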