// Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.beam.sdk.TestUtils.LINES_ARRAY; import static org.apache.beam.sdk.TestUtils.NO_LINES_ARRAY; import static org.apache.beam.sdk.io.Compression.AUTO; import static org.apache.beam.sdk.io.Compression.BZIP2; import static org.apache.beam.sdk.io.Compression.DEFLATE; import static org.apache.beam.sdk.io.Compression.GZIP; import static org.apache.beam.sdk.io.Compression.UNCOMPRESSED; import static org.apache.beam.sdk.io.Compression.ZIP; import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem; import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasValue; import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.hasItem; import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.startsWith; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertThat; import static 
org.junit.Assert.assertTrue; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import java.io.File; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.io.fs.EmptyMatchTreatment; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.testing.NeedsRunner; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.SourceTestUtils; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.UsesSplittableParDo; import org.apache.beam.sdk.testing.ValidatesRunner; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.ToString; import org.apache.beam.sdk.transforms.Watch; import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.beam.sdk.transforms.display.DisplayDataEvaluator; import org.apache.beam.sdk.transforms.windowing.AfterPane; import org.apache.beam.sdk.transforms.windowing.FixedWindows; import org.apache.beam.sdk.transforms.windowing.Repeatedly; import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.PCollection; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; import 
org.apache.commons.compress.compressors.deflate.DeflateCompressorOutputStream; import org.joda.time.Duration; import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.experimental.runners.Enclosed; import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; import org.junit.runners.Parameterized; /** Tests for {@link TextIO.Read}. */ @RunWith(Enclosed.class) public class TextIOReadTest { private static final int LINES_NUMBER_FOR_LARGE = 1000; private static final List<String> EMPTY = Collections.emptyList(); private static final List<String> TINY = Arrays.asList("Irritable eagle", "Optimistic jay", "Fanciful hawk"); private static final List<String> LARGE = makeLines(LINES_NUMBER_FOR_LARGE); private static File writeToFile(List<String> lines, TemporaryFolder folder, String fileName, Compression compression) throws IOException { File file = folder.getRoot().toPath().resolve(fileName).toFile(); OutputStream output = new FileOutputStream(file); switch (compression) { case UNCOMPRESSED: break; case GZIP: output = new GZIPOutputStream(output); break; case BZIP2: output = new BZip2CompressorOutputStream(output); break; case ZIP: ZipOutputStream zipOutput = new ZipOutputStream(output); zipOutput.putNextEntry(new ZipEntry("entry")); output = zipOutput; break; case DEFLATE: output = new DeflateCompressorOutputStream(output); break; default: throw new UnsupportedOperationException(compression.toString()); } writeToStreamAndClose(lines, output); return file; } /** * Helper that writes the given lines (adding a newline in between) to a stream, then closes the * stream. */ private static void writeToStreamAndClose(List<String> lines, OutputStream outputStream) { try (PrintStream writer = new PrintStream(outputStream)) { for (String line : lines) { writer.println(line); } } } /** Helper to make an array of compressible strings. Returns ["word"i] for i in range(0,n). 
*/ private static List<String> makeLines(int n) { List<String> ret = new ArrayList<>(); for (int i = 0; i < n; ++i) { ret.add("word" + i); } return ret; } /** * Helper method that runs a variety of ways to read a single file using TextIO and checks that * they all match the given expected output. * * <p>The transforms being verified are: * <ul> * <li>TextIO.read().from(filename).withCompression(compressionType) * <li>TextIO.read().from(filename).withCompression(compressionType) * .withHintMatchesManyFiles() * <li>TextIO.readAll().withCompression(compressionType) * </ul> */ private static void assertReadingCompressedFileMatchesExpected(File file, Compression compression, List<String> expected, Pipeline p) { TextIO.Read read = TextIO.read().from(file.getPath()).withCompression(compression); PAssert.that(p.apply("Read_" + file + "_" + compression.toString(), read)).containsInAnyOrder(expected); PAssert.that( p.apply("Read_" + file + "_" + compression.toString() + "_many", read.withHintMatchesManyFiles())) .containsInAnyOrder(expected); TextIO.ReadAll readAll = TextIO.readAll().withCompression(compression); PAssert.that(p.apply("Create_" + file, Create.of(file.getPath())).apply("Read_" + compression.toString(), readAll)).containsInAnyOrder(expected); } /** * Create a zip file with the given lines. * * @param expected A list of expected lines, populated in the zip file. * @param folder A temporary folder used to create files. * @param filename Optionally zip file name (can be null). * @param fieldsEntries Fields to write in zip entries. * @return The zip filename. * @throws Exception In case of a failure during zip file creation. */ private static File createZipFile(List<String> expected, TemporaryFolder folder, String filename, String[]... 
fieldsEntries) throws Exception { File tmpFile = folder.getRoot().toPath().resolve(filename).toFile(); ZipOutputStream out = new ZipOutputStream(new FileOutputStream(tmpFile)); PrintStream writer = new PrintStream(out, true /* auto-flush on write */); int index = 0; for (String[] entry : fieldsEntries) { out.putNextEntry(new ZipEntry(Integer.toString(index))); for (String field : entry) { writer.println(field); expected.add(field); } out.closeEntry(); index++; } writer.close(); out.close(); return tmpFile; } private static TextSource prepareSource(TemporaryFolder temporaryFolder, byte[] data, byte[] delimiter) throws IOException { Path path = temporaryFolder.newFile().toPath(); Files.write(path, data); return new TextSource(ValueProvider.StaticValueProvider.of(path.toString()), EmptyMatchTreatment.DISALLOW, delimiter); } private static String getFileSuffix(Compression compression) { switch (compression) { case UNCOMPRESSED: return ".txt"; case GZIP: return ".gz"; case BZIP2: return ".bz2"; case ZIP: return ".zip"; case DEFLATE: return ".deflate"; default: return ""; } } /** Tests for reading from different size of files with various Compression. 
*/ @RunWith(Parameterized.class) public static class CompressedReadTest { @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); @Rule public TestPipeline p = TestPipeline.create(); @Parameterized.Parameters(name = "{index}: {1}") public static Iterable<Object[]> data() { return ImmutableList.<Object[]>builder().add(new Object[] { EMPTY, UNCOMPRESSED }) .add(new Object[] { EMPTY, GZIP }).add(new Object[] { EMPTY, BZIP2 }) .add(new Object[] { EMPTY, ZIP }).add(new Object[] { EMPTY, DEFLATE }) .add(new Object[] { TINY, UNCOMPRESSED }).add(new Object[] { TINY, GZIP }) .add(new Object[] { TINY, BZIP2 }).add(new Object[] { TINY, ZIP }) .add(new Object[] { TINY, DEFLATE }).add(new Object[] { LARGE, UNCOMPRESSED }) .add(new Object[] { LARGE, GZIP }).add(new Object[] { LARGE, BZIP2 }) .add(new Object[] { LARGE, ZIP }).add(new Object[] { LARGE, DEFLATE }).build(); } @Parameterized.Parameter(0) public List<String> lines; @Parameterized.Parameter(1) public Compression compression; /** Tests reading from a small, compressed file with no extension. */ @Test @Category(NeedsRunner.class) public void testCompressedReadWithoutExtension() throws Exception { String fileName = lines.size() + "_" + compression + "_no_extension"; File fileWithNoExtension = writeToFile(lines, tempFolder, fileName, compression); assertReadingCompressedFileMatchesExpected(fileWithNoExtension, compression, lines, p); p.run(); } @Test @Category(NeedsRunner.class) public void testCompressedReadWithExtension() throws Exception { String fileName = lines.size() + "_" + compression + "_no_extension" + getFileSuffix(compression); File fileWithExtension = writeToFile(lines, tempFolder, fileName, compression); // Sanity check that we're properly testing compression. 
if (lines.size() == LINES_NUMBER_FOR_LARGE && !compression.equals(UNCOMPRESSED)) { File uncompressedFile = writeToFile(lines, tempFolder, "large.txt", UNCOMPRESSED); assertThat(uncompressedFile.length(), greaterThan(fileWithExtension.length())); } assertReadingCompressedFileMatchesExpected(fileWithExtension, compression, lines, p); p.run(); } @Test @Category(NeedsRunner.class) public void testReadWithAuto() throws Exception { // Files with non-compressed extensions should work in AUTO and UNCOMPRESSED modes. String fileName = lines.size() + "_" + compression + "_no_extension" + getFileSuffix(compression); File fileWithExtension = writeToFile(lines, tempFolder, fileName, compression); assertReadingCompressedFileMatchesExpected(fileWithExtension, AUTO, lines, p); p.run(); } } /** Tests for reading files with various delimiters. */ @RunWith(Parameterized.class) public static class ReadWithDelimiterTest { private static final ImmutableList<String> EXPECTED = ImmutableList.of("asdf", "hjkl", "xyz"); @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); @Parameterized.Parameters(name = "{index}: {0}") public static Iterable<Object[]> data() { return ImmutableList.<Object[]>builder().add(new Object[] { "\n\n\n", ImmutableList.of("", "", "") }) .add(new Object[] { "asdf\nhjkl\nxyz\n", EXPECTED }) .add(new Object[] { "asdf\rhjkl\rxyz\r", EXPECTED }) .add(new Object[] { "asdf\r\nhjkl\r\nxyz\r\n", EXPECTED }) .add(new Object[] { "asdf\rhjkl\r\nxyz\n", EXPECTED }) .add(new Object[] { "asdf\nhjkl\nxyz", EXPECTED }) .add(new Object[] { "asdf\rhjkl\rxyz", EXPECTED }) .add(new Object[] { "asdf\r\nhjkl\r\nxyz", EXPECTED }) .add(new Object[] { "asdf\rhjkl\r\nxyz", EXPECTED }).build(); } @Parameterized.Parameter(0) public String line; @Parameterized.Parameter(1) public ImmutableList<String> expected; @Test public void testReadLinesWithDelimiter() throws Exception { runTestReadWithData(line.getBytes(UTF_8), expected); } @Test public void testSplittingSource() throws 
Exception { TextSource source = prepareSource(line.getBytes(UTF_8)); SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create()); } private TextSource prepareSource(byte[] data) throws IOException { return TextIOReadTest.prepareSource(tempFolder, data, null); } private void runTestReadWithData(byte[] data, List<String> expectedResults) throws Exception { TextSource source = prepareSource(data); List<String> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create()); assertThat(actual, containsInAnyOrder(new ArrayList<>(expectedResults).toArray(new String[0]))); } } /** Tests for some basic operations in {@link TextIO.Read}. */ @RunWith(JUnit4.class) public static class BasicIOTest { @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); @Rule public TestPipeline p = TestPipeline.create(); private void runTestRead(String[] expected) throws Exception { File tmpFile = tempFolder.newFile(); String filename = tmpFile.getPath(); try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : expected) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); String line = new String(encodedElem); writer.println(line); } } TextIO.Read read = TextIO.read().from(filename); PCollection<String> output = p.apply(read); PAssert.that(output).containsInAnyOrder(expected); p.run(); } @Test public void testDelimiterSelfOverlaps() { assertFalse(TextIO.Read.isSelfOverlapping(new byte[] { 'a', 'b', 'c' })); assertFalse(TextIO.Read.isSelfOverlapping(new byte[] { 'c', 'a', 'b', 'd', 'a', 'b' })); assertFalse(TextIO.Read.isSelfOverlapping(new byte[] { 'a', 'b', 'c', 'a', 'b', 'd' })); assertTrue(TextIO.Read.isSelfOverlapping(new byte[] { 'a', 'b', 'a' })); assertTrue(TextIO.Read.isSelfOverlapping(new byte[] { 'a', 'b', 'c', 'a', 'b' })); } @Test @Category(NeedsRunner.class) public void testReadStringsWithCustomDelimiter() throws Exception { final String[] inputStrings = new 
String[] { // incomplete delimiter "To be, or not to be: that |is the question: ", // incomplete delimiter "To be, or not to be: that *is the question: ", // complete delimiter "Whether 'tis nobler in the mind to suffer |*", // truncated delimiter "The slings and arrows of outrageous fortune,|" }; File tmpFile = tempFolder.newFile("tmpfile.txt"); String filename = tmpFile.getPath(); try (FileWriter writer = new FileWriter(tmpFile)) { writer.write(Joiner.on("").join(inputStrings)); } PAssert.that(p.apply(TextIO.read().from(filename).withDelimiter(new byte[] { '|', '*' }))) .containsInAnyOrder( "To be, or not to be: that |is the question: To be, or not to be: " + "that *is the question: Whether 'tis nobler in the mind to suffer ", "The slings and arrows of outrageous fortune,|"); p.run(); } @Test public void testSplittingSourceWithCustomDelimiter() throws Exception { List<String> testCases = Lists.newArrayList(); String infix = "first|*second|*|*third"; String[] affixes = new String[] { "", "|", "*", "|*" }; for (String prefix : affixes) { for (String suffix : affixes) { testCases.add(prefix + infix + suffix); } } for (String testCase : testCases) { SourceTestUtils.assertSplitAtFractionExhaustive( TextIOReadTest.prepareSource(tempFolder, testCase.getBytes(UTF_8), new byte[] { '|', '*' }), PipelineOptionsFactory.create()); } } @Test @Category(NeedsRunner.class) public void testReadStrings() throws Exception { runTestRead(LINES_ARRAY); } @Test @Category(NeedsRunner.class) public void testReadEmptyStrings() throws Exception { runTestRead(NO_LINES_ARRAY); } @Test public void testReadNamed() throws Exception { File emptyFile = tempFolder.newFile(); p.enableAbandonedNodeEnforcement(false); assertEquals("TextIO.Read/Read.out", p.apply(TextIO.read().from("somefile")).getName()); assertEquals("MyRead/Read.out", p.apply("MyRead", TextIO.read().from(emptyFile.getPath())).getName()); } @Test public void testReadDisplayData() { TextIO.Read read = 
TextIO.read().from("foo.*").withCompression(BZIP2); DisplayData displayData = DisplayData.from(read); assertThat(displayData, hasDisplayItem("filePattern", "foo.*")); assertThat(displayData, hasDisplayItem("compressionType", BZIP2.toString())); } @Test @Category(ValidatesRunner.class) public void testPrimitiveReadDisplayData() { DisplayDataEvaluator evaluator = DisplayDataEvaluator.create(); TextIO.Read read = TextIO.read().from("foobar"); Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(read); assertThat("TextIO.Read should include the file prefix in its primitive display data", displayData, hasItem(hasDisplayItem(hasValue(startsWith("foobar"))))); } /** Options for testing. */ public interface RuntimeTestOptions extends PipelineOptions { ValueProvider<String> getInput(); void setInput(ValueProvider<String> value); } @Test public void testRuntimeOptionsNotCalledInApply() throws Exception { p.enableAbandonedNodeEnforcement(false); RuntimeTestOptions options = PipelineOptionsFactory.as(RuntimeTestOptions.class); p.apply(TextIO.read().from(options.getInput())); } @Test public void testCompressionIsSet() throws Exception { TextIO.Read read = TextIO.read().from("/tmp/test"); assertEquals(AUTO, read.getCompression()); read = TextIO.read().from("/tmp/test").withCompression(GZIP); assertEquals(GZIP, read.getCompression()); } /** * Tests reading from a small, uncompressed file with .gz extension. This must work in * GZIP modes. This is needed because some network file systems / HTTP clients will * transparently decompress gzipped content. */ @Test @Category(NeedsRunner.class) public void testSmallCompressedGzipReadActuallyUncompressed() throws Exception { File smallGzNotCompressed = writeToFile(TINY, tempFolder, "tiny_uncompressed.gz", UNCOMPRESSED); // Should work with GZIP compression set. 
assertReadingCompressedFileMatchesExpected(smallGzNotCompressed, GZIP, TINY, p); p.run(); } /** * Tests reading from a small, uncompressed file with .gz extension. This must work in * AUTO modes. This is needed because some network file systems / HTTP clients will * transparently decompress gzipped content. */ @Test @Category(NeedsRunner.class) public void testSmallCompressedAutoReadActuallyUncompressed() throws Exception { File smallGzNotCompressed = writeToFile(TINY, tempFolder, "tiny_uncompressed.gz", UNCOMPRESSED); // Should also work with AUTO mode set. assertReadingCompressedFileMatchesExpected(smallGzNotCompressed, AUTO, TINY, p); p.run(); } /** * Tests a zip file with no entries. This is a corner case not tested elsewhere as the default * test zip files have a single entry. */ @Test @Category(NeedsRunner.class) public void testZipCompressedReadWithNoEntries() throws Exception { File file = createZipFile(new ArrayList<String>(), tempFolder, "empty zip file"); assertReadingCompressedFileMatchesExpected(file, ZIP, EMPTY, p); p.run(); } /** * Tests a zip file with multiple entries. This is a corner case not tested elsewhere as the * default test zip files have a single entry. */ @Test @Category(NeedsRunner.class) public void testZipCompressedReadWithMultiEntriesFile() throws Exception { String[] entry0 = new String[] { "first", "second", "three" }; String[] entry1 = new String[] { "four", "five", "six" }; String[] entry2 = new String[] { "seven", "eight", "nine" }; List<String> expected = new ArrayList<>(); File file = createZipFile(expected, tempFolder, "multiple entries", entry0, entry1, entry2); assertReadingCompressedFileMatchesExpected(file, ZIP, expected, p); p.run(); } /** * Read a ZIP compressed file containing data, multiple empty entries, and then more data. We * expect just the data back. 
*/ @Test @Category(NeedsRunner.class) public void testZipCompressedReadWithComplexEmptyAndPresentEntries() throws Exception { File file = createZipFile(new ArrayList<String>(), tempFolder, "complex empty and present entries", new String[] { "cat" }, new String[] {}, new String[] {}, new String[] { "dog" }); assertReadingCompressedFileMatchesExpected(file, ZIP, Arrays.asList("cat", "dog"), p); p.run(); } @Test public void testTextIOGetName() { assertEquals("TextIO.Read", TextIO.read().from("somefile").getName()); assertEquals("TextIO.Read", TextIO.read().from("somefile").toString()); } private TextSource prepareSource(byte[] data) throws IOException { return TextIOReadTest.prepareSource(tempFolder, data, null); } @Test public void testProgressEmptyFile() throws IOException { try (BoundedSource.BoundedReader<String> reader = prepareSource(new byte[0]) .createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting. assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Assert empty assertFalse(reader.start()); // Check postconditions after finishing assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } } @Test public void testProgressTextFile() throws IOException { String file = "line1\nline2\nline3"; try (BoundedSource.BoundedReader<String> reader = prepareSource(file.getBytes()) .createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Line 1 assertTrue(reader.start()); assertEquals(0, reader.getSplitPointsConsumed()); 
assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Line 2 assertTrue(reader.advance()); assertEquals(1, reader.getSplitPointsConsumed()); assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Line 3 assertTrue(reader.advance()); assertEquals(2, reader.getSplitPointsConsumed()); assertEquals(1, reader.getSplitPointsRemaining()); // Check postconditions after finishing assertFalse(reader.advance()); assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(3, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } } @Test public void testProgressAfterSplitting() throws IOException { String file = "line1\nline2\nline3"; BoundedSource<String> source = prepareSource(file.getBytes()); BoundedSource<String> remainder; // Create the remainder, verifying properties pre- and post-splitting. try (BoundedSource.BoundedReader<String> readerOrig = source .createReader(PipelineOptionsFactory.create())) { // Preconditions. assertEquals(0.0, readerOrig.getFractionConsumed(), 1e-6); assertEquals(0, readerOrig.getSplitPointsConsumed()); assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining()); // First record, before splitting. assertTrue(readerOrig.start()); assertEquals(0, readerOrig.getSplitPointsConsumed()); assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, readerOrig.getSplitPointsRemaining()); // Split. 0.1 is in line1, so should now be able to detect last record. remainder = readerOrig.splitAtFraction(0.1); System.err.println(readerOrig.getCurrentSource()); assertNotNull(remainder); // First record, after splitting. assertEquals(0, readerOrig.getSplitPointsConsumed()); assertEquals(1, readerOrig.getSplitPointsRemaining()); // Finish and postconditions. 
assertFalse(readerOrig.advance()); assertEquals(1.0, readerOrig.getFractionConsumed(), 1e-6); assertEquals(1, readerOrig.getSplitPointsConsumed()); assertEquals(0, readerOrig.getSplitPointsRemaining()); } // Check the properties of the remainder. try (BoundedSource.BoundedReader<String> reader = remainder .createReader(PipelineOptionsFactory.create())) { // Preconditions. assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // First record should be line 2. assertTrue(reader.start()); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(BoundedSource.BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); // Second record is line 3 assertTrue(reader.advance()); assertEquals(1, reader.getSplitPointsConsumed()); assertEquals(1, reader.getSplitPointsRemaining()); // Check postconditions after finishing assertFalse(reader.advance()); assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(2, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } } @Test public void testInitialSplitAutoModeTxt() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; File largeTxt = writeToFile(LARGE, tempFolder, "large.txt", UNCOMPRESSED); // Sanity check: file is at least 2 bundles long. assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeTxt.getPath()).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // At least 2 splits and they are equal to reading the whole file. 
assertThat(splits, hasSize(greaterThan(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); } @Test public void testInitialSplitAutoModeGz() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; File largeGz = writeToFile(LARGE, tempFolder, "large.gz", GZIP); // Sanity check: file is at least 2 bundles long. assertThat(largeGz.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeGz.getPath()).getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // Exactly 1 split, even in AUTO mode, since it is a gzip file. assertThat(splits, hasSize(equalTo(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); } @Test public void testInitialSplitGzipModeTxt() throws Exception { PipelineOptions options = TestPipeline.testingPipelineOptions(); long desiredBundleSize = 1000; File largeTxt = writeToFile(LARGE, tempFolder, "large.txt", UNCOMPRESSED); // Sanity check: file is at least 2 bundles long. assertThat(largeTxt.length(), greaterThan(2 * desiredBundleSize)); FileBasedSource<String> source = TextIO.read().from(largeTxt.getPath()).withCompression(GZIP) .getSource(); List<? extends FileBasedSource<String>> splits = source.split(desiredBundleSize, options); // Exactly 1 split, even though splittable text file, since using GZIP mode. 
assertThat(splits, hasSize(equalTo(1))); SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); } @Test @Category(NeedsRunner.class) public void testReadAll() throws IOException { Path tempFolderPath = tempFolder.getRoot().toPath(); writeToFile(TINY, tempFolder, "readAllTiny1.zip", ZIP); writeToFile(TINY, tempFolder, "readAllTiny2.txt", UNCOMPRESSED); writeToFile(LARGE, tempFolder, "readAllLarge1.zip", ZIP); writeToFile(LARGE, tempFolder, "readAllLarge2.txt", UNCOMPRESSED); PCollection<String> lines = p .apply(Create.of(tempFolderPath.resolve("readAllTiny*").toString(), tempFolderPath.resolve("readAllLarge*").toString())) .apply(TextIO.readAll().withCompression(AUTO)); PAssert.that(lines).containsInAnyOrder(Iterables.concat(TINY, TINY, LARGE, LARGE)); p.run(); } @Test @Category(NeedsRunner.class) public void testReadFiles() throws IOException { Path tempFolderPath = tempFolder.getRoot().toPath(); writeToFile(TINY, tempFolder, "readAllTiny1.zip", ZIP); writeToFile(TINY, tempFolder, "readAllTiny2.txt", UNCOMPRESSED); writeToFile(LARGE, tempFolder, "readAllLarge1.zip", ZIP); writeToFile(LARGE, tempFolder, "readAllLarge2.txt", UNCOMPRESSED); PCollection<String> lines = p .apply(Create.of(tempFolderPath.resolve("readAllTiny*").toString(), tempFolderPath.resolve("readAllLarge*").toString())) .apply(FileIO.matchAll()).apply(FileIO.readMatches().withCompression(AUTO)) .apply(TextIO.readFiles().withDesiredBundleSizeBytes(10)); PAssert.that(lines).containsInAnyOrder(Iterables.concat(TINY, TINY, LARGE, LARGE)); p.run(); } @Test @Category({ NeedsRunner.class, UsesSplittableParDo.class }) public void testReadWatchForNewFiles() throws IOException, InterruptedException { final Path basePath = tempFolder.getRoot().toPath().resolve("readWatch"); basePath.toFile().mkdir(); p.apply(GenerateSequence.from(0).to(10).withRate(1, Duration.millis(100))).apply(Window .<Long>into(FixedWindows.of(Duration.millis(150))).withAllowedLateness(Duration.ZERO) 
.triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))).discardingFiredPanes()) .apply(ToString.elements()).apply(TextIO.write().to(basePath.resolve("data").toString()) .withNumShards(1).withWindowedWrites()); PCollection<String> lines = p.apply( TextIO.read().from(basePath.resolve("*").toString()).watchForNewFiles(Duration.millis(100), Watch.Growth.<String>afterTimeSinceNewOutput(Duration.standardSeconds(3)))); PAssert.that(lines).containsInAnyOrder("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); p.run(); } } }