org.apache.beam.sdk.io.FileBasedSinkTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.beam.sdk.io.FileBasedSinkTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static org.hamcrest.Matchers.is;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import com.google.common.collect.Lists;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import org.apache.beam.sdk.io.FileBasedSink.CompressionType;
import org.apache.beam.sdk.io.FileBasedSink.FileResult;
import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy;
import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy.Context;
import org.apache.beam.sdk.io.FileBasedSink.WritableByteChannelFactory;
import org.apache.beam.sdk.io.FileBasedSink.WriteOperation;
import org.apache.beam.sdk.io.FileBasedSink.Writer;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * Tests for {@link FileBasedSink}.
 */
@RunWith(JUnit4.class)
public class FileBasedSinkTest {
    // JUnit rule supplying the scratch folder whose root every path helper in this class resolves against.
    @Rule
    public TemporaryFolder tmpFolder = new TemporaryFolder();

    // Name of the temp sub-directory that getBaseTempDirectory() resolves under the scratch root.
    private final String tempDirectoryName = "temp";

    /** Returns the root of the JUnit scratch folder as a directory {@link ResourceId}. */
    private ResourceId getTemporaryFolder() {
        File scratchRoot = tmpFolder.getRoot();
        return LocalResources.fromFile(scratchRoot, true /* isDirectory */);
    }

    /** Returns the {@code output} directory resolved beneath the scratch folder. */
    private ResourceId getBaseOutputDirectory() {
        ResourceId scratchRoot = getTemporaryFolder();
        return scratchRoot.resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY);
    }

    /** Returns the directory named by {@code tempDirectoryName}, resolved beneath the scratch folder. */
    private ResourceId getBaseTempDirectory() {
        ResourceId scratchRoot = getTemporaryFolder();
        return scratchRoot.resolve(tempDirectoryName, StandardResolveOptions.RESOLVE_DIRECTORY);
    }

    /**
     * Writer opens the correct file, writes the header, footer, and elements in the correct
     * order, and returns the correct filename.
     */
    @Test
    public void testWriter() throws Exception {
        String testUid = "testId";
        ResourceId expectedTempFile = getBaseTempDirectory().resolve(testUid, StandardResolveOptions.RESOLVE_FILE);
        List<String> values = Arrays.asList("sympathetic vulture", "boresome hummingbird");

        // The file is expected to hold the header, then every value in write order, then the footer.
        List<String> expected = new ArrayList<>();
        expected.add(SimpleSink.SimpleWriter.HEADER);
        expected.addAll(values);
        expected.add(SimpleSink.SimpleWriter.FOOTER);

        SimpleSink.SimpleWriter writer = buildWriteOperationWithTempDir(getBaseTempDirectory()).createWriter();
        writer.openUnwindowed(testUid, -1);
        for (String value : values) {
            writer.write(value);
        }
        FileResult result = writer.close();

        // close() must report the temp file named after the uid, and that file must hold the data.
        assertEquals(expectedTempFile, result.getTempFilename());
        assertFileContains(expected, expectedTempFile);
    }

    /**
     * Asserts that a file contains exactly the lines provided, in the same order as expected.
     *
     * <p>The file is decoded as UTF-8 explicitly, like the other readers in this class, rather than
     * with the platform default charset. NOTE(review): this assumes the sink under test writes
     * UTF-8; the data asserted by the current tests is plain ASCII.
     *
     * @param expected the lines the file must contain, in order
     * @param file the local file to read, addressed by its {@code toString()} path
     * @throws Exception if the file cannot be opened or read
     */
    private void assertFileContains(List<String> expected, ResourceId file) throws Exception {
        List<String> actual = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file.toString()), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                actual.add(line);
            }
        }
        assertEquals("contents for " + file, expected, actual);
    }

    /**
     * Writes the given lines to a file, one per line, encoded as UTF-8.
     *
     * @param lines the lines to write, in order
     * @param file the file to create or overwrite
     * @throws Exception if the file cannot be opened, or an {@link IOException} if any write failed
     */
    private void writeFile(List<String> lines, File file) throws Exception {
        try (PrintWriter writer = new PrintWriter(file, StandardCharsets.UTF_8.name())) {
            for (String line : lines) {
                writer.println(line);
            }
            // PrintWriter never throws on write; checkError() flushes and reports whether a write failed.
            if (writer.checkError()) {
                throw new IOException("Failed to write " + file);
            }
        }
    }

    /**
     * Temporary files are removed when the temporary directory differs from the output directory.
     */
    @Test
    public void testRemoveWithTempFilename() throws Exception {
        final int numFiles = 3;
        ResourceId tempDirectory = getBaseTempDirectory();
        testRemoveTemporaryFiles(numFiles, tempDirectory);
    }

    /** Finalizing produces the output files and leaves no temporary files behind. */
    @Test
    public void testFinalize() throws Exception {
        List<File> temporaryFiles = generateTemporaryFilesForFinalize(3);
        SimpleSink.SimpleWriteOperation writeOp = buildWriteOperation();
        runFinalize(writeOp, temporaryFiles);
    }

    /** Running finalize twice on the same write operation passes the same checks both times. */
    @Test
    public void testFinalizeMultipleCalls() throws Exception {
        List<File> temporaryFiles = generateTemporaryFilesForFinalize(3);
        SimpleSink.SimpleWriteOperation writeOp = buildWriteOperation();
        for (int attempt = 0; attempt < 2; attempt++) {
            runFinalize(writeOp, temporaryFiles);
        }
    }

    /**
     * Finalize can be re-run after an earlier finalize has already consumed the temporary files,
     * even when a new file has since appeared in the temp directory under the scratch root.
     */
    @Test
    public void testFinalizeWithIntermediateState() throws Exception {
        SimpleSink.SimpleWriteOperation writeOp = buildWriteOperation();
        List<File> temporaryFiles = generateTemporaryFilesForFinalize(3);
        runFinalize(writeOp, temporaryFiles);

        // Place one file inside the temp directory of the scratch folder, then finalize again.
        String strayFile = tempDirectoryName + "/1";
        tmpFolder.newFolder(tempDirectoryName);
        tmpFolder.newFile(strayFile);

        runFinalize(writeOp, temporaryFiles);
    }

    /**
     * Creates {@code numFiles} empty files whose names come from
     * {@code WriteOperation.buildTemporaryFilename}, using the file index as the name suffix.
     *
     * @return the created files, in index order
     */
    private List<File> generateTemporaryFilesForFinalize(int numFiles) throws Exception {
        List<File> created = new ArrayList<>();
        int index = 0;
        while (index < numFiles) {
            ResourceId tempResource = WriteOperation.buildTemporaryFilename(getBaseTempDirectory(), "" + index);
            // The resource's string form is used as a child path of the scratch root.
            File target = new File(tmpFolder.getRoot(), tempResource.toString());
            target.getParentFile().mkdirs();
            assertTrue(target.createNewFile());
            created.add(target);
            index++;
        }
        return created;
    }

    /**
     * Finalizes {@code writeOp} over the given temporary files, then verifies that each expected
     * output file exists, that every temporary file is gone, and that the temp directory is gone.
     */
    private void runFinalize(SimpleSink.SimpleWriteOperation writeOp, List<File> temporaryFiles) throws Exception {
        final int shardCount = temporaryFiles.size();

        // Wrap every temporary file in a FileResult with an unknown shard number.
        List<FileResult> results = new ArrayList<>();
        for (File temporaryFile : temporaryFiles) {
            ResourceId tempResource = LocalResources.fromFile(temporaryFile, false);
            results.add(new FileResult(tempResource, WriteFiles.UNKNOWN_SHARDNUM, null, null));
        }

        writeOp.finalize(results);

        ResourceId outputDirectory = writeOp.getSink().getBaseOutputDirectoryProvider().get();
        int shard = 0;
        for (File temporaryFile : temporaryFiles) {
            ResourceId outputFilename = writeOp.getSink().getFilenamePolicy()
                    .unwindowedFilename(outputDirectory, new Context(shard, shardCount), "");
            File outputFile = new File(outputFilename.toString());
            assertTrue(outputFile.exists());
            assertFalse(temporaryFile.exists());
            shard++;
        }

        File tempDirectory = new File(writeOp.tempDirectory.get().toString());
        assertFalse(tempDirectory.exists());
        // Asking for the temp directory repeatedly must yield equal values.
        assertEquals(writeOp.tempDirectory.get(), writeOp.tempDirectory.get());
    }

    /**
     * Creates {@code numFiles} temporary files and as many output files, calls
     * {@code removeTemporaryFiles} with an empty set and {@code true}, and verifies that the
     * temporary files are gone while the output files are still present.
     */
    private void testRemoveTemporaryFiles(int numFiles, ResourceId tempDirectory) throws Exception {
        final String prefix = "file";
        SimpleSink sink = new SimpleSink(getBaseOutputDirectory(), prefix, "", "");
        WriteOperation<String> writeOp = new SimpleSink.SimpleWriteOperation(sink, tempDirectory);

        List<File> temporaryFiles = new ArrayList<>();
        List<File> outputFiles = new ArrayList<>();
        for (int i = 0; i < numFiles; i++) {
            // Temporary file for this index.
            ResourceId tempResource = WriteOperation.buildTemporaryFilename(tempDirectory, prefix + i);
            File tempOnDisk = new File(tempResource.toString());
            tempOnDisk.getParentFile().mkdirs();
            assertTrue("not able to create new temp file", tempOnDisk.createNewFile());
            temporaryFiles.add(tempOnDisk);

            // Matching output file for this index.
            ResourceId outputResource = getBaseOutputDirectory().resolve(prefix + i,
                    StandardResolveOptions.RESOLVE_FILE);
            File outputOnDisk = new File(outputResource.toString());
            outputOnDisk.getParentFile().mkdirs();
            assertTrue("not able to create new output file", outputOnDisk.createNewFile());
            outputFiles.add(outputOnDisk);
        }

        writeOp.removeTemporaryFiles(Collections.<ResourceId>emptySet(), true);

        for (int i = 0; i < numFiles; i++) {
            File removed = temporaryFiles.get(i);
            String removedMessage = String.format("temp file %s exists", removed);
            assertThat(removedMessage, removed.exists(), is(false));

            File kept = outputFiles.get(i);
            String keptMessage = String.format("output file %s exists", kept);
            assertThat(keptMessage, kept.exists(), is(true));
        }
    }

    /** Copying puts each input file's contents at the output name generated for its shard. */
    @Test
    public void testCopyToOutputFiles() throws Exception {
        SimpleSink.SimpleWriteOperation writeOp = buildWriteOperation();
        ResourceId outputDirectory = writeOp.getSink().getBaseOutputDirectoryProvider().get();

        String[] sourceNames = {"input-1", "input-2", "input-3"};
        String[] sourceContents = {"1", "2", "3"};
        String[] destinationNames = {"file-00-of-03.test", "file-01-of-03.test", "file-02-of-03.test"};
        final int fileCount = sourceNames.length;

        Map<ResourceId, ResourceId> sourceToDestination = new HashMap<>();
        List<ResourceId> expectedDestinations = new ArrayList<>();

        for (int i = 0; i < fileCount; i++) {
            // Where the copy is expected to land.
            expectedDestinations.add(
                    getBaseOutputDirectory().resolve(destinationNames[i], StandardResolveOptions.RESOLVE_FILE));

            // Create the source file with its single line of content.
            File source = tmpFolder.newFile(sourceNames[i]);
            writeFile(Collections.singletonList(sourceContents[i]), source);

            // Map the source to the name the sink's filename policy assigns to shard i.
            ResourceId sourceResource = LocalResources.fromFile(source, false);
            ResourceId destination = writeOp.getSink().getFilenamePolicy()
                    .unwindowedFilename(outputDirectory, new Context(i, fileCount), "");
            sourceToDestination.put(sourceResource, destination);
        }

        writeOp.copyToOutputFiles(sourceToDestination);

        // Every expected destination must now hold the matching source content.
        for (int i = 0; i < expectedDestinations.size(); i++) {
            List<String> expectedLines = Collections.singletonList(sourceContents[i]);
            assertFileContains(expectedLines, expectedDestinations.get(i));
        }
    }

    /**
     * Asks {@code policy} for the unwindowed filename of every shard index from 0 up to
     * {@code numFiles - 1}, passing an empty extension.
     *
     * @return the generated filenames in shard order; empty when {@code numFiles} is not positive
     */
    public List<ResourceId> generateDestinationFilenames(ResourceId outputDirectory, FilenamePolicy policy,
            int numFiles) {
        List<ResourceId> destinations = new ArrayList<>();
        for (int shard = 0; shard < numFiles; shard++) {
            Context shardContext = new Context(shard, numFiles);
            ResourceId destination = policy.unwindowedFilename(outputDirectory, shardContext, "");
            destinations.add(destination);
        }
        return destinations;
    }

    /**
     * Output filenames are generated correctly when an extension is supplied, for three, one and
     * zero files.
     */
    @Test
    public void testGenerateOutputFilenames() {
        ResourceId root = getBaseOutputDirectory();
        SimpleSink sink = new SimpleSink(root, "file", ".SSSSS.of.NNNNN", ".test");
        FilenamePolicy policy = sink.getFilenamePolicy();

        // Three files.
        List<ResourceId> expectedForThree = Arrays.asList(
                root.resolve("file.00000.of.00003.test", StandardResolveOptions.RESOLVE_FILE),
                root.resolve("file.00001.of.00003.test", StandardResolveOptions.RESOLVE_FILE),
                root.resolve("file.00002.of.00003.test", StandardResolveOptions.RESOLVE_FILE));
        List<ResourceId> actualForThree = generateDestinationFilenames(root, policy, 3);
        assertEquals(expectedForThree, actualForThree);

        // A single file.
        List<ResourceId> expectedForOne = Collections
                .singletonList(root.resolve("file.00000.of.00001.test", StandardResolveOptions.RESOLVE_FILE));
        List<ResourceId> actualForOne = generateDestinationFilenames(root, policy, 1);
        assertEquals(expectedForOne, actualForOne);

        // No files at all.
        List<ResourceId> expectedForNone = new ArrayList<>();
        List<ResourceId> actualForNone = generateDestinationFilenames(root, policy, 0);
        assertEquals(expectedForNone, actualForNone);
    }

    /**
     * Reject non-distinct output filenames.
     *
     * <p>The {@code -NN} template used here carries no per-shard index, so the three results are
     * expected to collapse onto a single output name, which {@code buildOutputFilenames} must
     * refuse with an {@link IllegalStateException}.
     */
    @Test
    public void testCollidingOutputFilenames() throws IOException {
        ResourceId root = getBaseOutputDirectory();
        SimpleSink sink = new SimpleSink(root, "file", "-NN", "test");
        SimpleSink.SimpleWriteOperation writeOp = new SimpleSink.SimpleWriteOperation(sink);

        ResourceId temp1 = root.resolve("temp1", StandardResolveOptions.RESOLVE_FILE);
        ResourceId temp2 = root.resolve("temp2", StandardResolveOptions.RESOLVE_FILE);
        ResourceId temp3 = root.resolve("temp3", StandardResolveOptions.RESOLVE_FILE);
        Iterable<FileResult> results = Lists.newArrayList(new FileResult(temp1, 1, null, null),
                new FileResult(temp2, 1, null, null), new FileResult(temp3, 1, null, null));

        // Only the call under test sits in the try block, so an unrelated failure is not misread.
        try {
            writeOp.buildOutputFilenames(results);
            fail("Should have failed.");
        } catch (IllegalStateException exn) {
            assertEquals("Only generated 1 distinct file names for 3 files.", exn.getMessage());
        }
    }

    /**
     * Output filenames are generated correctly when the extension is empty, for three, one and
     * zero files.
     */
    @Test
    public void testGenerateOutputFilenamesWithoutExtension() {
        ResourceId root = getBaseOutputDirectory();
        SimpleSink sink = new SimpleSink(root, "file", "-SSSSS-of-NNNNN", "");
        FilenamePolicy policy = sink.getFilenamePolicy();

        // Three files.
        List<ResourceId> expectedForThree = Arrays.asList(
                root.resolve("file-00000-of-00003", StandardResolveOptions.RESOLVE_FILE),
                root.resolve("file-00001-of-00003", StandardResolveOptions.RESOLVE_FILE),
                root.resolve("file-00002-of-00003", StandardResolveOptions.RESOLVE_FILE));
        List<ResourceId> actualForThree = generateDestinationFilenames(root, policy, 3);
        assertEquals(expectedForThree, actualForThree);

        // A single file.
        List<ResourceId> expectedForOne = Collections
                .singletonList(root.resolve("file-00000-of-00001", StandardResolveOptions.RESOLVE_FILE));
        List<ResourceId> actualForOne = generateDestinationFilenames(root, policy, 1);
        assertEquals(expectedForOne, actualForOne);

        // No files at all.
        List<ResourceId> expectedForNone = new ArrayList<>();
        List<ResourceId> actualForNone = generateDestinationFilenames(root, policy, 0);
        assertEquals(expectedForNone, actualForNone);
    }

    /** Data written through {@link CompressionType#BZIP2} reads back correctly as BZip2. */
    @Test
    public void testCompressionTypeBZIP2() throws FileNotFoundException, IOException {
        final File compressed = writeValuesWithWritableByteChannelFactory(CompressionType.BZIP2, "abc", "123");
        // Decode with the Apache Commons Compress BZip2 stream and compare line by line.
        final FileInputStream rawBytes = new FileInputStream(compressed);
        final BZip2CompressorInputStream decompressed = new BZip2CompressorInputStream(rawBytes);
        final BufferedReader lines = new BufferedReader(
                new InputStreamReader(decompressed, StandardCharsets.UTF_8.name()));
        assertReadValues(lines, "abc", "123");
    }

    /** Data written through {@link CompressionType#GZIP} reads back correctly as gzip. */
    @Test
    public void testCompressionTypeGZIP() throws FileNotFoundException, IOException {
        final File compressed = writeValuesWithWritableByteChannelFactory(CompressionType.GZIP, "abc", "123");
        // Decode with the JDK's GZIPInputStream and compare line by line.
        final FileInputStream rawBytes = new FileInputStream(compressed);
        final GZIPInputStream decompressed = new GZIPInputStream(rawBytes);
        final BufferedReader lines = new BufferedReader(
                new InputStreamReader(decompressed, StandardCharsets.UTF_8.name()));
        assertReadValues(lines, "abc", "123");
    }

    /** Data written through {@link CompressionType#DEFLATE} reads back correctly as deflate. */
    @Test
    public void testCompressionTypeDEFLATE() throws FileNotFoundException, IOException {
        final File compressed = writeValuesWithWritableByteChannelFactory(CompressionType.DEFLATE, "abc", "123");
        // Decode with the Apache Commons Compress deflate stream and compare line by line.
        final FileInputStream rawBytes = new FileInputStream(compressed);
        final DeflateCompressorInputStream decompressed = new DeflateCompressorInputStream(rawBytes);
        final BufferedReader lines = new BufferedReader(
                new InputStreamReader(decompressed, StandardCharsets.UTF_8.name()));
        assertReadValues(lines, "abc", "123");
    }

    /** Data written through {@link CompressionType#UNCOMPRESSED} reads back correctly as plain text. */
    @Test
    public void testCompressionTypeUNCOMPRESSED() throws FileNotFoundException, IOException {
        final File plain = writeValuesWithWritableByteChannelFactory(CompressionType.UNCOMPRESSED, "abc", "123");
        // No decompression layer: read the file bytes directly and compare line by line.
        final FileInputStream rawBytes = new FileInputStream(plain);
        final BufferedReader lines = new BufferedReader(
                new InputStreamReader(rawBytes, StandardCharsets.UTF_8.name()));
        assertReadValues(lines, "abc", "123");
    }

    /**
     * Reads one line from {@code br} per expected value and asserts that each matches, closing
     * {@code br} when done. Any lines beyond the expected values are not inspected.
     */
    private void assertReadValues(final BufferedReader br, String... values) throws IOException {
        try (final BufferedReader reader = br) {
            for (int i = 0; i < values.length; i++) {
                final String expectedLine = values[i];
                final String message = String.format("Line should read '%s'", expectedLine);
                assertEquals(message, expectedLine, reader.readLine());
            }
        }
    }

    /**
     * Writes each value, followed by a newline and encoded as UTF-8, to a new file in the scratch
     * folder through a channel obtained from {@code factory}.
     *
     * <p>The file is always named {@code test.gz}, whichever factory is supplied.
     *
     * @param factory wraps the raw file channel, e.g. with a compression layer
     * @param values the lines to write, in order
     * @return the file that was written
     * @throws IOException if the file cannot be created or any write or close fails
     */
    private File writeValuesWithWritableByteChannelFactory(final WritableByteChannelFactory factory,
            String... values) throws IOException {
        final File file = tmpFolder.newFile("test.gz");
        // try-with-resources closes the wrapping channel first, then the file stream, even when
        // factory.create or a write throws part-way through.
        try (FileOutputStream fileStream = new FileOutputStream(file);
                WritableByteChannel channel = factory.create(Channels.newChannel(fileStream))) {
            for (String value : values) {
                channel.write(ByteBuffer.wrap((value + "\n").getBytes(StandardCharsets.UTF_8)));
            }
        }
        return file;
    }

    /**
     * {@link Writer} writes through the {@link WritableByteChannel} provided by {@link
     * DrunkWritableByteChannelFactory}: every line (header, elements, footer) is expected to
     * appear twice in the temp file.
     */
    @Test
    public void testFileBasedWriterWithWritableByteChannelFactory() throws Exception {
        final String testUid = "testId";
        ResourceId root = getBaseOutputDirectory();
        SimpleSink sink = new SimpleSink(root, "file", "-SS-of-NN", "txt", new DrunkWritableByteChannelFactory());
        WriteOperation<String> writeOp = sink.createWriteOperation();
        final Writer<String> writer = writeOp.createWriter();
        final ResourceId expectedFile = writeOp.tempDirectory.get().resolve(testUid,
                StandardResolveOptions.RESOLVE_FILE);

        // Each logical line is expected twice in a row.
        final List<String> expected = new ArrayList<>();
        for (String line : Arrays.asList("header", "a", "b", "footer")) {
            expected.add(line);
            expected.add(line);
        }

        writer.openUnwindowed(testUid, -1);
        for (String element : Arrays.asList("a", "b")) {
            writer.write(element);
        }
        final FileResult result = writer.close();

        assertEquals(expectedFile, result.getTempFilename());
        assertFileContains(expected, expectedFile);
    }

    /**
     * Builds the default {@link SimpleSink}: base output directory, prefix {@code file}, shard
     * template {@code -SS-of-NN} and extension {@code .test}.
     */
    private SimpleSink buildSink() {
        ResourceId outputDirectory = getBaseOutputDirectory();
        return new SimpleSink(outputDirectory, "file", "-SS-of-NN", ".test");
    }

    /**
     * Builds a write operation over the default sink that uses the given temporary directory.
     */
    private SimpleSink.SimpleWriteOperation buildWriteOperationWithTempDir(ResourceId tempDirectory) {
        return new SimpleSink.SimpleWriteOperation(buildSink(), tempDirectory);
    }

    /** Builds the write operation that the default sink creates for itself. */
    private SimpleSink.SimpleWriteOperation buildWriteOperation() {
        SimpleSink defaultSink = buildSink();
        return defaultSink.createWriteOperation();
    }
}