org.apache.flink.api.common.io.DelimitedInputFormatTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.flink.api.common.io.DelimitedInputFormatTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.common.io;

import org.apache.commons.lang3.StringUtils;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.Path;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

public class DelimitedInputFormatTest {

    private DelimitedInputFormat<String> format;

    // --------------------------------------------------------------------------------------------

    /**
     * Creates a fresh {@link MyTextInputFormat} before every test and assigns it a placeholder
     * file path. Tests that read data open their own temp-file splits instead.
     */
    @Before
    public void setup() {
        this.format = new MyTextInputFormat();

        final Path placeholder = new Path("file:///some/file/that/will/not/be/read");
        this.format.setFilePath(placeholder);
    }

    /**
     * Closes the shared format after every test, provided one was created.
     */
    @After
    public void shutdown() throws Exception {
        if (this.format == null) {
            return;
        }
        this.format.close();
    }

    // --------------------------------------------------------------------------------------------
    // --------------------------------------------------------------------------------------------
    /**
     * Verifies that a delimiter supplied through the {@code delimited-format.delimiter}
     * configuration key is reported back by the format after {@code configure}, both for a
     * single-character and for a multi-character delimiter.
     */
    @Test
    public void testConfigure() {
        final String delimiterKey = "delimited-format.delimiter";
        final Configuration config = new Configuration();

        // the same Configuration object is reused and re-applied for each delimiter
        for (String expected : new String[] { "\n", "&-&" }) {
            config.setString(delimiterKey, expected);
            format.configure(config);

            final String actual = new String(format.getDelimiter(), format.getCharset());
            assertEquals(expected, actual);
        }
    }

    /**
     * Verifies that the delimiter, number of line samples, line length limit and buffer size
     * survive a Java serialization round trip of the input format.
     *
     * @throws Exception if serialization or deserialization fails
     */
    @Test
    public void testSerialization() throws Exception {
        final byte[] delimiter = new byte[] { 1, 2, 3, 4 };
        final int numLineSamples = 7;
        final int lineLengthLimit = 12345;
        final int bufferSize = 178;

        // use a dedicated instance so the shared field is not shadowed or reconfigured
        final DelimitedInputFormat<String> original = new MyTextInputFormat();
        original.setDelimiter(delimiter);
        original.setNumLineSamples(numLineSamples);
        original.setLineLengthLimit(lineLengthLimit);
        original.setBufferSize(bufferSize);

        final ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
        // closing the ObjectOutputStream flushes all buffered data into baos
        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
            oos.writeObject(original);
        }

        final DelimitedInputFormat<String> deserialized;
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
            // safe: the stream contains exactly the MyTextInputFormat written above
            @SuppressWarnings("unchecked")
            DelimitedInputFormat<String> read = (DelimitedInputFormat<String>) ois.readObject();
            deserialized = read;
        }

        assertEquals(numLineSamples, deserialized.getNumLineSamples());
        assertEquals(lineLengthLimit, deserialized.getLineLengthLimit());
        assertEquals(bufferSize, deserialized.getBufferSize());
        assertArrayEquals(delimiter, deserialized.getDelimiter());
    }

    /**
     * Opens a split with a small read buffer and checks the split start, the split length and
     * the buffer size reported by the format afterwards.
     */
    @Test
    public void testOpen() throws IOException {
        final String content = "my mocked line 1\nmy mocked line 2\n";
        final FileInputSplit split = createTempFile(content);

        final int bufferSize = 5;
        format.setBufferSize(bufferSize);
        format.open(split);

        assertEquals(0, format.splitStart);
        // NOTE(review): the expected length is the content length minus one buffer; presumably
        // open() already consumes one buffer's worth of bytes - confirm in DelimitedInputFormat
        assertEquals(content.length() - bufferSize, format.splitLength);
        assertEquals(bufferSize, format.getBufferSize());
    }

    /**
     * Reads two newline-separated records from a file whose last record is not terminated by
     * a delimiter.
     */
    @Test
    public void testReadWithoutTrailingDelimiter() throws IOException {
        final String content = "my key|my val$$$my key2\n$$ctd.$$|my value2";
        final FileInputSplit split = createTempFile(content);

        // no delimiter is configured: the test relies on the format's default ('\n')
        format.configure(new Configuration());
        format.open(split);

        // fetch both records first, then validate them
        final String firstRecord = format.nextRecord(null);
        final String secondRecord = format.nextRecord(null);

        assertNotNull(firstRecord);
        assertNotNull(secondRecord);

        assertEquals("my key|my val$$$my key2", firstRecord);
        assertEquals("$$ctd.$$|my value2", secondRecord);

        // nothing may follow the second record
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
    }

    /**
     * Reads two newline-separated records from a file whose last record is followed by a
     * trailing delimiter; no additional record may be produced for the trailing delimiter.
     */
    @Test
    public void testReadWithTrailingDelimiter() throws IOException {
        final String content = "my key|my val$$$my key2\n$$ctd.$$|my value2\n";
        final FileInputSplit split = createTempFile(content);

        // no delimiter is configured: the test relies on the format's default ('\n')
        format.configure(new Configuration());
        format.open(split);

        final String[] expected = { "my key|my val$$$my key2", "$$ctd.$$|my value2" };
        // both records are read up front, before any assertion runs
        final String[] actual = { format.nextRecord(null), format.nextRecord(null) };

        for (String record : actual) {
            assertNotNull(record);
        }
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], actual[i]);
        }

        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
    }

    /**
     * Reads records separated by the custom delimiter {@code $$$}. The newline and the
     * shorter {@code $$} runs inside the second record must be kept as record content.
     */
    @Test
    public void testReadCustomDelimiter() throws IOException {
        final String content = "my key|my val$$$my key2\n$$ctd.$$|my value2";
        final FileInputSplit split = createTempFile(content);

        final Configuration config = new Configuration();

        format.setDelimiter("$$$");
        format.configure(config);
        format.open(split);

        final String[] expectedRecords = { "my key|my val", "my key2\n$$ctd.$$|my value2" };
        for (String expected : expectedRecords) {
            final String actual = format.nextRecord(null);
            assertNotNull(actual);
            assertEquals(expected, actual);
        }

        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
    }

    /**
     * Reads records separated by the multi-character delimiter {@code 1123} from input that
     * also contains partial delimiter matches ({@code 112} and {@code 11123}), which must not
     * split a record prematurely.
     */
    @Test
    public void testMultiCharDelimiter() throws IOException {
        final String content = "www112xx1123yyy11123zzzzz1123";
        final FileInputSplit split = createTempFile(content);

        final Configuration config = new Configuration();

        format.setDelimiter("1123");
        format.configure(config);
        format.open(split);

        final String[] expectedRecords = { "www112xx", "yyy1", "zzzzz" };
        for (String expected : expectedRecords) {
            final String actual = format.nextRecord(null);
            assertNotNull(actual);
            assertEquals(expected, actual);
        }

        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
    }

    /**
     * Reads records separated by a Unicode delimiter from files written in several charsets
     * and checks that every record is returned unchanged for each charset.
     *
     * <p>A separate format is created per charset. Each one is closed in a {@code finally}
     * block because the {@code @After} hook only closes the shared {@code format} field.
     */
    @Test
    public void testReadCustomDelimiterWithCharset() throws IOException {
        // Unicode row fragments
        String[] records = new String[] { "\u020e\u021f\u05c0\u020b\u020f", "Apache", "\nFlink", "\u0000",
                "\u05c0" };

        // Unicode delimiter
        String delimiter = "\u05c0\u05c0";

        String fileContent = StringUtils.join(records, delimiter);

        for (final String charset : new String[] { "UTF-8", "UTF-16BE", "UTF-16LE" }) {
            // use charset when instantiating the record String
            final DelimitedInputFormat<String> charsetFormat = new DelimitedInputFormat<String>() {
                @Override
                public String readRecord(String reuse, byte[] bytes, int offset, int numBytes) throws IOException {
                    return new String(bytes, offset, numBytes, charset);
                }
            };
            charsetFormat.setFilePath("file:///some/file/that/will/not/be/read");

            final FileInputSplit split = createTempFile(fileContent, charset);

            charsetFormat.setDelimiter(delimiter);
            // use the same encoding to parse the file as used to read the file;
            // the delimiter is reinterpreted when the charset is set
            charsetFormat.setCharset(charset);
            charsetFormat.configure(new Configuration());

            try {
                charsetFormat.open(split);

                for (String record : records) {
                    String value = charsetFormat.nextRecord(null);
                    assertEquals(record, value);
                }

                assertNull(charsetFormat.nextRecord(null));
                assertTrue(charsetFormat.reachedEnd());
            } finally {
                // release the file handle opened for this charset
                charsetFormat.close();
            }
        }
    }

    /**
     * Tests that the records are read correctly when the split boundary is in the middle of a record.
     *
     * <p>The file is cut into two adjacent splits at half its length. The first split must
     * return the record that straddles the boundary, and the second split must return only
     * the record that starts after it.
     */
    @Test
    public void testReadOverSplitBoundariesUnaligned() throws IOException {
        final String myString = "value1\nvalue2\nvalue3";
        final FileInputSplit split = createTempFile(myString);

        final long firstSplitLength = split.getLength() / 2;
        FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, firstSplitLength,
                split.getHostnames());
        // the second split covers only the remainder of the file, so it does not extend past EOF
        FileInputSplit split2 = new FileInputSplit(1, split.getPath(), firstSplitLength,
                split.getLength() - firstSplitLength, split.getHostnames());

        final Configuration parameters = new Configuration();

        format.configure(parameters);
        format.open(split1);

        assertEquals("value1", format.nextRecord(null));
        assertEquals("value2", format.nextRecord(null));
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());

        format.close();
        format.open(split2);

        assertEquals("value3", format.nextRecord(null));
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());

        format.close();
    }

    /**
     * Tests that the correct number of records is read when the split boundary is exactly at a record
     * boundary.
     *
     * <p>The file holds four 8-byte records and is cut into two adjacent splits of equal
     * length; the read buffer is twice the length of one split.
     */
    @Test
    public void testReadWithBufferSizeIsMultiple() throws IOException {
        final String myString = "aaaaaaa\nbbbbbbb\nccccccc\nddddddd\n";
        final FileInputSplit split = createTempFile(myString);

        final long firstSplitLength = split.getLength() / 2;
        FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, firstSplitLength,
                split.getHostnames());
        // the second split covers only the remainder of the file, so it does not extend past EOF
        FileInputSplit split2 = new FileInputSplit(1, split.getPath(), firstSplitLength,
                split.getLength() - firstSplitLength, split.getHostnames());

        final Configuration parameters = new Configuration();

        format.setBufferSize(2 * ((int) split1.getLength()));
        format.configure(parameters);

        String next;
        int count = 0;

        // read split 1
        format.open(split1);
        while ((next = format.nextRecord(null)) != null) {
            assertEquals(7, next.length());
            count++;
        }
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
        format.close();

        // this one must have read one too many, because the next split will skip the trailing remainder
        // which happens to be one full record
        assertEquals(3, count);

        // read split 2
        format.open(split2);
        while ((next = format.nextRecord(null)) != null) {
            assertEquals(7, next.length());
            count++;
        }
        // same end-of-split checks as for split 1
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
        format.close();

        assertEquals(4, count);
    }

    /**
     * Reads a file whose total length equals the configured buffer size and checks that all
     * four 7-character records are returned.
     */
    @Test
    public void testReadExactlyBufferSize() throws IOException {
        final String content = "aaaaaaa\nbbbbbbb\nccccccc\nddddddd\n";

        final FileInputSplit split = createTempFile(content);
        final Configuration config = new Configuration();

        // the read buffer is exactly as large as the file
        format.setBufferSize((int) split.getLength());
        format.configure(config);
        format.open(split);

        int recordCount = 0;
        for (String record = format.nextRecord(null); record != null; record = format.nextRecord(null)) {
            assertEquals(7, record.length());
            recordCount++;
        }
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());

        format.close();

        assertEquals(4, recordCount);
    }

    /**
     * Reads records that are each longer than the 8-byte read buffer from two adjacent splits
     * and checks that, taken together, the splits return every record exactly once and in
     * order.
     */
    @Test
    public void testReadRecordsLargerThanBuffer() throws IOException {
        final String myString = "aaaaaaaaaaaaaaaaaaaaa\n" + "bbbbbbbbbbbbbbbbbbbbbbbbb\n" + "ccccccccccccccccccc\n"
                + "ddddddddddddddddddddddddddddddddddd\n";

        final FileInputSplit split = createTempFile(myString);
        final long firstSplitLength = split.getLength() / 2;
        FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, firstSplitLength,
                split.getHostnames());
        // the second split covers only the remainder of the file, so it does not extend past EOF
        FileInputSplit split2 = new FileInputSplit(1, split.getPath(), firstSplitLength,
                split.getLength() - firstSplitLength, split.getHostnames());

        final Configuration parameters = new Configuration();

        // much smaller than any record, so every record spans several buffer refills
        format.setBufferSize(8);
        format.configure(parameters);

        String next;
        List<String> result = new ArrayList<>();

        format.open(split1);
        while ((next = format.nextRecord(null)) != null) {
            result.add(next);
        }
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
        format.close();

        format.open(split2);
        while ((next = format.nextRecord(null)) != null) {
            result.add(next);
        }
        assertNull(format.nextRecord(null));
        assertTrue(format.reachedEnd());
        format.close();

        assertEquals(4, result.size());
        assertEquals(Arrays.asList(myString.split("\n")), result);
    }

    /**
     * Writes {@code contents} to a new temporary file, encoded as UTF-8, and returns a split
     * that spans the whole file.
     *
     * <p>The encoding is fixed so the bytes on disk do not depend on the platform default
     * charset. NOTE(review): UTF-8 is assumed to match {@code ConfigConstants.DEFAULT_CHARSET},
     * which {@code MyTextInputFormat} uses for decoding - confirm.
     *
     * @param contents the text to write to the file
     * @return a split with index 0, starting at offset 0, whose length is the file length
     * @throws IOException if the file cannot be created or written
     */
    static FileInputSplit createTempFile(String contents) throws IOException {
        File tempFile = File.createTempFile("test_contents", "tmp");
        tempFile.deleteOnExit();

        // the file stream is its own resource so it is closed even if creating the writer fails
        try (FileOutputStream fileOut = new FileOutputStream(tempFile);
                Writer out = new OutputStreamWriter(fileOut, "UTF-8")) {
            out.write(contents);
        }

        return new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(),
                new String[] { "localhost" });
    }

    /**
     * Writes {@code contents} to a new temporary file using the given charset and returns a
     * split that spans the whole file.
     *
     * @param contents the text to write to the file
     * @param charset the name of the charset used to encode {@code contents}
     * @return a split with index 0, starting at offset 0, whose length is the file length
     * @throws IOException if the file cannot be created or written, or if the charset name
     *         is not supported
     */
    static FileInputSplit createTempFile(String contents, String charset) throws IOException {
        File tempFile = File.createTempFile("test_contents", "tmp");
        tempFile.deleteOnExit();

        // the file stream is its own resource so it is closed even if the charset name is
        // rejected while constructing the writer
        try (FileOutputStream fileOut = new FileOutputStream(tempFile);
                Writer out = new OutputStreamWriter(fileOut, charset)) {
            out.write(contents);
        }

        return new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(),
                new String[] { "localhost" });
    }

    /**
     * Minimal {@link DelimitedInputFormat} used by these tests: every record is returned as a
     * {@code String} decoded with {@link ConfigConstants#DEFAULT_CHARSET}.
     */
    protected static final class MyTextInputFormat extends DelimitedInputFormat<String> {
        private static final long serialVersionUID = 1L;

        /**
         * Decodes {@code numBytes} bytes of {@code bytes}, starting at {@code offset}, into a
         * new string. The {@code reuse} argument is ignored.
         */
        @Override
        public String readRecord(String reuse, byte[] bytes, int offset, int numBytes) {
            final String record = new String(bytes, offset, numBytes, ConfigConstants.DEFAULT_CHARSET);
            return record;
        }
    }
}