Java tutorial: verifying the split seams of a splittable GZip codec for Apache Hadoop
/**
 * Making GZip Splittable for Apache Hadoop
 * Copyright (C) 2011-2014 Niels Basjes
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nl.basjes.hadoop.io.compress;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;

/**
 * Unit tests to see if splitting codecs make their splits accurately.
 * This verifies that the seams between the splits are 100% accurate by
 * comparing all splits with a non-split read of the same input.
 */
public class TestSplittableCodecSeams {

  private static final Log LOG =
      LogFactory.getLog(TestSplittableCodecSeams.class);

  private static final int BUFFER_SIZE = 4096;

  /**
   * Test with a series of files with several fixed lengths of trailing
   * gibberish, i.e. all lines in a given test file are of equal length.
   */
  @Test
  public void testSplittableGzipCodecSeamsFixedLineLengths() {
    for (int length = 1; length <= 15; length += 3) {
      try {
        int splitSize = 10000;
        validateSplitSeamsWithSyntheticFile(SplittableGzipCodec.class,
            100000, length, 0, splitSize, 2 * splitSize, 1);
      } catch (final IOException e) {
        fail("Exception was thrown: " + e.toString());
      }
    }
  }

  /**
   * Test with a file with varying amounts of trailing gibberish,
   * i.e. there is great variety in the line lengths.
   */
  @Test
  public void testSplittableGzipCodecSeamsRandomLineLengths() {
    try {
      int splitSize = 25000;
      validateSplitSeamsWithSyntheticFile(SplittableGzipCodec.class,
          10000, 500, 250, splitSize, 2 * splitSize);
    } catch (final IOException e) {
      fail("Exception was thrown: " + e.toString());
    }
  }

  /**
   * Test with a file and several varying split sizes.
   * This was created to push the system into a bad split size for the
   * last split.
   * NOTE: At 5011 there used to be a nasty edge case.
   */
  @Test(expected = IllegalArgumentException.class)
  public void testSplittableGzipCodecSeamsVariousSplitSizes()
      throws IOException {
    for (int splitSize = 5010; splitSize <= 5020; splitSize++) {
      validateSplitSeamsWithSyntheticFile(SplittableGzipCodec.class,
          1000, 500, 250, splitSize, 6000);
    }
  }
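  // Note on the expected IllegalArgumentException above: the loop is
  // intended to drive the size of the final split below what the codec
  // accepts. Presumably (an interpretation, not stated in the original
  // code) SplittableGzipCodec rejects splits smaller than the configured
  // io.file.buffer.size, which is also what the "bad split size" test
  // below exercises directly with a split of 2000 bytes.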
  /**
   * Test with a file with 1 character lines.
   */
  @Test
  public void testSplittableGzipCodecSeamsSingleCharLines()
      throws IOException {
    int splitSize = 4096;
    validateSplitSeamsWithSyntheticFile(SplittableGzipCodec.class,
        10000000, 1, 0, splitSize, 2 * splitSize, 100);
  }

  /**
   * Test with a file with a bad split size.
   */
  @Test(expected = IllegalArgumentException.class)
  public void testSplittableGzipCodecSeamsBadSplitSize()
      throws IOException {
    int splitSize = 2000;
    validateSplitSeamsWithSyntheticFile(SplittableGzipCodec.class,
        1000, 500, 250, splitSize, 4096);
  }

  // ------------------------------------------

  private void validateSplitSeamsWithSyntheticFile(
      final Class<? extends SplittableCompressionCodec> codecClass,
      final long records,
      final int recordLength,
      final int recordLengthJitter,
      final long splitSize,
      final long lastSplitSizeLimit) throws IOException {
    validateSplitSeamsWithSyntheticFile(codecClass, records, recordLength,
        recordLengthJitter, splitSize, lastSplitSizeLimit, 1);
    validateSplitSeamsWithSyntheticFile(codecClass, records, recordLength,
        recordLengthJitter, splitSize, lastSplitSizeLimit, 1000);
  }

  /**
   * This creates a synthetic file and then uses it to run the
   * split seam check.
   */
  private void validateSplitSeamsWithSyntheticFile(
      final Class<? extends SplittableCompressionCodec> codecClass,
      final long records,
      final int recordLength,
      final int recordLengthJitter,
      final long splitSize,
      final long lastSplitSizeLimit,
      final int randomizeEveryNChars) throws IOException {
    final Configuration conf = new Configuration();

    if (recordLength + recordLengthJitter > splitSize) {
      fail("Test definition error: Make the splits bigger than the records.");
    }
    if (splitSize > lastSplitSizeLimit) {
      fail("Test definition error: The last split must be the same size"
          + " as or larger than the other splits.");
    }

    final FileSystem fs = FileSystem.getLocal(conf);
    final Path filename = writeSplitTestFile(conf, codecClass, records,
        recordLength, recordLengthJitter, randomizeEveryNChars);

    LOG.info("Input is SYNTHETIC: "
        + "records=" + records + ", "
        + "recordLength=" + recordLength
        + (recordLengthJitter == 0
            ? "" : "+random[0;" + recordLengthJitter + "]."));

    validateSplitSeams(conf, fs, filename, codecClass,
        splitSize, records, lastSplitSizeLimit);

    fs.delete(filename, true);
  }

  // ------------------------------------------

  /**
   * This test checks if reading the file in splits results in the
   * same lines as reading the file as a single 'split'.
   */
  private void validateSplitSeams(final Configuration conf,
      final FileSystem fs,
      final Path filename,
      final Class<? extends SplittableCompressionCodec> codecClass,
      final long splitSize,
      final long recordsInFile,
      final long lastSplitSizeLimit) throws IOException {

    // To make the test predictable
    conf.setInt("io.file.buffer.size", BUFFER_SIZE);

    final FileStatus infile = fs.getFileStatus(filename);
    final long inputLength = infile.getLen();

    if (inputLength > Integer.MAX_VALUE) {
      fail("Bad test file length.");
    }

    LOG.info("Input is " + inputLength + " bytes. "
        + "Making a split every " + splitSize + " bytes.");

    if (inputLength <= splitSize) {
      fail("The compressed test file is too small to do any useful testing.");
    }

    final SplittableCompressionCodec codec =
        ReflectionUtils.newInstance(codecClass, conf);

    /*
     * The validation is done as follows:
     * 1) We open the entire file as a single split as the reference.
     * 2) We create a sequence of splits and validate each line with the
     *    reference split.
     * The lines from these two must match 100%.
     */
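    // Both the reference stream and the per-split streams below are opened
    // in READ_MODE.BYBLOCK, which lets the codec move the requested split
    // boundaries to positions it can actually honour; the effective range
    // is then reported through getAdjustedStart() and getAdjustedEnd().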
    final Text refLine = new Text();
    final Decompressor refDcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \""
        + codec.getClass().getName() + "\"", refDcmp);

    final SplitCompressionInputStream refStream =
        codec.createInputStream(fs.open(infile.getPath()), refDcmp,
            0, inputLength, SplittableCompressionCodec.READ_MODE.BYBLOCK);
    final LineReader refReader = new LineReader(refStream, conf);

    final Text line = new Text();
    final Decompressor dcmp = CodecPool.getDecompressor(codec);
    assertNotNull("Unable to load the decompressor for codec \""
        + codec.getClass().getName() + "\"", dcmp);

    try {
      long start = 0;
      long end = splitSize;

      int splitCount = 0;
      long refLineNumber = 0;
      long splitLineNumber;

      while (end <= inputLength) {
        splitLineNumber = 0;
        ++splitCount;
        LOG.debug("-------------------------------------------------------");
        dcmp.reset(); // Reset the Decompressor for reuse with the new stream.

        final SplitCompressionInputStream splitStream =
            codec.createInputStream(fs.open(infile.getPath()), dcmp,
                start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

        final long adjustedStart = splitStream.getAdjustedStart();
        final long adjustedEnd = splitStream.getAdjustedEnd();

        if (LOG.isDebugEnabled()) {
          LOG.debug("Doing split " + splitCount + " on range"
              + " (" + start + "-" + end + ")"
              + " adjusted to (" + adjustedStart + "-" + adjustedEnd + ")");
        }

        final LineReader lreader = new LineReader(splitStream, conf);

        if (start != 0) {
          // Not the first split, so we discard the first (incomplete) line.
          int readChars = lreader.readLine(line);
          if (LOG.isTraceEnabled()) {
            LOG.trace("DISCARD LINE " + 0 + " in split " + splitCount
                + " pos=" + splitStream.getPos()
                + " length=" + readChars + ": \"" + line + "\"");
          }
        }

        // Now read until the end of this split.
        while (nextKeyValue(splitStream, lreader, adjustedEnd, line)) {
          ++splitLineNumber;

          // Get the reference value.
          if (!nextKeyValue(refStream, refReader, inputLength, refLine)) {
            LOG.error(String.format("S>%05d: %s", splitLineNumber, line));
            fail("Split goes beyond the end of the reference"
                + " with line number " + splitLineNumber);
          }
          ++refLineNumber;

          if (LOG.isDebugEnabled() && refLineNumber > (recordsInFile - 10)) {
            LOG.debug(String.format("R<%05d: %s", refLineNumber, refLine));
            LOG.debug(String.format("S>%05d: %s", splitLineNumber, line));
          }

          assertEquals("Line must be same in reference and in split"
              + " at line " + refLineNumber, refLine, line);

          if (LOG.isTraceEnabled()) {
            LOG.trace("LINE " + splitLineNumber + " in split " + splitCount
                + " (" + refLineNumber + ") pos=" + splitStream.getPos()
                + " length=" + line.getLength() + ": \"" + line + "\"");
          }
        }

        // We just read through the entire split.
        LOG.debug("Checked split " + splitCount
            + " (" + adjustedStart + "-" + adjustedEnd + ") "
            + "containing " + splitLineNumber + " lines.");

        if (end == inputLength) {
          LOG.info("====================> Finished the last split"
              + " <====================");
          break; // We've reached the end of the last split.
        }

        // Determine start and end for the next split.
        start = end;
        if ((end + lastSplitSizeLimit) > inputLength) {
          end = inputLength;
          LOG.info("====================> Starting the last split ("
              + start + " - " + end + ") <====================");
        } else {
          end += splitSize;
          LOG.info("====================> Starting the next split ("
              + start + " - " + end + ") <====================");
        }
      }
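      // All splits have been consumed, so the reference reader must be
      // exhausted as well. Any remaining reference line means a line was
      // lost at one of the split seams.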
      if (nextKeyValue(refStream, refReader, inputLength, refLine)) {
        ++refLineNumber;
        LOG.error(String.format("R<%05d: %s", refLineNumber, refLine));
        fail("The reference is at least one line longer than the last split ("
            + "splitSize=" + splitSize + ", "
            + "inputLength= " + inputLength + ", "
            + "split start=" + start + ", "
            + "split end=" + end + ", "
            + "line=" + refLineNumber + ")");
      }

      LOG.info("Verified " + refLineNumber + " lines in "
          + splitCount + " splits.");

    } finally {
      CodecPool.returnDecompressor(dcmp);
      CodecPool.returnDecompressor(refDcmp);
    }
  }

  // ------------------------------------------

  /**
   * Mostly copied from LineRecordReader (MapReduce) to pull an example of
   * actual usage into this test.
   */
  public boolean nextKeyValue(final SplitCompressionInputStream in,
      final LineReader lr, final long end, Text value) throws IOException {
    final int maxLineLength = Integer.MAX_VALUE;
    if (value == null) {
      value = new Text();
    }
    int newSize = 0;
    // We always read one extra line, which lies outside the upper
    // split limit i.e. (end - 1).
    while (in.getPos() <= end) {
      newSize = lr.readLine(value, maxLineLength, maxLineLength);
      if (newSize == 0) {
        break;
      }
      if (newSize < maxLineLength) {
        break;
      }

      // Line too long. Try again.
      LOG.info("Skipped line of size " + newSize + " at pos "
          + (in.getPos() - newSize));
    }

    if (newSize == 0) {
      value = null;
      return false;
    }
    return true;
  }

  // ------------------------------------------

  /**
   * Write the specified number of records to a file in the test dir
   * using the codec. Records are simply lines of random ASCII.
   */
  private static Path writeSplitTestFile(final Configuration conf,
      final Class<? extends SplittableCompressionCodec> codecClass,
      final long records,
      final int recordLength,
      final int trailingSizeJitter,
      final int randomizeEveryNChars) throws IOException {

    RAND.setSeed(1); // Make the tests reproducible.

    final FileSystem fs = FileSystem.getLocal(conf);
    final SplittableCompressionCodec codec =
        ReflectionUtils.newInstance(codecClass, conf);

    final Path wd = new Path(new Path(
          System.getProperty("test.build.data", "/tmp"))
            .makeQualified(fs.getUri(), fs.getWorkingDirectory()),
        codec.getClass().getSimpleName());

    final Path file = new Path(wd, "test-" + records
        + "-" + recordLength
        + "-" + trailingSizeJitter
        + codec.getDefaultExtension());

    DataOutputStream out = null;
    final Compressor cmp = CodecPool.getCompressor(codec);
    try {
      out = new DataOutputStream(codec.createOutputStream(
          fs.create(file, true), cmp));

      for (long seq = 1; seq <= records; ++seq) {
        final String line = randomGibberish(
            recordLength + (trailingSizeJitter > 0
                ? RAND.nextInt(trailingSizeJitter) : 0),
            randomizeEveryNChars) + "\n";
        // There must be a simpler way to output ASCII instead of
        // 2-byte Unicode.
        out.writeBytes(new String(line.getBytes("UTF-8"), "US-ASCII"));
      }
    } finally {
      IOUtils.cleanup(LOG, out);
      CodecPool.returnCompressor(cmp);
    }
    return file;
  }

  // ----------------------------------------------

  // Fixed the seed to make the test results reproducible.
  private static final Random RAND = new Random(1);

  private static final char[] LETTERS = {
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '!', '@', '#', '$', '%', '^', '&', '*', '(', ')' };

  private static long charsWritten = 0;
  private static char nextChar = 'a';
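  // The randomizeEveryNChars parameter controls how compressible the
  // generated data is: a fresh random character is picked every N output
  // characters, so N=1 yields incompressible noise while a large N yields
  // long, highly compressible runs of the same character.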
  private static String randomGibberish(final int length,
      final int randomizeEveryNChars) {
    if (length == 0) {
      return "";
    }
    final StringBuilder sb = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
      if (charsWritten % randomizeEveryNChars == 0) {
        nextChar = LETTERS[RAND.nextInt(LETTERS.length)];
      }
      charsWritten++;
      sb.append(nextChar);
    }
    return sb.toString();
  }
}
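For readers who want to use the codec under test rather than verify it, here is a minimal sketch of how a splittable codec such as SplittableGzipCodec is typically enabled in a Hadoop job configuration. The property names are standard Hadoop configuration keys and the codec class name comes from the package declaration above, but the class SplittableGzipUsageSketch and the 128 MiB split size are purely illustrative assumptions; consult the project's own documentation for the recommended setup.

import org.apache.hadoop.conf.Configuration;

public class SplittableGzipUsageSketch {
  public static Configuration configure() {
    final Configuration conf = new Configuration();
    // Register the splittable codec so it is picked up for .gz files.
    conf.set("io.compression.codecs",
        "nl.basjes.hadoop.io.compress.SplittableGzipCodec");
    // Cap the split size. Each split of a gzip file still has to be
    // decompressed from the start of the file up to the split boundary,
    // so smaller splits buy parallelism at the cost of duplicated
    // decompression work.
    conf.setLong("mapreduce.input.fileinputformat.split.maxsize",
        128L * 1024 * 1024); // 128 MiB (illustrative value)
    return conf;
  }
}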