Java tutorial
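The test below, ZipInputFormatTest from the Cascading project, exercises cascading.tap.hadoop.ZipInputFormat: it builds zip files in memory with a varying number of entries, writes them to the Hadoop file system, asks the input format for splits, and then reads every record back, asserting that each key appears in exactly one split and that no key is lost.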
/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap.hadoop;

import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.util.BitSet;
import java.util.Random;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import cascading.CascadingTestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class ZipInputFormatTest extends CascadingTestCase {
  private static final Log LOG = LogFactory.getLog(ZipInputFormatTest.class.getName());

  private static int MAX_LENGTH = 10000;
  private static int MAX_ENTRIES = 100;
  private static Path workDir = new Path("build/test/output/ziptest");

  public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");
    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);

    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
      ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
      ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
      long length = 0;

      LOG.debug("creating; zip file with entries = " + entries);

      // for each entry in the zip file
      for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
        // construct zip entries splitting MAX_LENGTH between entries
        long entryLength = MAX_LENGTH / entries;
        ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
        zipEntry.setMethod(ZipEntry.DEFLATED);
        zos.putNextEntry(zipEntry);

        for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
          zos.write(Long.toString(length).getBytes());
          zos.write("\n".getBytes());
        }

        zos.flush();
        zos.closeEntry();
      }

      zos.flush();
      zos.close();

      currentFs.delete(file, true);

      OutputStream outputStream = currentFs.create(file);
      byteArrayOutputStream.writeTo(outputStream);
      outputStream.close();

      ZipInputFormat format = new ZipInputFormat();
      format.configure(job);
      LongWritable key = new LongWritable();
      Text value = new Text();

      InputSplit[] splits = format.getSplits(job, 100);
      BitSet bits = new BitSet((int) length);

      for (int j = 0; j < splits.length; j++) {
        LOG.debug("split[" + j + "]= " + splits[j]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

        try {
          int count = 0;

          while (reader.next(key, value)) {
            int v = Integer.parseInt(value.toString());
            LOG.debug("read " + v);

            if (bits.get(v))
              LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

            assertFalse("key in multiple partitions.", bits.get(v));
            bits.set(v);
            count++;
          }

          LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
        } finally {
          reader.close();
        }
      }

      assertEquals("some keys in no partition.", length, bits.cardinality());
    }
  }
}
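For readers who want to reuse the same API outside of a test harness, here is a minimal sketch that reads every record from an existing zip file through ZipInputFormat. The class name ZipReadExample, the command-line path argument, and the split-count hint of 1 are illustrative assumptions; the configure, getSplits, and getRecordReader calls mirror the ones exercised by the test above.

package cascading.tap.hadoop;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical driver class; placed in the same package as ZipInputFormat for simplicity.
public class ZipReadExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();

    // Point the input format at an existing zip file (path supplied on the command line).
    FileInputFormat.setInputPaths(job, new Path(args[0]));

    ZipInputFormat format = new ZipInputFormat();
    format.configure(job);

    // Ask for splits; the hint of 1 is illustrative, the format decides the actual count.
    InputSplit[] splits = format.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();

    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);

      try {
        // Each record's value is one line of text from a zip entry.
        while (reader.next(key, value))
          System.out.println(value.toString());
      } finally {
        reader.close();
      }
    }
  }
}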