Java tutorial
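The test below, ZipInputFormatTest from the Cascading project, exercises cascading.tap.hadoop.ZipInputFormat: it builds zip files in memory with a varying number of entries, writes them to the Hadoop file system, asks the input format for splits, and then reads every record back, asserting that each key appears in exactly one split and that no key is lost.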
/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap.hadoop;

import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.util.BitSet;
import java.util.Random;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import cascading.CascadingTestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class ZipInputFormatTest extends CascadingTestCase {
  private static final Log LOG = LogFactory.getLog(ZipInputFormatTest.class.getName());

  private static int MAX_LENGTH = 10000;
  private static int MAX_ENTRIES = 100;
  private static Path workDir = new Path("build/test/output/ziptest");

  public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");
    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);

    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
      ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
      ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
      long length = 0;

      LOG.debug("creating; zip file with entries = " + entries);

      // for each entry in the zip file
      for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
        // construct zip entries splitting MAX_LENGTH between entries
        long entryLength = MAX_LENGTH / entries;
        ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
        zipEntry.setMethod(ZipEntry.DEFLATED);
        zos.putNextEntry(zipEntry);

        for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
          zos.write(Long.toString(length).getBytes());
          zos.write("\n".getBytes());
        }

        zos.flush();
        zos.closeEntry();
      }

      zos.flush();
      zos.close();

      currentFs.delete(file, true);

      OutputStream outputStream = currentFs.create(file);
      byteArrayOutputStream.writeTo(outputStream);
      outputStream.close();

      ZipInputFormat format = new ZipInputFormat();
      format.configure(job);
      LongWritable key = new LongWritable();
      Text value = new Text();

      InputSplit[] splits = format.getSplits(job, 100);
      BitSet bits = new BitSet((int) length);

      for (int j = 0; j < splits.length; j++) {
        LOG.debug("split[" + j + "]= " + splits[j]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

        try {
          int count = 0;

          while (reader.next(key, value)) {
            int v = Integer.parseInt(value.toString());
            LOG.debug("read " + v);

            if (bits.get(v))
              LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

            assertFalse("key in multiple partitions.", bits.get(v));
            bits.set(v);
            count++;
          }

          LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
        } finally {
          reader.close();
        }
      }

      assertEquals("some keys in no partition.", length, bits.cardinality());
    }
  }
}
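For readers who want to reuse the same API outside of a test harness, here is a minimal sketch that reads every record from an existing zip file through ZipInputFormat. The class name ZipReadExample, the command-line path argument, and the split-count hint of 1 are illustrative assumptions; the configure, getSplits, and getRecordReader calls mirror the ones exercised by the test above.

package cascading.tap.hadoop;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical driver class; placed in the same package as ZipInputFormat for simplicity.
public class ZipReadExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();

    // Point the input format at an existing zip file (path supplied on the command line).
    FileInputFormat.setInputPaths(job, new Path(args[0]));

    ZipInputFormat format = new ZipInputFormat();
    format.configure(job);

    // Ask for splits; the hint of 1 is illustrative, the format decides the actual count.
    InputSplit[] splits = format.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();

    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);

      try {
        // Each record's value is one line of text from a zip entry.
        while (reader.next(key, value))
          System.out.println(value.toString());
      } finally {
        reader.close();
      }
    }
  }
}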