Java tutorial
/* * This file is part of Hadoop-Gpl-Compression. * * Hadoop-Gpl-Compression is free software: you can redistribute it * and/or modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * Hadoop-Gpl-Compression is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Hadoop-Gpl-Compression. If not, see * <http://www.gnu.org/licenses/>. */ package com.hadoop.mapreduce; import java.io.IOException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.Arrays; import java.util.List; import java.util.Random; import junit.framework.TestCase; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputCommitter; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.TaskType; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import com.hadoop.compression.lzo.GPLNativeCodeLoader; import com.hadoop.compression.lzo.LzopCodec; import com.hadoop.mapreduce.LzoTextInputFormat.LzoIndex; /** * Test the LzoTextInputFormat, make sure it splits the file properly and * returns the right data. */ public class TestLzoTextInputFormat extends TestCase { private static final Log LOG = LogFactory.getLog(TestLzoTextInputFormat.class.getName()); private MessageDigest md5; private String lzoFileName = "part-r-00001" + new LzopCodec().getDefaultExtension(); private Path outputDir; //test both bigger outputs and small one chunk ones private static final int OUTPUT_BIG = 10485760; private static final int OUTPUT_SMALL = 50000; @Override protected void setUp() throws Exception { super.setUp(); md5 = MessageDigest.getInstance("MD5"); Path testBuildData = new Path(System.getProperty("test.build.data", "data")); outputDir = new Path(testBuildData, "outputDir"); } /** * Make sure the lzo index class works as described. */ public void testLzoIndex() { LzoIndex index = new LzoIndex(); assertTrue(index.isEmpty()); index = new LzoIndex(4); index.set(0, 0); index.set(1, 5); index.set(2, 10); index.set(3, 15); assertFalse(index.isEmpty()); assertEquals(0, index.findNextPosition(-1)); assertEquals(5, index.findNextPosition(1)); assertEquals(5, index.findNextPosition(5)); assertEquals(15, index.findNextPosition(11)); assertEquals(15, index.findNextPosition(15)); assertEquals(-1, index.findNextPosition(16)); } /** * Index the file and make sure it splits properly. * * @throws NoSuchAlgorithmException * @throws IOException * @throws InterruptedException */ public void testWithIndex() throws NoSuchAlgorithmException, IOException, InterruptedException { runTest(true, OUTPUT_BIG); runTest(true, OUTPUT_SMALL); } /** * Don't index the file and make sure it can be processed anyway. * * @throws NoSuchAlgorithmException * @throws IOException * @throws InterruptedException */ public void testWithoutIndex() throws NoSuchAlgorithmException, IOException, InterruptedException { runTest(false, OUTPUT_BIG); runTest(false, OUTPUT_SMALL); } /** * Generate random data, compress it, index and md5 hash the data. * Then read it all back and md5 that too, to verify that it all went ok. * * @param testWithIndex Should we index or not? * @param charsToOutput How many characters of random data should we output. * @throws IOException * @throws NoSuchAlgorithmException * @throws InterruptedException */ private void runTest(boolean testWithIndex, int charsToOutput) throws IOException, NoSuchAlgorithmException, InterruptedException { if (!GPLNativeCodeLoader.isNativeCodeLoaded()) { LOG.warn("Cannot run this test without the native lzo libraries"); return; } Configuration conf = new Configuration(); conf.setLong("fs.local.block.size", charsToOutput / 2); // reducing block size to force a split of the tiny file conf.set("io.compression.codecs", LzopCodec.class.getName()); FileSystem localFs = FileSystem.getLocal(conf); localFs.delete(outputDir, true); localFs.mkdirs(outputDir); Job job = new Job(conf); TextOutputFormat.setCompressOutput(job, true); TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class); TextOutputFormat.setOutputPath(job, outputDir); TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2)); // create some input data byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput); if (testWithIndex) { Path lzoFile = new Path(outputDir, lzoFileName); LzoTextInputFormat.createIndex(localFs, lzoFile); } LzoTextInputFormat inputFormat = new LzoTextInputFormat(); TextInputFormat.setInputPaths(job, outputDir); List<InputSplit> is = inputFormat.getSplits(job); //verify we have the right number of lzo chunks if (testWithIndex && OUTPUT_BIG == charsToOutput) { assertEquals(3, is.size()); } else { assertEquals(1, is.size()); } // let's read it all and calculate the md5 hash for (InputSplit inputSplit : is) { RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext); rr.initialize(inputSplit, attemptContext); while (rr.nextKeyValue()) { Text value = rr.getCurrentValue(); md5.update(value.getBytes(), 0, value.getLength()); } rr.close(); } localFs.close(); assertTrue(Arrays.equals(expectedMd5, md5.digest())); } /** * Creates an lzo file with random data. * * @param outputDir Output directory. * @param fs File system we're using. * @param attemptContext Task attempt context, contains task id etc. * @throws IOException * @throws InterruptedException */ private byte[] createTestInput(Path outputDir, FileSystem fs, TaskAttemptContext attemptContext, int charsToOutput) throws IOException, InterruptedException { TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>(); RecordWriter<Text, Text> rw = null; md5.reset(); try { rw = output.getRecordWriter(attemptContext); char[] chars = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6".toCharArray(); Random r = new Random(System.currentTimeMillis()); Text key = new Text(); Text value = new Text(); int charsMax = chars.length - 1; for (int i = 0; i < charsToOutput;) { i += fillText(chars, r, charsMax, key); i += fillText(chars, r, charsMax, value); rw.write(key, value); md5.update(key.getBytes(), 0, key.getLength()); // text output format writes tab between the key and value md5.update("\t".getBytes("UTF-8")); md5.update(value.getBytes(), 0, value.getLength()); } } finally { if (rw != null) { rw.close(attemptContext); OutputCommitter committer = output.getOutputCommitter(attemptContext); committer.commitTask(attemptContext); committer.cleanupJob(attemptContext); } } byte[] result = md5.digest(); md5.reset(); return result; } private int fillText(char[] chars, Random r, int charsMax, Text text) { StringBuilder sb = new StringBuilder(); // get a reasonable string length int stringLength = r.nextInt(charsMax * 2); for (int j = 0; j < stringLength; j++) { sb.append(chars[r.nextInt(charsMax)]); } text.set(sb.toString()); return stringLength; } }