Java tutorial
/*
 * Copyright 2015 eBay Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.gzinga.hadoop;

import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import io.gzinga.GZipOutputStreamRandomAccess;
import org.testng.Assert;
import org.testng.annotations.Test;

/**
 * Integration test that writes a gzip file through {@link GZipOutputStreamRandomAccess},
 * runs a local word-count MapReduce job over it with
 * {@code io.gzinga.hadoop.SplittableGZipCodec} registered as the compression codec,
 * and verifies the word counts in the job output.
 */
public class TestSplittableGZipCodec {

    /** Gzip input file produced by the test and consumed by the job. */
    private static final String INPUT_FILE = "target/test/testfile1.gz";

    /** Output directory of the word-count job. */
    private static final String OUTPUT_DIR = "target/test/testfile2";

    /**
     * Writes 10000 identical lines (registering an offset every 100 lines), runs the
     * word-count job on the resulting file and checks that every word is counted
     * exactly 10000 times. The input file and output directory are removed afterwards
     * regardless of the outcome.
     */
    @Test
    public void testSplittableGZipCodec() {
        try {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", "file:///");
            FileSystem fs = FileSystem.get(conf);
            fs.mkdirs(new Path("target/test"));

            // Encode the line once, with an explicit charset, instead of on every iteration.
            byte[] line = "This is line\n".getBytes(StandardCharsets.UTF_8);

            // try-with-resources closes the stream even if a write or the assertion fails.
            try (GZipOutputStreamRandomAccess gzip =
                    new GZipOutputStreamRandomAccess(fs.create(new Path(INPUT_FILE)))) {
                for (int i = 1; i <= 10000; i++) {
                    gzip.write(line);
                    if (i % 100 == 0) {
                        // One offset entry per 100 lines, keyed 1..100.
                        gzip.addOffset(i / 100L);
                    }
                }
                Assert.assertEquals(gzip.getOffsetMap().size(), 100);
            }

            conf.set("mapreduce.framework.name", "local");
            conf.set("io.compression.codecs", "io.gzinga.hadoop.SplittableGZipCodec");
            // NOTE(review): the small max split size presumably forces the input to be
            // processed as several splits - confirm against SplittableGZipCodec.
            conf.set("mapreduce.input.fileinputformat.split.maxsize", "20000");

            Job job = Job.getInstance(conf, "word count");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(WordCount.TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(INPUT_FILE));
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_DIR));

            // Fail here with a clear message rather than later on a missing output file.
            Assert.assertTrue(job.waitForCompletion(true),
                    "word count job did not complete successfully");

            try (BufferedReader br = new BufferedReader(new InputStreamReader(
                    fs.open(new Path(OUTPUT_DIR + "/part-r-00000")), StandardCharsets.UTF_8))) {
                // TestNG argument order is (actual, expected).
                Assert.assertEquals(br.readLine(), "This\t10000");
                Assert.assertEquals(br.readLine(), "is\t10000");
                Assert.assertEquals(br.readLine(), "line\t10000");
            }
        } catch (Exception e) {
            // Attach the cause so the test report shows what actually went wrong.
            Assert.fail("Unexpected exception while running the splittable gzip word count", e);
        } finally {
            FileUtil.fullyDelete(new File(OUTPUT_DIR));
            FileUtil.fullyDelete(new File(INPUT_FILE));
        }
    }
}