Java tutorial: word count over HBase with Apache Crunch

The listing below is the WordCountHBaseIT integration test from the Apache Crunch project. It spins up an HBase mini-cluster, reads (row key, Result) pairs from an input table, counts the words with a Crunch pipeline, writes the counts back to HBase as Puts, joins two HBase tables, and finally issues Deletes to verify that HBaseTarget supports deletion.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.io.hbase;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.Random;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;

import org.apache.commons.io.FileUtils;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.test.TemporaryPath;
import org.apache.crunch.test.TemporaryPaths;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.ByteStreams;

public class WordCountHBaseIT {

  static class StringifyFn extends MapFn<Pair<ImmutableBytesWritable, Pair<Result, Result>>, String> {
    @Override
    public String map(Pair<ImmutableBytesWritable, Pair<Result, Result>> input) {
      byte[] firstStrBytes = input.second().first().getValue(WORD_COLFAM, null);
      byte[] secondStrBytes = input.second().second().getValue(WORD_COLFAM, null);
      if (firstStrBytes != null && secondStrBytes != null) {
        return Joiner.on(',').join(new String(firstStrBytes), new String(secondStrBytes));
      }
      return "";
    }
  }

  @Rule
  public TemporaryPath tmpDir = TemporaryPaths.create();

  private static final byte[] COUNTS_COLFAM = Bytes.toBytes("cf");
  private static final byte[] WORD_COLFAM = Bytes.toBytes("cf");

  private HBaseTestingUtility hbaseTestUtil = new HBaseTestingUtility();

  @SuppressWarnings("serial")
  public static PCollection<Put> wordCount(PTable<ImmutableBytesWritable, Result> words) {
    PTable<String, Long> counts = words.parallelDo(new DoFn<Pair<ImmutableBytesWritable, Result>, String>() {
      @Override
      public void process(Pair<ImmutableBytesWritable, Result> row, Emitter<String> emitter) {
        byte[] word = row.second().getValue(WORD_COLFAM, null);
        if (word != null) {
          emitter.emit(Bytes.toString(word));
        }
      }
    }, words.getTypeFamily().strings()).count();

    return counts.parallelDo("convert to put", new DoFn<Pair<String, Long>, Put>() {
      @Override
      public void process(Pair<String, Long> input, Emitter<Put> emitter) {
        Put put = new Put(Bytes.toBytes(input.first()));
        put.add(COUNTS_COLFAM, null, Bytes.toBytes(input.second()));
        emitter.emit(put);
      }
    }, Writables.writables(Put.class));
  }

  @SuppressWarnings("serial")
  public static PCollection<Delete> clearCounts(PTable<ImmutableBytesWritable, Result> counts) {
    return counts.parallelDo("convert to delete", new DoFn<Pair<ImmutableBytesWritable, Result>, Delete>() {
      @Override
      public void process(Pair<ImmutableBytesWritable, Result> input, Emitter<Delete> emitter) {
        Delete delete = new Delete(input.first().get());
        emitter.emit(delete);
      }
    }, Writables.writables(Delete.class));
  }

  @Before
  public void setUp() throws Exception {
    Configuration conf = hbaseTestUtil.getConfiguration();
    conf.set("hadoop.log.dir", tmpDir.getFileName("logs"));
    conf.set("hadoop.tmp.dir", tmpDir.getFileName("hadoop-tmp"));
    conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/1");
    conf.setInt("hbase.master.info.port", -1);
    conf.setInt("hbase.regionserver.info.port", -1);

    // Workaround for HBASE-5711, we need to set config value dfs.datanode.data.dir.perm
    // equal to the permissions of the temp dirs on the filesystem. These temp dirs were
    // probably created using this process' umask. So we guess the temp dir permissions as
    // 0777 & ~umask, and use that to set the config value.
    try {
      Process process = Runtime.getRuntime().exec("/bin/sh -c umask");
      BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()));
      int rc = process.waitFor();
      if (rc == 0) {
        String umask = br.readLine();
        int umaskBits = Integer.parseInt(umask, 8);
        int permBits = 0x1ff & ~umaskBits;
        String perms = Integer.toString(permBits, 8);
        conf.set("dfs.datanode.data.dir.perm", perms);
      }
    } catch (Exception e) {
      // ignore errors, we might not be running on POSIX, or "sh" might not be on the path
    }

    hbaseTestUtil.startMiniZKCluster();
    hbaseTestUtil.startMiniCluster();
    hbaseTestUtil.startMiniMapReduceCluster(1);

    // For Hadoop-2.0.0, we have to do a bit more work.
    if (TaskAttemptContext.class.isInterface()) {
      conf = hbaseTestUtil.getConfiguration();
      FileSystem fs = FileSystem.get(conf);
      Path tmpPath = new Path("target", "WordCountHBaseTest-tmpDir");
      FileSystem localFS = FileSystem.getLocal(conf);
      for (FileStatus jarFile : localFS.listStatus(new Path("target/lib/"))) {
        Path target = new Path(tmpPath, jarFile.getPath().getName());
        fs.copyFromLocalFile(jarFile.getPath(), target);
        DistributedCache.addFileToClassPath(target, conf, fs);
      }

      // Create a programmatic container for this jar.
      JarOutputStream jos = new JarOutputStream(new FileOutputStream("WordCountHBaseIT.jar"));
      File baseDir = new File("target/test-classes");
      String prefix = "org/apache/crunch/io/hbase/";
      jarUp(jos, baseDir, prefix + "WordCountHBaseIT.class");
      jarUp(jos, baseDir, prefix + "WordCountHBaseIT$1.class");
      jarUp(jos, baseDir, prefix + "WordCountHBaseIT$2.class");
      jarUp(jos, baseDir, prefix + "WordCountHBaseIT$3.class");
      jarUp(jos, baseDir, prefix + "WordCountHBaseIT$StringifyFn.class");

      // Now for the OutputFormat (doesn't get copied by default, apparently)
      baseDir = new File("target/classes");
      jarUp(jos, baseDir, prefix + "TableOutputFormat.class");
      jarUp(jos, baseDir, prefix + "TableOutputFormat$TableRecordWriter.class");
      jos.close();

      Path target = new Path(tmpPath, "WordCountHBaseIT.jar");
      fs.copyFromLocalFile(true, new Path("WordCountHBaseIT.jar"), target);
      DistributedCache.addFileToClassPath(target, conf, fs);
    }
  }

  private static void jarUp(JarOutputStream jos, File baseDir, String classDir) throws IOException {
    File file = new File(baseDir, classDir);
    JarEntry e = new JarEntry(classDir);
    e.setTime(file.lastModified());
    jos.putNextEntry(e);
    ByteStreams.copy(new FileInputStream(file), jos);
    jos.closeEntry();
  }

  @Test
  public void testWordCount() throws IOException {
    run(new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration()));
  }

  @After
  public void tearDown() throws Exception {
    hbaseTestUtil.shutdownMiniMapReduceCluster();
    hbaseTestUtil.shutdownMiniCluster();
    hbaseTestUtil.shutdownMiniZKCluster();
    // Delete the build directory that gets created in the root of the project when starting
    // the MiniMapReduceCluster
    FileUtils.deleteDirectory(new File("build"));
  }

  public void run(Pipeline pipeline) throws IOException {
    Random rand = new Random();
    int postFix = Math.abs(rand.nextInt());
    String inputTableName = "crunch_words_" + postFix;
    String outputTableName = "crunch_counts_" + postFix;
    String otherTableName = "crunch_other_" + postFix;
    String joinTableName = "crunch_join_words_" + postFix;

    HTable inputTable = hbaseTestUtil.createTable(Bytes.toBytes(inputTableName), WORD_COLFAM);
    HTable outputTable = hbaseTestUtil.createTable(Bytes.toBytes(outputTableName), COUNTS_COLFAM);
    HTable otherTable = hbaseTestUtil.createTable(Bytes.toBytes(otherTableName), COUNTS_COLFAM);

    int key = 0;
    key = put(inputTable, key, "cat");
    key = put(inputTable, key, "cat");
    key = put(inputTable, key, "dog");

    Scan scan = new Scan();
    scan.addFamily(WORD_COLFAM);
    HBaseSourceTarget source = new HBaseSourceTarget(inputTableName, scan);
    PTable<ImmutableBytesWritable, Result> words = pipeline.read(source);

    Map<ImmutableBytesWritable, Result> materialized = words.materializeToMap();
    assertEquals(3, materialized.size());

    PCollection<Put> puts = wordCount(words);
    pipeline.write(puts, new HBaseTarget(outputTableName));
    pipeline.write(puts, new HBaseTarget(otherTableName));
    pipeline.done();

    assertIsLong(outputTable, "cat", 2);
    assertIsLong(outputTable, "dog", 1);
    assertIsLong(otherTable, "cat", 2);
    assertIsLong(otherTable, "dog", 1);

    // verify we can do joins.
    HTable joinTable = hbaseTestUtil.createTable(Bytes.toBytes(joinTableName), WORD_COLFAM);
    key = 0;
    key = put(joinTable, key, "zebra");
    key = put(joinTable, key, "donkey");
    key = put(joinTable, key, "bird");
    key = put(joinTable, key, "horse");

    Scan joinScan = new Scan();
    joinScan.addFamily(WORD_COLFAM);
    PTable<ImmutableBytesWritable, Result> other = pipeline.read(FromHBase.table(joinTableName, joinScan));
    PCollection<String> joined = words.join(other).parallelDo(new StringifyFn(), Writables.strings());
    assertEquals(ImmutableSet.of("cat,zebra", "cat,donkey", "dog,bird"),
        ImmutableSet.copyOf(joined.materialize()));
    pipeline.done();

    // verify HBaseTarget supports deletes.
    Scan clearScan = new Scan();
    clearScan.addFamily(COUNTS_COLFAM);
    pipeline = new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration());
    HBaseSourceTarget clearSource = new HBaseSourceTarget(outputTableName, clearScan);
    PTable<ImmutableBytesWritable, Result> counts = pipeline.read(clearSource);
    pipeline.write(clearCounts(counts), new HBaseTarget(outputTableName));
    pipeline.done();

    assertDeleted(outputTable, "cat");
    assertDeleted(outputTable, "dog");
  }

  protected int put(HTable table, int key, String value) throws IOException {
    Put put = new Put(Bytes.toBytes(key));
    put.add(WORD_COLFAM, null, Bytes.toBytes(value));
    table.put(put);
    return key + 1;
  }

  protected static void assertIsLong(HTable table, String key, long i) throws IOException {
    Get get = new Get(Bytes.toBytes(key));
    get.addFamily(COUNTS_COLFAM);
    Result result = table.get(get);
    byte[] rawCount = result.getValue(COUNTS_COLFAM, null);
    assertNotNull(rawCount);
    assertEquals(i, Bytes.toLong(rawCount));
  }

  protected static void assertDeleted(HTable table, String key) throws IOException {
    Get get = new Get(Bytes.toBytes(key));
    get.addFamily(COUNTS_COLFAM);
    Result result = table.get(get);
    assertTrue(result.isEmpty());
  }
}
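The same wordCount pipeline can be driven against a running cluster instead of the HBaseTestingUtility mini-cluster; the shape of the driver mirrors the body of run(). The following is a minimal, hypothetical sketch (not part of the original test): it assumes input and output tables named "words" and "counts" already exist with a "cf" column family, and that hbase-site.xml is on the classpath so HBaseConfiguration.create() can find the cluster.

package org.apache.crunch.io.hbase;

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;

public class WordCountHBaseDriver {
  public static void main(String[] args) throws Exception {
    // Hypothetical table names; adjust to your cluster.
    String inputTable = "words";
    String outputTable = "counts";

    // Picks up hbase-site.xml / core-site.xml from the classpath.
    Configuration conf = HBaseConfiguration.create();
    Pipeline pipeline = new MRPipeline(WordCountHBaseDriver.class, conf);

    // Scan only the word column family, as the test does.
    Scan scan = new Scan();
    scan.addFamily(Bytes.toBytes("cf"));

    // Read (row key, Result) pairs, count the words, and write the counts back as Puts.
    PTable<ImmutableBytesWritable, Result> words =
        pipeline.read(new HBaseSourceTarget(inputTable, scan));
    PCollection<Put> counts = WordCountHBaseIT.wordCount(words);
    pipeline.write(counts, new HBaseTarget(outputTable));
    pipeline.done();
  }
}

Because wordCount is a public static method, reusing it from a driver like this is straightforward; everything else (the jarUp machinery, the mini-cluster setup, the teardown) exists only to make the test self-contained.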