org.apache.crunch.io.hbase.WordCountHBaseIT.java Source code


Introduction

Here is the source code for org.apache.crunch.io.hbase.WordCountHBaseIT.java, an Apache Crunch integration test that runs a word-count pipeline against HBase mini-clusters: it reads words from an HBase table, writes the counts back as Puts, joins two HBase-backed tables, and finally clears the counts with Deletes.
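
For orientation before the full listing: the run method further down wires up the pipeline roughly as in this minimal sketch, assuming a Configuration named conf that points at a running HBase cluster, and using illustrative table names ("words", "counts") in place of the randomly named tables the test creates for itself.

// Minimal sketch of the word-count flow exercised by this test (illustrative names, not the test's own).
Pipeline pipeline = new MRPipeline(WordCountHBaseIT.class, conf);

// Read every row of the input table; each record is the row key plus its full Result.
Scan scan = new Scan();
scan.addFamily(Bytes.toBytes("cf"));
PTable<ImmutableBytesWritable, Result> words =
        pipeline.read(new HBaseSourceTarget("words", scan));

// Count the words and write each (word, count) pair back to HBase as a Put.
PCollection<Put> puts = WordCountHBaseIT.wordCount(words);
pipeline.write(puts, new HBaseTarget("counts"));
pipeline.done();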

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.io.hbase;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.Random;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;

import org.apache.commons.io.FileUtils;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.test.TemporaryPath;
import org.apache.crunch.test.TemporaryPaths;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.ByteStreams;

public class WordCountHBaseIT {

    /** Turns a joined pair of Results into a "left,right" string built from their word columns. */
    static class StringifyFn extends MapFn<Pair<ImmutableBytesWritable, Pair<Result, Result>>, String> {
        @Override
        public String map(Pair<ImmutableBytesWritable, Pair<Result, Result>> input) {
            byte[] firstStrBytes = input.second().first().getValue(WORD_COLFAM, null);
            byte[] secondStrBytes = input.second().second().getValue(WORD_COLFAM, null);
            if (firstStrBytes != null && secondStrBytes != null) {
                return Joiner.on(',').join(new String(firstStrBytes), new String(secondStrBytes));
            }
            return "";
        }
    }

    @Rule
    public TemporaryPath tmpDir = TemporaryPaths.create();

    private static final byte[] COUNTS_COLFAM = Bytes.toBytes("cf");
    private static final byte[] WORD_COLFAM = Bytes.toBytes("cf");

    private HBaseTestingUtility hbaseTestUtil = new HBaseTestingUtility();

    /**
     * Extracts the word column from every input row, counts the occurrences of each word,
     * and converts each (word, count) pair into a Put against the counts column family.
     */
    @SuppressWarnings("serial")
    public static PCollection<Put> wordCount(PTable<ImmutableBytesWritable, Result> words) {
        PTable<String, Long> counts = words.parallelDo(new DoFn<Pair<ImmutableBytesWritable, Result>, String>() {
            @Override
            public void process(Pair<ImmutableBytesWritable, Result> row, Emitter<String> emitter) {
                byte[] word = row.second().getValue(WORD_COLFAM, null);
                if (word != null) {
                    emitter.emit(Bytes.toString(word));
                }
            }
        }, words.getTypeFamily().strings()).count();

        return counts.parallelDo("convert to put", new DoFn<Pair<String, Long>, Put>() {
            @Override
            public void process(Pair<String, Long> input, Emitter<Put> emitter) {
                Put put = new Put(Bytes.toBytes(input.first()));
                put.add(COUNTS_COLFAM, null, Bytes.toBytes(input.second()));
                emitter.emit(put);
            }

        }, Writables.writables(Put.class));
    }

    /** Emits a Delete for every input row, so the counts written by wordCount can be cleared. */
    @SuppressWarnings("serial")
    public static PCollection<Delete> clearCounts(PTable<ImmutableBytesWritable, Result> counts) {
        return counts.parallelDo("convert to delete", new DoFn<Pair<ImmutableBytesWritable, Result>, Delete>() {
            @Override
            public void process(Pair<ImmutableBytesWritable, Result> input, Emitter<Delete> emitter) {
                Delete delete = new Delete(input.first().get());
                emitter.emit(delete);
            }

        }, Writables.writables(Delete.class));
    }

    /** Starts mini ZooKeeper, HBase, and MapReduce clusters and, on Hadoop 2, stages the job classpath. */
    @Before
    public void setUp() throws Exception {
        Configuration conf = hbaseTestUtil.getConfiguration();
        conf.set("hadoop.log.dir", tmpDir.getFileName("logs"));
        conf.set("hadoop.tmp.dir", tmpDir.getFileName("hadoop-tmp"));
        conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/1");
        conf.setInt("hbase.master.info.port", -1);
        conf.setInt("hbase.regionserver.info.port", -1);

        // Workaround for HBASE-5711, we need to set config value dfs.datanode.data.dir.perm
        // equal to the permissions of the temp dirs on the filesystem. These temp dirs were
        // probably created using this process' umask. So we guess the temp dir permissions as
        // 0777 & ~umask, and use that to set the config value.
        try {
            Process process = Runtime.getRuntime().exec("/bin/sh -c umask");
            BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()));
            int rc = process.waitFor();
            if (rc == 0) {
                String umask = br.readLine();

                int umaskBits = Integer.parseInt(umask, 8);
                int permBits = 0x1ff & ~umaskBits;
                String perms = Integer.toString(permBits, 8);

                conf.set("dfs.datanode.data.dir.perm", perms);
            }
        } catch (Exception e) {
            // ignore errors, we might not be running on POSIX, or "sh" might not be on the path
        }

        hbaseTestUtil.startMiniZKCluster();
        hbaseTestUtil.startMiniCluster();
        hbaseTestUtil.startMiniMapReduceCluster(1);

        // For Hadoop 2 (where TaskAttemptContext is an interface rather than a class), we have to do
        // a bit more work: ship the dependency jars and the test classes to the cluster so that the
        // MapReduce tasks can load them.
        if (TaskAttemptContext.class.isInterface()) {
            conf = hbaseTestUtil.getConfiguration();
            FileSystem fs = FileSystem.get(conf);
            Path tmpPath = new Path("target", "WordCountHBaseTest-tmpDir");
            FileSystem localFS = FileSystem.getLocal(conf);
            for (FileStatus jarFile : localFS.listStatus(new Path("target/lib/"))) {
                Path target = new Path(tmpPath, jarFile.getPath().getName());
                fs.copyFromLocalFile(jarFile.getPath(), target);
                DistributedCache.addFileToClassPath(target, conf, fs);
            }

            // Jar up this test's classes so they can be shipped to the tasks.
            JarOutputStream jos = new JarOutputStream(new FileOutputStream("WordCountHBaseIT.jar"));
            File baseDir = new File("target/test-classes");
            String prefix = "org/apache/crunch/io/hbase/";
            jarUp(jos, baseDir, prefix + "WordCountHBaseIT.class");
            jarUp(jos, baseDir, prefix + "WordCountHBaseIT$1.class");
            jarUp(jos, baseDir, prefix + "WordCountHBaseIT$2.class");
            jarUp(jos, baseDir, prefix + "WordCountHBaseIT$3.class");
            jarUp(jos, baseDir, prefix + "WordCountHBaseIT$StringifyFn.class");

            // Now for the OutputFormat (doesn't get copied by default, apparently)
            baseDir = new File("target/classes");
            jarUp(jos, baseDir, prefix + "TableOutputFormat.class");
            jarUp(jos, baseDir, prefix + "TableOutputFormat$TableRecordWriter.class");
            jos.close();

            Path target = new Path(tmpPath, "WordCountHBaseIT.jar");
            fs.copyFromLocalFile(true, new Path("WordCountHBaseIT.jar"), target);
            DistributedCache.addFileToClassPath(target, conf, fs);
        }
    }

    /** Copies a single compiled class from baseDir into the jar under the given entry path. */
    private static void jarUp(JarOutputStream jos, File baseDir, String classDir) throws IOException {
        File file = new File(baseDir, classDir);
        JarEntry e = new JarEntry(classDir);
        e.setTime(file.lastModified());
        jos.putNextEntry(e);
        FileInputStream in = new FileInputStream(file);
        try {
            ByteStreams.copy(in, jos);
        } finally {
            in.close();
        }
        jos.closeEntry();
    }

    @Test
    public void testWordCount() throws IOException {
        run(new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration()));
    }

    @After
    public void tearDown() throws Exception {
        hbaseTestUtil.shutdownMiniMapReduceCluster();
        hbaseTestUtil.shutdownMiniCluster();
        hbaseTestUtil.shutdownMiniZKCluster();

        // Delete the build directory that gets created in the root of the project when
        // starting the MiniMapReduceCluster.
        FileUtils.deleteDirectory(new File("build"));
    }

    /** Builds the word-count pipeline against freshly created tables, then exercises joins and deletes. */
    public void run(Pipeline pipeline) throws IOException {

        Random rand = new Random();
        int postFix = rand.nextInt(Integer.MAX_VALUE); // non-negative, unlike Math.abs(rand.nextInt()), which can overflow
        String inputTableName = "crunch_words_" + postFix;
        String outputTableName = "crunch_counts_" + postFix;
        String otherTableName = "crunch_other_" + postFix;
        String joinTableName = "crunch_join_words_" + postFix;

        HTable inputTable = hbaseTestUtil.createTable(Bytes.toBytes(inputTableName), WORD_COLFAM);
        HTable outputTable = hbaseTestUtil.createTable(Bytes.toBytes(outputTableName), COUNTS_COLFAM);
        HTable otherTable = hbaseTestUtil.createTable(Bytes.toBytes(otherTableName), COUNTS_COLFAM);

        int key = 0;
        key = put(inputTable, key, "cat");
        key = put(inputTable, key, "cat");
        key = put(inputTable, key, "dog");
        Scan scan = new Scan();
        scan.addFamily(WORD_COLFAM);
        HBaseSourceTarget source = new HBaseSourceTarget(inputTableName, scan);
        PTable<ImmutableBytesWritable, Result> words = pipeline.read(source);

        Map<ImmutableBytesWritable, Result> materialized = words.materializeToMap();
        assertEquals(3, materialized.size());

        PCollection<Put> puts = wordCount(words);
        pipeline.write(puts, new HBaseTarget(outputTableName));
        pipeline.write(puts, new HBaseTarget(otherTableName));
        pipeline.done();

        assertIsLong(outputTable, "cat", 2);
        assertIsLong(outputTable, "dog", 1);
        assertIsLong(otherTable, "cat", 2);
        assertIsLong(otherTable, "dog", 1);

        // verify we can do joins.
        HTable joinTable = hbaseTestUtil.createTable(Bytes.toBytes(joinTableName), WORD_COLFAM);
        key = 0;
        key = put(joinTable, key, "zebra");
        key = put(joinTable, key, "donkey");
        key = put(joinTable, key, "bird");
        key = put(joinTable, key, "horse");

        Scan joinScan = new Scan();
        joinScan.addFamily(WORD_COLFAM);
        PTable<ImmutableBytesWritable, Result> other = pipeline.read(FromHBase.table(joinTableName, joinScan));
        PCollection<String> joined = words.join(other).parallelDo(new StringifyFn(), Writables.strings());
        assertEquals(ImmutableSet.of("cat,zebra", "cat,donkey", "dog,bird"),
                ImmutableSet.copyOf(joined.materialize()));
        pipeline.done();

        // Verify that HBaseTarget supports deletes.
        Scan clearScan = new Scan();
        clearScan.addFamily(COUNTS_COLFAM);
        pipeline = new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration());
        HBaseSourceTarget clearSource = new HBaseSourceTarget(outputTableName, clearScan);
        PTable<ImmutableBytesWritable, Result> counts = pipeline.read(clearSource);
        pipeline.write(clearCounts(counts), new HBaseTarget(outputTableName));
        pipeline.done();

        assertDeleted(outputTable, "cat");
        assertDeleted(outputTable, "dog");
    }

    /** Puts value into the word column of the row keyed by the given int, returning the next key. */
    protected int put(HTable table, int key, String value) throws IOException {
        Put put = new Put(Bytes.toBytes(key));
        put.add(WORD_COLFAM, null, Bytes.toBytes(value));
        table.put(put);
        return key + 1;
    }

    protected static void assertIsLong(HTable table, String key, long i) throws IOException {
        Get get = new Get(Bytes.toBytes(key));
        get.addFamily(COUNTS_COLFAM);
        Result result = table.get(get);

        byte[] rawCount = result.getValue(COUNTS_COLFAM, null);
        assertNotNull(rawCount);
        assertEquals(i, Bytes.toLong(rawCount));
    }

    protected static void assertDeleted(HTable table, String key) throws IOException {
        Get get = new Get(Bytes.toBytes(key));
        get.addFamily(COUNTS_COLFAM);
        Result result = table.get(get);
        assertTrue(result.isEmpty());
    }

}