com.cloudera.crunch.WordCountHBaseTest.java Source code

Introduction

Here is the source code for com.cloudera.crunch.WordCountHBaseTest.java, an integration test that runs a Crunch word-count pipeline end-to-end against an in-process HBase mini-cluster: it reads words from one HBase table, counts them with a MapReduce pipeline, and writes the totals back to a second table.

Source

/**
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.crunch;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.File;
import java.io.IOException;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.cloudera.crunch.impl.mr.MRPipeline;
import com.cloudera.crunch.io.hbase.HBaseSourceTarget;
import com.cloudera.crunch.io.hbase.HBaseTarget;
import com.cloudera.crunch.lib.Aggregate;
import com.cloudera.crunch.type.writable.Writables;

public class WordCountHBaseTest {
    protected static final Log LOG = LogFactory.getLog(WordCountHBaseTest.class);

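    // Both tables use the column family "cf"; the word and count columns
    // live in separate tables, so sharing the family name causes no conflict.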
    private static final byte[] COUNTS_COLFAM = Bytes.toBytes("cf");
    private static final byte[] WORD_COLFAM = Bytes.toBytes("cf");

    private HBaseTestingUtility hbaseTestUtil = new HBaseTestingUtility();

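    /**
     * Builds the word-count pipeline: extracts each word from the input
     * table's rows, counts occurrences per word, and converts every
     * (word, count) pair into a Put destined for the output table.
     */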
    @SuppressWarnings("serial")
    public static PCollection<Put> wordCount(PTable<ImmutableBytesWritable, Result> words) {
        PTable<String, Long> counts = Aggregate
                .count(words.parallelDo(new DoFn<Pair<ImmutableBytesWritable, Result>, String>() {
                    @Override
                    public void process(Pair<ImmutableBytesWritable, Result> row, Emitter<String> emitter) {
                        byte[] word = row.second().getValue(WORD_COLFAM, null);
                        if (word != null) {
                            emitter.emit(Bytes.toString(word));
                        }
                    }
                }, words.getTypeFamily().strings()));

        return counts.parallelDo("convert to put", new DoFn<Pair<String, Long>, Put>() {
            @Override
            public void process(Pair<String, Long> input, Emitter<Put> emitter) {
                Put put = new Put(Bytes.toBytes(input.first()));
                put.add(COUNTS_COLFAM, null, Bytes.toBytes(input.second()));
                emitter.emit(put);
            }

        }, Writables.writables(Put.class));
    }

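    // Boots in-process ZooKeeper, HBase, and MapReduce mini-clusters so the
    // pipeline can run end-to-end without an external cluster.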
    @Before
    public void setUp() throws Exception {
        Configuration conf = hbaseTestUtil.getConfiguration();
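        // createTempFile() yields a unique path; delete the file and recreate
        // it as the scratch directory that hadoop.log.dir must point at.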
        File tmpDir = File.createTempFile("logdir", "");
        tmpDir.delete();
        tmpDir.mkdir();
        tmpDir.deleteOnExit();
        conf.set("hadoop.log.dir", tmpDir.getAbsolutePath());
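        // Use a non-default znode parent and disable the HTTP info servers
        // (port -1) so concurrent test runs do not collide.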
        conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/1");
        conf.setInt("hbase.master.info.port", -1);
        conf.setInt("hbase.regionserver.info.port", -1);
        hbaseTestUtil.startMiniZKCluster();
        hbaseTestUtil.startMiniCluster();
        hbaseTestUtil.startMiniMapReduceCluster();
    }

    @Test
    public void testWordCount() throws IOException {
        run(new MRPipeline(WordCountHBaseTest.class, hbaseTestUtil.getConfiguration()));
    }

    @After
    public void tearDown() throws Exception {
        hbaseTestUtil.shutdownMiniMapReduceCluster();
        hbaseTestUtil.shutdownMiniCluster();
        hbaseTestUtil.shutdownMiniZKCluster();
    }

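    // Seeds an input table, runs the pipeline against it, and verifies the
    // counts written to the output table.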
    public void run(Pipeline pipeline) throws IOException {

        Random rand = new Random();
        // rand.nextInt(bound) is always non-negative, unlike
        // Math.abs(rand.nextInt()), which is still negative for Integer.MIN_VALUE.
        int postFix = rand.nextInt(Integer.MAX_VALUE);
        String inputTableName = "crunch_words_" + postFix;
        String outputTableName = "crunch_counts_" + postFix;

        try {

            HTable inputTable = hbaseTestUtil.createTable(Bytes.toBytes(inputTableName), WORD_COLFAM);
            HTable outputTable = hbaseTestUtil.createTable(Bytes.toBytes(outputTableName), COUNTS_COLFAM);

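            // Seed the input table: "cat" twice, "dog" once.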
            int key = 0;
            key = put(inputTable, key, "cat");
            key = put(inputTable, key, "cat");
            key = put(inputTable, key, "dog");
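            // Scan the word column family and use it as the pipeline source.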
            Scan scan = new Scan();
            scan.addColumn(WORD_COLFAM, null);
            HBaseSourceTarget source = new HBaseSourceTarget(inputTableName, scan);
            PTable<ImmutableBytesWritable, Result> words = pipeline.read(source);
            pipeline.write(wordCount(words), new HBaseTarget(outputTableName));
            pipeline.done();

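            // Verify the persisted counts match the seeded data.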
            assertIsLong(outputTable, "cat", 2);
            assertIsLong(outputTable, "dog", 1);
        } finally {
            // Per-test table cleanup is unnecessary: tearDown() shuts down
            // the whole mini-cluster after each test.
        }
    }

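    // Writes a single word under the next integer row key and returns the
    // incremented key.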
    protected int put(HTable table, int key, String value) throws IOException {
        Put put = new Put(Bytes.toBytes(key));
        put.add(WORD_COLFAM, null, Bytes.toBytes(value));
        table.put(put);
        return key + 1;
    }

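    // Asserts that the count stored for the given word equals the expected
    // long value.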
    protected void assertIsLong(HTable table, String key, long i) throws IOException {
        Get get = new Get(Bytes.toBytes(key));
        get.addColumn(COUNTS_COLFAM, null);
        Result result = table.get(get);

        byte[] rawCount = result.getValue(COUNTS_COLFAM, null);
        assertNotNull(rawCount);
        assertEquals(i, Bytes.toLong(rawCount));
    }
}
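
Example usage

The driver below is a minimal sketch of how the same wordCount() pipeline could be wired against existing HBase tables outside the test harness. The class name WordCountHBaseDriver and the table names "words" and "counts" are illustrative, and it assumes HBaseConfiguration.create() picks up an hbase-site.xml from the classpath; adjust both to match your environment.

package com.cloudera.crunch;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;

import com.cloudera.crunch.impl.mr.MRPipeline;
import com.cloudera.crunch.io.hbase.HBaseSourceTarget;
import com.cloudera.crunch.io.hbase.HBaseTarget;

public class WordCountHBaseDriver {
    public static void main(String[] args) throws Exception {
        // HBaseConfiguration.create() loads hbase-site.xml from the classpath.
        Pipeline pipeline = new MRPipeline(WordCountHBaseDriver.class, HBaseConfiguration.create());

        // Scan the word column family of the (hypothetical) "words" table.
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes("cf"), null);
        PTable<ImmutableBytesWritable, Result> words =
                pipeline.read(new HBaseSourceTarget("words", scan));

        // Reuse the test's pipeline definition and write each resulting Put
        // to the (hypothetical) "counts" table.
        pipeline.write(WordCountHBaseTest.wordCount(words), new HBaseTarget("counts"));
        pipeline.done();
    }
}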