org.apache.crunch.SparkHFileTargetIT.java Source code

Introduction

Here is the source code for org.apache.crunch.SparkHFileTargetIT.java, an Apache Crunch integration test that computes word counts over a text file with a SparkPipeline, writes the results as HBase HFiles, and bulk-loads them into tables backed by an HBaseTestingUtility mini cluster.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import org.apache.commons.io.IOUtils;
import org.apache.crunch.fn.FilterFns;
import org.apache.crunch.impl.mr.run.RuntimeParameters;
import org.apache.crunch.impl.spark.SparkPipeline;
import org.apache.crunch.io.At;
import org.apache.crunch.io.hbase.HBaseTypes;
import org.apache.crunch.io.hbase.HFileUtils;
import org.apache.crunch.io.hbase.ToHBase;
import org.apache.crunch.test.TemporaryPath;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.Tag;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.regionserver.KeyValueHeap;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.regionserver.StoreFileScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import java.util.Random;

import static org.apache.crunch.types.writable.Writables.nulls;
import static org.apache.crunch.types.writable.Writables.tableOf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

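/**
 * Integration test for Crunch's HFile targets on Spark: word counts computed with a
 * {@link SparkPipeline} are written either directly as HFiles or as {@link Put}s prepared for
 * incremental bulk load into HBase tables running on an {@link HBaseTestingUtility} mini cluster.
 */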
public class SparkHFileTargetIT implements Serializable {

    private static HBaseTestingUtility HBASE_TEST_UTILITY;
    private static final byte[] TEST_FAMILY = Bytes.toBytes("test_family");
    private static final byte[] TEST_QUALIFIER = Bytes.toBytes("count");
    private static final Path TEMP_DIR = new Path("/tmp");
    private static final Random RANDOM = new Random();

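    // Accepts only "short" words of at most two characters; used to split the word stream
    // across two independent HFile targets in testMultipleHFileTargets().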
    private static final FilterFn<String> SHORT_WORD_FILTER = new FilterFn<String>() {
        @Override
        public boolean accept(String input) {
            return input.length() <= 2;
        }
    };

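    // JUnit rule that provides a fresh temporary directory per test and points the Crunch and
    // Hadoop temp-dir settings at it.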
    @Rule
    public transient TemporaryPath tmpDir = new TemporaryPath(RuntimeParameters.TMP_DIR, "hadoop.tmp.dir");

    @BeforeClass
    public static void setUpClass() throws Exception {
        // Start a mini HBase/HDFS cluster; the bulk-load tests need real region servers so that
        // HFiles can be loaded against tables pre-split into multiple regions.
        Configuration conf = HBaseConfiguration.create();

        // Workaround for HBASE-5711, we need to set config value dfs.datanode.data.dir.perm
        // equal to the permissions of the temp dirs on the filesystem. These temp dirs were
        // probably created using this process' umask. So we guess the temp dir permissions as
        // 0777 & ~umask, and use that to set the config value.
        Process process = Runtime.getRuntime().exec("/bin/sh -c umask");
        BufferedReader br = new BufferedReader(
                new InputStreamReader(process.getInputStream(), Charset.forName("UTF-8")));
        int rc = process.waitFor();
        if (rc == 0) {
            String umask = br.readLine();

            int umaskBits = Integer.parseInt(umask, 8);
            int permBits = 0777 & ~umaskBits;
            String perms = Integer.toString(permBits, 8);

            conf.set("dfs.datanode.data.dir.perm", perms);
        }

        HBASE_TEST_UTILITY = new HBaseTestingUtility(conf);
        HBASE_TEST_UTILITY.startMiniCluster(1);
    }

    private static HTable createTable(int splits) throws Exception {
        HColumnDescriptor hcol = new HColumnDescriptor(TEST_FAMILY);
        return createTable(splits, hcol);
    }

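    /**
     * Creates a randomly named table containing the given column families, pre-split into the
     * requested number of regions between 'a' and 'z'.
     */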
    private static HTable createTable(int splits, HColumnDescriptor... hcols) throws Exception {
        byte[] tableName = Bytes.toBytes("test_table_" + RANDOM.nextInt(1000000000));
        HBaseAdmin admin = HBASE_TEST_UTILITY.getHBaseAdmin();
        HTableDescriptor htable = new HTableDescriptor(tableName);
        for (HColumnDescriptor hcol : hcols) {
            htable.addFamily(hcol);
        }
        admin.createTable(htable, Bytes.split(Bytes.toBytes("a"), Bytes.toBytes("z"), splits));
        HBASE_TEST_UTILITY.waitTableAvailable(tableName, 30000);
        return new HTable(HBASE_TEST_UTILITY.getConfiguration(), tableName);
    }

    @AfterClass
    public static void tearDownClass() throws Exception {
        HBASE_TEST_UTILITY.shutdownMiniCluster();
    }

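    // Clear the shared temp directory before each test so output from earlier tests cannot leak in.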
    @Before
    public void setUp() throws IOException {
        FileSystem fs = HBASE_TEST_UTILITY.getTestFileSystem();
        fs.delete(TEMP_DIR, true);
    }

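    /**
     * Writes word counts for shakes.txt directly to an HFile target and verifies one count by
     * reading the generated HFiles back.
     */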
    @Test
    public void testHFileTarget() throws Exception {
        Pipeline pipeline = new SparkPipeline("local", "hfile", SparkHFileTargetIT.class,
                HBASE_TEST_UTILITY.getConfiguration());
        Path inputPath = copyResourceFileToHDFS("shakes.txt");
        Path outputPath = getTempPathOnHDFS("out");

        PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
        PCollection<String> words = split(shakespeare, "\\s+");
        PTable<String, Long> wordCounts = words.count();
        pipeline.write(convertToKeyValues(wordCounts), ToHBase.hfile(outputPath));

        PipelineResult result = pipeline.run();
        assertTrue(result.succeeded());

        FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
        KeyValue kv = readFromHFiles(fs, outputPath, "and");
        assertEquals(427L, Bytes.toLong(kv.getValue()));
        pipeline.done();
    }

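    /**
     * Writes word counts as Puts across two column families, bulk-loads the resulting HFiles into
     * a pre-split table with LoadIncrementalHFiles, and verifies a handful of counts in both
     * families.
     */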
    @Test
    public void testBulkLoad() throws Exception {
        Pipeline pipeline = new SparkPipeline("local", "hfile", SparkHFileTargetIT.class,
                HBASE_TEST_UTILITY.getConfiguration());
        Path inputPath = copyResourceFileToHDFS("shakes.txt");
        Path outputPath = getTempPathOnHDFS("out");
        byte[] columnFamilyA = Bytes.toBytes("colfamA");
        byte[] columnFamilyB = Bytes.toBytes("colfamB");
        HTable testTable = createTable(26, new HColumnDescriptor(columnFamilyA),
                new HColumnDescriptor(columnFamilyB));
        PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
        PCollection<String> words = split(shakespeare, "\\s+");
        PTable<String, Long> wordCounts = words.count();
        PCollection<Put> wordCountPuts = convertToPuts(wordCounts, columnFamilyA, columnFamilyB);
        HFileUtils.writePutsToHFilesForIncrementalLoad(wordCountPuts, testTable, outputPath);

        PipelineResult result = pipeline.run();
        assertTrue(result.succeeded());

        new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration()).doBulkLoad(outputPath, testTable);

        Map<String, Long> EXPECTED = ImmutableMap.<String, Long>builder().put("__EMPTY__", 1470L).put("the", 620L)
                .put("and", 427L).put("of", 396L).put("to", 367L).build();

        for (Map.Entry<String, Long> e : EXPECTED.entrySet()) {
            assertEquals((long) e.getValue(), getWordCountFromTable(testTable, columnFamilyA, e.getKey()));
            assertEquals((long) e.getValue(), getWordCountFromTable(testTable, columnFamilyB, e.getKey()));
        }
        pipeline.done();
    }

    /**
     * A single pipeline writing to multiple independent HFile targets; regression test for
     * CRUNCH-251.
     */
    @Test
    public void testMultipleHFileTargets() throws Exception {
        Pipeline pipeline = new SparkPipeline("local", "hfile", SparkHFileTargetIT.class,
                HBASE_TEST_UTILITY.getConfiguration());
        Path inputPath = copyResourceFileToHDFS("shakes.txt");
        Path outputPath1 = getTempPathOnHDFS("out1");
        Path outputPath2 = getTempPathOnHDFS("out2");
        HTable table1 = createTable(26);
        HTable table2 = createTable(26);
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration());

        PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
        PCollection<String> words = split(shakespeare, "\\s+");
        PCollection<String> shortWords = words.filter(SHORT_WORD_FILTER);
        PCollection<String> longWords = words.filter(FilterFns.not(SHORT_WORD_FILTER));
        PTable<String, Long> shortWordCounts = shortWords.count();
        PTable<String, Long> longWordCounts = longWords.count();
        HFileUtils.writePutsToHFilesForIncrementalLoad(convertToPuts(shortWordCounts), table1, outputPath1);
        HFileUtils.writePutsToHFilesForIncrementalLoad(convertToPuts(longWordCounts), table2, outputPath2);

        PipelineResult result = pipeline.run();
        assertTrue(result.succeeded());

        loader.doBulkLoad(outputPath1, table1);
        loader.doBulkLoad(outputPath2, table2);

        assertEquals(396L, getWordCountFromTable(table1, "of"));
        assertEquals(427L, getWordCountFromTable(table2, "and"));
        pipeline.done();
    }

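    /**
     * Verifies that the data block encoding configured on the column family is honored in the
     * HFiles written for incremental load.
     */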
    @Test
    public void testHFileUsesFamilyConfig() throws Exception {
        DataBlockEncoding newBlockEncoding = DataBlockEncoding.PREFIX;
        assertNotSame(newBlockEncoding, DataBlockEncoding.valueOf(HColumnDescriptor.DEFAULT_DATA_BLOCK_ENCODING));

        Pipeline pipeline = new SparkPipeline("local", "hfile", SparkHFileTargetIT.class,
                HBASE_TEST_UTILITY.getConfiguration());
        Path inputPath = copyResourceFileToHDFS("shakes.txt");
        Path outputPath = getTempPathOnHDFS("out");
        HColumnDescriptor hcol = new HColumnDescriptor(TEST_FAMILY);
        hcol.setDataBlockEncoding(newBlockEncoding);
        HTable testTable = createTable(26, hcol);

        PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
        PCollection<String> words = split(shakespeare, "\\s+");
        PTable<String, Long> wordCounts = words.count();
        PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
        HFileUtils.writePutsToHFilesForIncrementalLoad(wordCountPuts, testTable, outputPath);

        PipelineResult result = pipeline.run();
        assertTrue(result.succeeded());

        int hfilesCount = 0;
        Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
        FileSystem fs = outputPath.getFileSystem(conf);
        for (FileStatus e : fs.listStatus(new Path(outputPath, Bytes.toString(TEST_FAMILY)))) {
            Path f = e.getPath();
            if (!f.getName().startsWith("part-")) { // filter out "_SUCCESS"
                continue;
            }
            HFile.Reader reader = null;
            try {
                reader = HFile.createReader(fs, f, new CacheConfig(conf), conf);
                assertEquals(DataBlockEncoding.PREFIX, reader.getDataBlockEncoding());
            } finally {
                if (reader != null) {
                    reader.close();
                }
            }
            hfilesCount++;
        }
        assertTrue(hfilesCount > 0);
        pipeline.done();
    }

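    /**
     * Converts each (word, count) pair into a Put keyed by the word (empty words are replaced
     * with "__EMPTY__"), storing the count under the "count" qualifier in each target column
     * family.
     */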
    private static PCollection<Put> convertToPuts(PTable<String, Long> in) {
        return convertToPuts(in, TEST_FAMILY);
    }

    private static PCollection<Put> convertToPuts(PTable<String, Long> in, final byte[]... columnFamilies) {
        return in.parallelDo(new MapFn<Pair<String, Long>, Put>() {
            @Override
            public Put map(Pair<String, Long> input) {
                String w = input.first();
                if (w.length() == 0) {
                    w = "__EMPTY__";
                }
                long c = input.second();
                Put p = new Put(Bytes.toBytes(w));
                for (byte[] columnFamily : columnFamilies) {
                    p.add(columnFamily, TEST_QUALIFIER, Bytes.toBytes(c));
                }
                return p;
            }
        }, HBaseTypes.puts());
    }

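    /**
     * Converts each (word, count) pair into a KeyValue and sorts the cells using
     * HFileUtils.KeyValueComparator so they can be written directly to an HFile target.
     */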
    private static PCollection<KeyValue> convertToKeyValues(PTable<String, Long> in) {
        return in.parallelDo(new MapFn<Pair<String, Long>, Pair<KeyValue, Void>>() {
            @Override
            public Pair<KeyValue, Void> map(Pair<String, Long> input) {
                String w = input.first();
                if (w.length() == 0) {
                    w = "__EMPTY__";
                }
                long c = input.second();
                Cell cell = CellUtil.createCell(Bytes.toBytes(w), Bytes.toBytes(c));
                return Pair.of(KeyValue.cloneAndAddTags(cell, ImmutableList.<Tag>of()), null);
            }
        }, tableOf(HBaseTypes.keyValues(), nulls()))
                .groupByKey(
                        GroupingOptions.builder().sortComparatorClass(HFileUtils.KeyValueComparator.class).build())
                .ungroup().keys();
    }

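    /** Splits each input line on the given regex and emits the individual tokens. */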
    private static PCollection<String> split(PCollection<String> in, final String regex) {
        return in.parallelDo(new DoFn<String, String>() {
            @Override
            public void process(String input, Emitter<String> emitter) {
                for (String w : input.split(regex)) {
                    emitter.emit(w);
                }
            }
        }, Writables.strings());
    }

    /** Reads the first value on a given row from a bunch of hfiles. */
    private static KeyValue readFromHFiles(FileSystem fs, Path mrOutputPath, String row) throws IOException {
        List<KeyValueScanner> scanners = Lists.newArrayList();
        KeyValue fakeKV = KeyValue.createFirstOnRow(Bytes.toBytes(row));
        for (FileStatus e : fs.listStatus(mrOutputPath)) {
            Path f = e.getPath();
            if (!f.getName().startsWith("part-")) { // filter out "_SUCCESS"
                continue;
            }
            StoreFile.Reader reader = new StoreFile.Reader(fs, f, new CacheConfig(fs.getConf()), fs.getConf());
            StoreFileScanner scanner = reader.getStoreFileScanner(false, false);
            scanner.seek(fakeKV); // have to call seek of each underlying scanner, otherwise KeyValueHeap won't work
            scanners.add(scanner);
        }
        assertTrue(!scanners.isEmpty());
        KeyValueScanner kvh = new KeyValueHeap(scanners, KeyValue.COMPARATOR);
        boolean seekOk = kvh.seek(fakeKV);
        assertTrue(seekOk);
        Cell kv = kvh.next();
        kvh.close();
        return KeyValue.cloneAndAddTags(kv, ImmutableList.<Tag>of());
    }

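    /** Copies a classpath resource to a temporary path on the test cluster's filesystem. */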
    private static Path copyResourceFileToHDFS(String resourceName) throws IOException {
        Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        Path resultPath = getTempPathOnHDFS(resourceName);
        InputStream in = null;
        OutputStream out = null;
        try {
            in = Resources.getResource(resourceName).openConnection().getInputStream();
            out = fs.create(resultPath);
            IOUtils.copy(in, out);
        } finally {
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(out);
        }
        return resultPath;
    }

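    /** Returns a fully qualified path for the given file name under the shared temp directory. */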
    private static Path getTempPathOnHDFS(String fileName) throws IOException {
        Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        Path result = new Path(TEMP_DIR, fileName);
        return result.makeQualified(fs);
    }

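    /**
     * Reads the count stored for the given word from the table, failing the test if the row is
     * missing.
     */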
    private static long getWordCountFromTable(HTable table, String word) throws IOException {
        return getWordCountFromTable(table, TEST_FAMILY, word);
    }

    private static long getWordCountFromTable(HTable table, byte[] columnFamily, String word) throws IOException {
        Get get = new Get(Bytes.toBytes(word));
        get.addFamily(columnFamily);
        byte[] value = table.get(get).value();
        if (value == null) {
            fail("no such row: " + word);
        }
        return Bytes.toLong(value);
    }
}