org.apache.kylin.storage.hbase.steps.SparkCubeHFile.java Source code

Introduction

Here is the source code for org.apache.kylin.storage.hbase.steps.SparkCubeHFile.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.kylin.storage.hbase.steps;

import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import com.google.common.collect.Maps;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.AbstractApplication;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.common.util.OptionsHelper;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.cube.model.HBaseColumnDesc;
import org.apache.kylin.cube.model.HBaseColumnFamilyDesc;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.SerializableConfiguration;
import org.apache.kylin.engine.spark.KylinSparkJobListener;
import org.apache.kylin.engine.spark.SparkUtil;
import org.apache.kylin.job.constant.ExecutableConstants;
import org.apache.kylin.measure.MeasureCodec;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;

import scala.Tuple2;

/**
 * Spark application to build cube with the "by-layer" algorithm. Only support source data from Hive; Metadata in HBase.
 */
public class SparkCubeHFile extends AbstractApplication implements Serializable {

    protected static final Logger logger = LoggerFactory.getLogger(SparkCubeHFile.class);

    public static final Option OPTION_CUBE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_CUBE_NAME).hasArg()
            .isRequired(true).withDescription("Cube Name").create(BatchConstants.ARG_CUBE_NAME);
    public static final Option OPTION_SEGMENT_ID = OptionBuilder.withArgName("segment").hasArg().isRequired(true)
            .withDescription("Cube Segment Id").create("segmentId");
    public static final Option OPTION_META_URL = OptionBuilder.withArgName("metaUrl").hasArg().isRequired(true)
            .withDescription("HDFS metadata url").create("metaUrl");
    public static final Option OPTION_OUTPUT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_OUTPUT).hasArg()
            .isRequired(true).withDescription("HFile output path").create(BatchConstants.ARG_OUTPUT);
    public static final Option OPTION_INPUT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_INPUT).hasArg()
            .isRequired(true).withDescription("Cuboid files PATH").create(BatchConstants.ARG_INPUT);
    public static final Option OPTION_PARTITION_FILE_PATH = OptionBuilder.withArgName(BatchConstants.ARG_PARTITION)
            .hasArg().isRequired(true).withDescription("Partition file path.").create(BatchConstants.ARG_PARTITION);
    public static final Option OPTION_COUNTER_PATH = OptionBuilder.withArgName(BatchConstants.ARG_COUNTER_OUPUT)
            .hasArg().isRequired(true).withDescription("counter output path")
            .create(BatchConstants.ARG_COUNTER_OUPUT);

    private Options options;

    public SparkCubeHFile() {
        options = new Options();
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_PARTITION_FILE_PATH);
        options.addOption(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
        options.addOption(OPTION_COUNTER_PATH);
    }

    @Override
    protected Options getOptions() {
        return options;
    }

    @Override
    protected void execute(OptionsHelper optionsHelper) throws Exception {
        final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
        final String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
        final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
        final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
        final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
        final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
        final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
        final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

        Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
                KeyValueCreator.class, KeyValue.class, RowKeyWritable.class };

        SparkConf conf = new SparkConf().setAppName("Converting HFile for:" + cubeName + " segment " + segmentId);
        //serialization conf
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
        conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

        KylinSparkJobListener jobListener = new KylinSparkJobListener();
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            sc.sc().addSparkListener(jobListener);
            final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
            if (!fs.exists(partitionFilePath)) {
                throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
            }

            HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
            final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());

            final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

            final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
            final CubeDesc cubeDesc = cubeInstance.getDescriptor();
            final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

            final MeasureCodec inputCodec = new MeasureCodec(cubeDesc.getMeasures());
            final List<KeyValueCreator> keyValueCreators = Lists.newArrayList();

            for (HBaseColumnFamilyDesc cfDesc : cubeDesc.getHbaseMapping().getColumnFamily()) {
                for (HBaseColumnDesc colDesc : cfDesc.getColumns()) {
                    keyValueCreators.add(new KeyValueCreator(cubeDesc, colDesc));
                }
            }

            final int cfNum = keyValueCreators.size();
            final boolean quickPath = (keyValueCreators.size() == 1) && keyValueCreators.get(0).isFullCopy;

            logger.info("Input path: {}", inputPath);
            logger.info("Output path: {}", outputPath);
            // read partition split keys
            List<RowKeyWritable> keys = new ArrayList<>();
            try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath,
                    sc.hadoopConfiguration())) {
                RowKeyWritable key = new RowKeyWritable();
                Writable value = NullWritable.get();
                while (reader.next(key, value)) {
                    keys.add(key);
                    logger.info(" ------- split key: {}", key);
                    key = new RowKeyWritable(); // important, new an object!
                }
            }

            logger.info("There are {} split keys, totally {} hfiles", keys.size(), (keys.size() + 1));

            //HBase conf
            logger.info("Loading HBase configuration from:{}", hbaseConfFile);
            final Path hbaseConfFilePath = new Path(hbaseConfFile);
            final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());

            try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
                Configuration hbaseJobConf = new Configuration();
                hbaseJobConf.addResource(confInput);
                hbaseJobConf.set("spark.hadoop.dfs.replication", "3"); // HFile, replication=3
                Job job = Job.getInstance(hbaseJobConf, cubeSegment.getStorageLocationIdentifier());

                FileOutputFormat.setOutputPath(job, new Path(outputPath));

                // inputPath has the same FileSystem as hbaseClusterFs when in HBase standalone mode
                JavaPairRDD<Text, Text> inputRDDs = SparkUtil.parseInputPath(inputPath, hbaseClusterFs, sc,
                        Text.class, Text.class);
                final JavaPairRDD<RowKeyWritable, KeyValue> hfilerdd;
                if (quickPath) {
                    hfilerdd = inputRDDs
                            .mapToPair(new PairFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                                @Override
                                public Tuple2<RowKeyWritable, KeyValue> call(Tuple2<Text, Text> textTextTuple2)
                                        throws Exception {
                                    KeyValue outputValue = keyValueCreators.get(0).create(textTextTuple2._1,
                                            textTextTuple2._2.getBytes(), 0, textTextTuple2._2.getLength());
                                    return new Tuple2<>(
                                            new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                            outputValue);
                                }
                            });
                } else {
                    hfilerdd = inputRDDs
                            .flatMapToPair(new PairFlatMapFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                                @Override
                                public Iterator<Tuple2<RowKeyWritable, KeyValue>> call(
                                        Tuple2<Text, Text> textTextTuple2) throws Exception {

                                    List<Tuple2<RowKeyWritable, KeyValue>> result = Lists
                                            .newArrayListWithExpectedSize(cfNum);
                                    Object[] inputMeasures = new Object[cubeDesc.getMeasures().size()];
                                    inputCodec.decode(ByteBuffer.wrap(textTextTuple2._2.getBytes(), 0,
                                            textTextTuple2._2.getLength()), inputMeasures);

                                    for (int i = 0; i < cfNum; i++) {
                                        KeyValue outputValue = keyValueCreators.get(i).create(textTextTuple2._1,
                                                inputMeasures);
                                        result.add(new Tuple2<>(
                                                new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                                outputValue));
                                    }

                                    return result.iterator();
                                }
                            });
                }

                hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
                        RowKeyWritable.RowKeyComparator.INSTANCE)
                        .mapToPair(
                                new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
                                    @Override
                                    public Tuple2<ImmutableBytesWritable, KeyValue> call(
                                            Tuple2<RowKeyWritable, KeyValue> rowKeyWritableKeyValueTuple2)
                                            throws Exception {
                                        return new Tuple2<>(
                                                new ImmutableBytesWritable(
                                                        rowKeyWritableKeyValueTuple2._2.getKey()),
                                                rowKeyWritableKeyValueTuple2._2);
                                    }
                                })
                        .saveAsNewAPIHadoopDataset(job.getConfiguration());
            }

            logger.info("HDFS: Number of bytes written={}", jobListener.metrics.getBytesWritten());

            Map<String, String> counterMap = Maps.newHashMap();
            counterMap.put(ExecutableConstants.HDFS_BYTES_WRITTEN,
                    String.valueOf(jobListener.metrics.getBytesWritten()));

            // save counter to hdfs
            HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        }
    }

    static class HFilePartitioner extends Partitioner {
        private List<RowKeyWritable> keys;

        public HFilePartitioner(List splitKyes) {
            keys = splitKyes;
        }

        @Override
        public int numPartitions() {
            return keys.size() + 1;
        }

        @Override
        public int getPartition(Object o) {
            int pos = Collections.binarySearch(this.keys, (RowKeyWritable) o) + 1;
            return pos < 0 ? -pos : pos;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o)
                return true;
            if (o == null || getClass() != o.getClass())
                return false;
            HFilePartitioner that = (HFilePartitioner) o;
            return Objects.equals(keys, that.keys);
        }

        @Override
        public int hashCode() {
            return Objects.hash(keys);
        }
    }

}