org.apache.kylin.engine.spark.SparkCubing.java Source code

Introduction

Here is the source code for org.apache.kylin.engine.spark.SparkCubing.java, Apache Kylin's Spark-based cube build application.
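
SparkCubing extends Kylin's AbstractApplication and requires five options: -hiveTable, -cubeName, -segmentId, -confPath and -coprocessor. The class is normally launched through spark-submit; as a rough, hypothetical sketch of driving it programmatically, assuming AbstractApplication exposes an execute(String[]) entry point that parses the options declared below:

String[] args = new String[] {
        "-hiveTable", "default.kylin_intermediate_my_cube",   // hypothetical intermediate table
        "-cubeName", "my_cube",                               // hypothetical cube name
        "-segmentId", "c0a80001-0000-4000-8000-000000000000", // hypothetical segment id
        "-confPath", "/etc/kylin/conf",                       // hypothetical conf directory
        "-coprocessor", "/tmp/kylin-coprocessor.jar"          // hypothetical coprocessor jar
};
new SparkCubing().execute(args); // assumption: execute(String[]) is inherited from AbstractApplication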

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.kylin.engine.spark;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.UUID;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.ToolRunner;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.AbstractApplication;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.ClassUtil;
import org.apache.kylin.common.util.Dictionary;
import org.apache.kylin.common.util.OptionsHelper;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.CubeUpdate;
import org.apache.kylin.cube.cuboid.Cuboid;
import org.apache.kylin.cube.cuboid.CuboidScheduler;
import org.apache.kylin.cube.inmemcubing.AbstractInMemCubeBuilder;
import org.apache.kylin.cube.inmemcubing.DoggedCubeBuilder;
import org.apache.kylin.cube.kv.CubeDimEncMap;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.cube.model.CubeJoinedFlatTableEnrich;
import org.apache.kylin.cube.model.DimensionDesc;
import org.apache.kylin.cube.model.RowKeyDesc;
import org.apache.kylin.cube.util.CubingUtils;
import org.apache.kylin.dict.DictionaryGenerator;
import org.apache.kylin.dict.IterableDictionaryValueEnumerator;
import org.apache.kylin.engine.EngineFactory;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.CubeStatsReader;
import org.apache.kylin.engine.spark.cube.BufferedCuboidWriter;
import org.apache.kylin.engine.spark.cube.DefaultTupleConverter;
import org.apache.kylin.engine.spark.util.IteratorUtils;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.MeasureAggregators;
import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
import org.apache.kylin.metadata.model.SegmentStatusEnum;
import org.apache.kylin.metadata.model.TblColRef;
import org.apache.kylin.metadata.realization.RealizationStatusEnum;
import org.apache.kylin.storage.hbase.HBaseConnection;
import org.apache.kylin.storage.hbase.steps.CreateHTableJob;
import org.apache.kylin.storage.hbase.steps.CubeHTableUtil;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
import com.google.common.primitives.UnsignedBytes;

import scala.Tuple2;

/**
 * Spark cubing application: builds a Kylin cube segment end to end by building
 * dictionaries from the Hive intermediate table, sampling cuboid cardinalities,
 * creating the target HBase table, building cuboid HFiles and bulk-loading them.
 */
public class SparkCubing extends AbstractApplication {

    protected static final Logger logger = LoggerFactory.getLogger(SparkCubing.class);

    private static final Option OPTION_INPUT_PATH = OptionBuilder.withArgName("path").hasArg().isRequired(true)
            .withDescription("Hive Intermediate Table").create("hiveTable");
    private static final Option OPTION_CUBE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_CUBE_NAME).hasArg()
            .isRequired(true).withDescription("Cube Name").create(BatchConstants.ARG_CUBE_NAME);
    private static final Option OPTION_SEGMENT_ID = OptionBuilder.withArgName("segment").hasArg().isRequired(true)
            .withDescription("Cube Segment Id").create("segmentId");
    private static final Option OPTION_CONF_PATH = OptionBuilder.withArgName("confPath").hasArg().isRequired(true)
            .withDescription("Configuration Path").create("confPath");
    private static final Option OPTION_COPROCESSOR = OptionBuilder.withArgName("coprocessor").hasArg()
            .isRequired(true).withDescription("Coprocessor Jar Path").create("coprocessor");

    private Options options;

    public SparkCubing() {
        options = new Options();
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CONF_PATH);
        options.addOption(OPTION_COPROCESSOR);
    }

    @Override
    protected Options getOptions() {
        return options;
    }

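    /**
     * Points KylinConfig at a local metadata folder by overriding the kylin.conf system
     * property and the metadata URL; returns the existing instance if that folder is
     * already in effect.
     */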
    public static KylinConfig loadKylinPropsAndMetadata(String folder) throws IOException {
        File metaDir = new File(folder);
        if (!metaDir.getAbsolutePath().equals(System.getProperty(KylinConfig.KYLIN_CONF))) {
            System.setProperty(KylinConfig.KYLIN_CONF, metaDir.getAbsolutePath());
            logger.info("The absolute path for meta dir is " + metaDir.getAbsolutePath());
            KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
            System.out.println("setting metadataUrl to " + metaDir.getAbsolutePath());
            kylinConfig.setMetadataUrl(metaDir.getAbsolutePath());
            return kylinConfig;
        } else {
            return KylinConfig.getInstanceFromEnv();
        }
    }

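    /**
     * Adds the conf directory to the driver classpath and distributes every *.xml and
     * *.properties file in it to the executors via SparkContext.addFile, so prepare()
     * can pick them up on the executor side.
     */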
    private void setupClasspath(JavaSparkContext sc, String confPath) throws Exception {
        ClassUtil.addClasspath(confPath);
        final File[] files = new File(confPath).listFiles(new FileFilter() {
            @Override
            public boolean accept(File pathname) {
                if (pathname.getAbsolutePath().endsWith(".xml")) {
                    return true;
                }
                if (pathname.getAbsolutePath().endsWith(".properties")) {
                    return true;
                }
                return false;
            }
        });
        if (files == null) {
            return;
        }
        for (File file : files) {
            sc.addFile(file.getAbsolutePath());
        }
    }

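    /**
     * Builds a dictionary for each dictionary-encoded rowkey column by collecting the
     * distinct values of the corresponding flat-table column from the intermediate
     * DataFrame, then persists the dictionaries on the segment and updates the cube.
     */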
    private void writeDictionary(DataFrame intermediateTable, String cubeName, String segmentId) throws Exception {
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        final CubeManager cubeManager = CubeManager.getInstance(kylinConfig);
        final CubeInstance cubeInstance = cubeManager.reloadCubeLocal(cubeName);
        final String[] columns = intermediateTable.columns();
        final CubeSegment seg = cubeInstance.getSegmentById(segmentId);
        final CubeDesc cubeDesc = cubeInstance.getDescriptor();
        final HashMap<Integer, TblColRef> tblColRefMap = Maps.newHashMap();
        final CubeJoinedFlatTableEnrich flatDesc = new CubeJoinedFlatTableEnrich(
                EngineFactory.getJoinedFlatTableDesc(seg), cubeDesc);
        final List<TblColRef> baseCuboidColumn = Cuboid.findById(cubeDesc, Cuboid.getBaseCuboidId(cubeDesc))
                .getColumns();
        final long start = System.currentTimeMillis();
        final RowKeyDesc rowKey = cubeDesc.getRowkey();
        for (int i = 0; i < baseCuboidColumn.size(); i++) {
            TblColRef col = baseCuboidColumn.get(i);
            if (!rowKey.isUseDictionary(col)) {
                continue;
            }
            final int rowKeyColumnIndex = flatDesc.getRowKeyColumnIndexes()[i];
            tblColRefMap.put(rowKeyColumnIndex, col);
        }

        Map<TblColRef, Dictionary<String>> dictionaryMap = Maps.newHashMap();
        for (Map.Entry<Integer, TblColRef> entry : tblColRefMap.entrySet()) {
            final String column = columns[entry.getKey()];
            final TblColRef tblColRef = entry.getValue();
            final DataFrame frame = intermediateTable.select(column).distinct();

            final Row[] rows = frame.collect();
            dictionaryMap.put(tblColRef, DictionaryGenerator.buildDictionary(tblColRef.getType(),
                    new IterableDictionaryValueEnumerator(new Iterable<String>() {
                        @Override
                        public Iterator<String> iterator() {
                            return new Iterator<String>() {
                                int i = 0;

                                @Override
                                public boolean hasNext() {
                                    return i < rows.length;
                                }

                                @Override
                                public String next() {
                                    if (hasNext()) {
                                        final Row row = rows[i++];
                                        final Object o = row.get(0);
                                        return o != null ? o.toString() : null;
                                    } else {
                                        throw new NoSuchElementException();
                                    }
                                }

                                @Override
                                public void remove() {
                                    throw new UnsupportedOperationException();
                                }
                            };
                        }
                    })));
        }
        final long end = System.currentTimeMillis();
        CubingUtils.writeDictionary(seg, dictionaryMap, start, end);
        try {
            CubeUpdate cubeBuilder = new CubeUpdate(cubeInstance);
            cubeBuilder.setToUpdateSegs(seg);
            cubeManager.updateCube(cubeBuilder);
        } catch (IOException e) {
            throw new RuntimeException("Failed to update cube metadata: " + e.getLocalizedMessage(), e);
        }
    }

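    /**
     * Estimates per-cuboid cardinality by aggregating the row RDD into one HLLCounter
     * per cuboid: each rowkey column value is hashed, and for every cuboid the hashes
     * of its member columns are combined and added to that cuboid's counter.
     */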
    private Map<Long, HLLCounter> sampling(final JavaRDD<List<String>> rowJavaRDD, final String cubeName,
            String segmentId) throws Exception {
        CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv())
                .reloadCubeLocal(cubeName);
        CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
        CubeDesc cubeDesc = cubeInstance.getDescriptor();
        CuboidScheduler cuboidScheduler = new CuboidScheduler(cubeDesc);
        List<Long> allCuboidIds = cuboidScheduler.getAllCuboidIds();
        final HashMap<Long, HLLCounter> zeroValue = Maps.newHashMap();
        for (Long id : allCuboidIds) {
            zeroValue.put(id, new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
        }

        CubeJoinedFlatTableEnrich flatDesc = new CubeJoinedFlatTableEnrich(
                EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);

        final int[] rowKeyColumnIndexes = flatDesc.getRowKeyColumnIndexes();
        final int nRowKey = cubeDesc.getRowkey().getRowKeyColumns().length;
        final long baseCuboidId = Cuboid.getBaseCuboidId(cubeDesc);
        final Map<Long, Integer[]> allCuboidsBitSet = Maps.newHashMapWithExpectedSize(allCuboidIds.size());
        final ByteArray[] rowHashcodes = new ByteArray[nRowKey];

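        // Pre-compute, for every cuboid, the indexes of the rowkey columns it contains by
        // scanning the cuboid id bitmask from the base cuboid's highest bit downwards.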
        for (Long cuboidId : allCuboidIds) {
            Integer[] cuboidBitSet = new Integer[Long.bitCount(cuboidId)];

            long mask = Long.highestOneBit(baseCuboidId);
            int position = 0;
            for (int i = 0; i < nRowKey; i++) {
                if ((mask & cuboidId) > 0) {
                    cuboidBitSet[position] = i;
                    position++;
                }
                mask = mask >> 1;
            }
            allCuboidsBitSet.put(cuboidId, cuboidBitSet);
        }
        for (int i = 0; i < nRowKey; ++i) {
            rowHashcodes[i] = new ByteArray();
        }

        final HashMap<Long, HLLCounter> samplingResult = rowJavaRDD.aggregate(zeroValue,
                new Function2<HashMap<Long, HLLCounter>, List<String>, HashMap<Long, HLLCounter>>() {

                    final HashFunction hashFunction = Hashing.murmur3_128();

                    @Override
                    public HashMap<Long, HLLCounter> call(HashMap<Long, HLLCounter> v1, List<String> v2)
                            throws Exception {
                        for (int i = 0; i < nRowKey; i++) {
                            Hasher hc = hashFunction.newHasher();
                            String colValue = v2.get(rowKeyColumnIndexes[i]);
                            if (colValue != null) {
                                rowHashcodes[i].set(hc.putString(colValue).hash().asBytes());
                            } else {
                                rowHashcodes[i].set(hc.putInt(0).hash().asBytes());
                            }
                        }

                        for (Map.Entry<Long, Integer[]> entry : allCuboidsBitSet.entrySet()) {
                            Hasher hc = hashFunction.newHasher();
                            HLLCounter counter = v1.get(entry.getKey());
                            final Integer[] cuboidBitSet = entry.getValue();
                            for (int position = 0; position < cuboidBitSet.length; position++) {
                                hc.putBytes(rowHashcodes[cuboidBitSet[position]].array());
                            }
                            counter.add(hc.hash().asBytes());
                        }
                        return v1;
                    }
                },
                new Function2<HashMap<Long, HLLCounter>, HashMap<Long, HLLCounter>, HashMap<Long, HLLCounter>>() {
                    @Override
                    public HashMap<Long, HLLCounter> call(HashMap<Long, HLLCounter> v1,
                            HashMap<Long, HLLCounter> v2) throws Exception {
                        Preconditions.checkArgument(v1.size() == v2.size());
                        Preconditions.checkArgument(v1.size() > 0);
                        for (Map.Entry<Long, HLLCounter> entry : v1.entrySet()) {
                            final HLLCounter counter1 = entry.getValue();
                            final HLLCounter counter2 = v2.get(entry.getKey());
                            counter1.merge(Preconditions.checkNotNull(counter2, "counter cannot be null"));
                        }
                        return v1;
                    }

                });
        return samplingResult;
    }

    /**
     * Builds all cuboids from the flat-table rows and writes them out as HFiles.
     *
     * @return the HFile location
     */
    private String build(JavaRDD<List<String>> javaRDD, final String cubeName, final String segmentId,
            final byte[][] splitKeys) throws Exception {
        CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
        CubeDesc cubeDesc = cubeInstance.getDescriptor();
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
        List<TblColRef> baseCuboidColumn = Cuboid.findById(cubeDesc, Cuboid.getBaseCuboidId(cubeDesc)).getColumns();
        final Map<TblColRef, Integer> columnLengthMap = Maps.newHashMap();
        final CubeDimEncMap dimEncMap = cubeSegment.getDimensionEncodingMap();
        for (TblColRef tblColRef : baseCuboidColumn) {
            columnLengthMap.put(tblColRef, dimEncMap.get(tblColRef).getLengthOfEncoding());
        }
        final Map<TblColRef, Dictionary<String>> dictionaryMap = Maps.newHashMap();
        for (DimensionDesc dim : cubeDesc.getDimensions()) {
            // dictionary
            for (TblColRef col : dim.getColumnRefs()) {
                if (cubeDesc.getRowkey().isUseDictionary(col)) {
                    Dictionary<String> dict = cubeSegment.getDictionary(col);
                    if (dict == null) {
                        System.err.println("Dictionary for " + col + " was not found.");
                        continue;
                    }
                    dictionaryMap.put(col, dict);
                    System.out.println("col:" + col + " dictionary size:" + dict.getSize());
                }
            }
        }

        for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
            FunctionDesc func = measureDesc.getFunction();
            List<TblColRef> colRefs = func.getMeasureType().getColumnsNeedDictionary(func);
            for (TblColRef col : colRefs) {
                dictionaryMap.put(col, cubeSegment.getDictionary(col));
            }
        }

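        // For each Spark partition: feed all rows into an in-memory (Dogged) cube builder running
        // on a background thread, and emit the resulting (rowkey, encoded measures) pairs.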
        final JavaPairRDD<byte[], byte[]> javaPairRDD = javaRDD.glom()
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<List<List<String>>>, byte[], byte[]>() {

                    @Override
                    public Iterable<Tuple2<byte[], byte[]>> call(Iterator<List<List<String>>> listIterator)
                            throws Exception {
                        long t = System.currentTimeMillis();
                        prepare();

                        final CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv())
                                .getCube(cubeName);

                        LinkedBlockingQueue<List<String>> blockingQueue = new LinkedBlockingQueue<>();
                        System.out.println("load properties finished");
                        IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(cubeSegment);
                        AbstractInMemCubeBuilder inMemCubeBuilder = new DoggedCubeBuilder(
                                cubeInstance.getDescriptor(), flatDesc, dictionaryMap);
                        final SparkCuboidWriter sparkCuboidWriter = new BufferedCuboidWriter(
                                new DefaultTupleConverter(cubeInstance.getSegmentById(segmentId), columnLengthMap));
                        Executors.newCachedThreadPool()
                                .submit(inMemCubeBuilder.buildAsRunnable(blockingQueue, sparkCuboidWriter));
                        try {
                            while (listIterator.hasNext()) {
                                for (List<String> row : listIterator.next()) {
                                    blockingQueue.put(row);
                                }
                            }
                            blockingQueue.put(Collections.<String>emptyList());
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
                        System.out.println("build partition cost: " + (System.currentTimeMillis() - t) + "ms");
                        return sparkCuboidWriter.getResult();
                    }
                });

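        // Write the cuboid output as HFiles under a fresh "hfile_<uuid>" directory in the
        // HDFS working directory, aggregating duplicate rowkeys along the way.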
        KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        Configuration conf = getConfigurationForHFile(cubeSegment.getStorageLocationIdentifier());
        Path path = new Path(kylinConfig.getHdfsWorkingDirectory(), "hfile_" + UUID.randomUUID().toString());
        Preconditions.checkArgument(!FileSystem.get(conf).exists(path));
        String url = conf.get("fs.defaultFS") + path.toString();
        System.out.println("use " + url + " as hfile");
        List<MeasureDesc> measuresDescs = cubeDesc.getMeasures();
        final int measureSize = measuresDescs.size();
        final String[] dataTypes = new String[measureSize];
        for (int i = 0; i < dataTypes.length; i++) {
            dataTypes[i] = measuresDescs.get(i).getFunction().getReturnType();
        }
        final MeasureAggregators aggs = new MeasureAggregators(measuresDescs);
        writeToHFile2(javaPairRDD, dataTypes, measureSize, aggs, splitKeys, conf, url);
        return url;
    }

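    /**
     * Repartitions the (rowkey, value) pairs by the HBase region split keys, sorts each
     * partition by rowkey, merges pairs that share a rowkey by decoding, aggregating and
     * re-encoding their measures, and writes the result as HFiles (column family "F1",
     * qualifier "M") through HFileOutputFormat.
     */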
    private void writeToHFile2(final JavaPairRDD<byte[], byte[]> javaPairRDD, final String[] dataTypes,
            final int measureSize, final MeasureAggregators aggs, final byte[][] splitKeys,
            final Configuration conf, final String hFileLocation) {
        javaPairRDD.repartitionAndSortWithinPartitions(new Partitioner() {
            @Override
            public int numPartitions() {
                return splitKeys.length + 1;
            }

            @Override
            public int getPartition(Object key) {
                Preconditions.checkArgument(key instanceof byte[]);
                for (int i = 0, n = splitKeys.length; i < n; ++i) {
                    if (UnsignedBytes.lexicographicalComparator().compare((byte[]) key, splitKeys[i]) < 0) {
                        return i;
                    }
                }
                return splitKeys.length;
            }
        }, UnsignedBytes.lexicographicalComparator())
                .mapPartitions(new FlatMapFunction<Iterator<Tuple2<byte[], byte[]>>, Tuple2<byte[], byte[]>>() {
                    @Override
                    public Iterable<Tuple2<byte[], byte[]>> call(
                            final Iterator<Tuple2<byte[], byte[]>> tuple2Iterator) throws Exception {
                        return new Iterable<Tuple2<byte[], byte[]>>() {
                            final BufferedMeasureCodec codec = new BufferedMeasureCodec(dataTypes);
                            final Object[] input = new Object[measureSize];
                            final Object[] result = new Object[measureSize];

                            @Override
                            public Iterator<Tuple2<byte[], byte[]>> iterator() {
                                return IteratorUtils.merge(tuple2Iterator,
                                        UnsignedBytes.lexicographicalComparator(),
                                        new Function<Iterable<byte[]>, byte[]>() {
                                            @Override
                                            public byte[] call(Iterable<byte[]> v1) throws Exception {
                                                final LinkedList<byte[]> list = Lists.newLinkedList(v1);
                                                if (list.size() == 1) {
                                                    return list.get(0);
                                                }
                                                aggs.reset();
                                                for (byte[] v : list) {
                                                    codec.decode(ByteBuffer.wrap(v), input);
                                                    aggs.aggregate(input);
                                                }
                                                aggs.collectStates(result);
                                                ByteBuffer buffer = codec.encode(result);
                                                byte[] bytes = new byte[buffer.position()];
                                                System.arraycopy(buffer.array(), buffer.arrayOffset(), bytes, 0,
                                                        buffer.position());
                                                return bytes;
                                            }
                                        });
                            }
                        };
                    }
                }, true).mapToPair(new PairFunction<Tuple2<byte[], byte[]>, ImmutableBytesWritable, KeyValue>() {
                    @Override
                    public Tuple2<ImmutableBytesWritable, KeyValue> call(Tuple2<byte[], byte[]> tuple2)
                            throws Exception {
                        ImmutableBytesWritable key = new ImmutableBytesWritable(tuple2._1());
                        KeyValue value = new KeyValue(tuple2._1(), "F1".getBytes(), "M".getBytes(), tuple2._2());
                        return new Tuple2<>(key, value);
                    }
                }).saveAsNewAPIHadoopFile(hFileLocation, ImmutableBytesWritable.class, KeyValue.class,
                        HFileOutputFormat.class, conf);
    }

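    /**
     * Executor-side setup: locates the kylin.properties distributed through SparkFiles
     * and registers its directory as the Kylin conf directory and a classpath entry.
     */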
    public static void prepare() throws Exception {
        final File file = new File(SparkFiles.get("kylin.properties"));
        final String confPath = file.getParentFile().getAbsolutePath();
        System.out.println("conf directory:" + confPath);
        System.setProperty(KylinConfig.KYLIN_CONF, confPath);
        ClassUtil.addClasspath(confPath);
    }

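    /**
     * Derives per-cuboid row counts and size estimates from the sampling result, computes
     * HBase region split keys from them, and creates the segment's HTable.
     */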
    private byte[][] createHTable(String cubeName, String segmentId, Map<Long, HLLCounter> samplingResult)
            throws Exception {
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
        final Map<Long, Long> rowCountMap = CubeStatsReader.getCuboidRowCountMapFromSampling(samplingResult, 100);
        final Map<Long, Double> cubeSizeMap = CubeStatsReader.getCuboidSizeMapFromRowCount(cubeSegment,
                rowCountMap);
        System.out.println("cube size estimation:" + cubeSizeMap);
        final byte[][] splitKeys = CreateHTableJob.getRegionSplitsFromCuboidStatistics(cubeSizeMap, kylinConfig,
                cubeSegment, null); //FIXME: passing non-null value for 'hfileSplitsOutputFolder'
        CubeHTableUtil.createHTable(cubeSegment, splitKeys);
        System.out.println(cubeSegment.getStorageLocationIdentifier() + " table created");
        return splitKeys;
    }

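    /**
     * Returns an HBase configuration prepared for incremental HFile load against the given
     * table, as set up by HFileOutputFormat.configureIncrementalLoad.
     */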
    private Configuration getConfigurationForHFile(String hTableName) throws IOException {
        final Configuration conf = HBaseConnection.getCurrentHBaseConfiguration();
        Job job = Job.getInstance(conf);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        HTable table = new HTable(conf, hTableName);
        HFileOutputFormat.configureIncrementalLoad(job, table);
        return conf;
    }

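    /**
     * Opens up permissions on the generated HFiles, bulk-loads them into the segment's
     * HTable with LoadIncrementalHFiles, and marks the segment and cube as READY.
     */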
    private void bulkLoadHFile(String cubeName, String segmentId, String hfileLocation) throws Exception {
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
        final Configuration hbaseConf = HBaseConnection.getCurrentHBaseConfiguration();

        FsShell shell = new FsShell(hbaseConf);
        try {
            shell.run(new String[] { "-chmod", "-R", "777", hfileLocation });
        } catch (Exception e) {
            logger.error("Couldn't change the file permissions", e);
            throw new IOException(e);
        }

        String[] newArgs = new String[2];
        newArgs[0] = hfileLocation;
        newArgs[1] = cubeSegment.getStorageLocationIdentifier();

        int ret = ToolRunner.run(new LoadIncrementalHFiles(hbaseConf), newArgs);
        System.out.println("incremental load result:" + ret);

        cubeSegment.setStatus(SegmentStatusEnum.READY);
        try {
            CubeUpdate cubeBuilder = new CubeUpdate(cubeInstance);
            cubeInstance.setStatus(RealizationStatusEnum.READY);
            cubeSegment.setStatus(SegmentStatusEnum.READY);
            cubeBuilder.setToUpdateSegs(cubeSegment);
            CubeManager.getInstance(kylinConfig).updateCube(cubeBuilder);
        } catch (IOException e) {
            throw new RuntimeException("Failed to update cube metadata: " + e.getLocalizedMessage(), e);
        }
    }

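    /**
     * Driver entry point: reads the Hive intermediate table into a DataFrame, builds
     * dictionaries, samples cuboid cardinalities, creates the target HTable, builds the
     * cuboid HFiles, and bulk-loads them into HBase.
     */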
    @Override
    protected void execute(OptionsHelper optionsHelper) throws Exception {
        final String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
        SparkConf conf = new SparkConf().setAppName("Kylin Spark Cubing");
        //memory conf
        conf.set("spark.executor.memory", "6g");
        conf.set("spark.storage.memoryFraction", "0.3");

        //serialization conf
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
        conf.set("spark.kryo.registrationRequired", "true");

        JavaSparkContext sc = new JavaSparkContext(conf);
        HiveContext sqlContext = new HiveContext(sc.sc());
        final DataFrame intermediateTable = sqlContext.sql("select * from " + hiveTable);
        final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
        final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
        final String confPath = optionsHelper.getOptionValue(OPTION_CONF_PATH);
        final String coprocessor = optionsHelper.getOptionValue(OPTION_COPROCESSOR);
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        kylinConfig.overrideCoprocessorLocalJar(coprocessor);

        setupClasspath(sc, confPath);
        intermediateTable.cache();
        writeDictionary(intermediateTable, cubeName, segmentId);
        final JavaRDD<List<String>> rowJavaRDD = intermediateTable.javaRDD()
                .map(new org.apache.spark.api.java.function.Function<Row, List<String>>() {
                    @Override
                    public List<String> call(Row v1) throws Exception {
                        ArrayList<String> result = Lists.newArrayListWithExpectedSize(v1.size());
                        for (int i = 0; i < v1.size(); i++) {
                            final Object o = v1.get(i);
                            if (o != null) {
                                result.add(o.toString());
                            } else {
                                result.add(null);
                            }
                        }
                        return result;
                    }
                });

        final Map<Long, HLLCounter> samplingResult = sampling(rowJavaRDD, cubeName, segmentId);
        final byte[][] splitKeys = createHTable(cubeName, segmentId, samplingResult);

        final String hfile = build(rowJavaRDD, cubeName, segmentId, splitKeys);
        bulkLoadHFile(cubeName, segmentId, hfile);
    }

}