org.apache.kylin.provision.BuildCubeWithStream.java Source code


Introduction

Here is the source code for org.apache.kylin.provision.BuildCubeWithStream.java, an Apache Kylin integration test that builds the streaming cube "test_streaming_table_cube" against an embedded Kafka broker: it generates test messages, runs several incremental segment builds, and then merges and refreshes the resulting segments.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.kylin.provision;

import static java.lang.Thread.sleep;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Random;
import java.util.TimeZone;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;

import org.I0Itec.zkclient.ZkConnection;
import org.apache.commons.lang3.StringUtils;
import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.kafka.common.requests.MetadataResponse;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.ClassUtil;
import org.apache.kylin.common.util.HBaseMetadataTestCase;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.CubeUpdate;
import org.apache.kylin.engine.EngineFactory;
import org.apache.kylin.job.DeployUtil;
import org.apache.kylin.job.engine.JobEngineConfig;
import org.apache.kylin.job.execution.AbstractExecutable;
import org.apache.kylin.job.execution.DefaultChainedExecutable;
import org.apache.kylin.job.execution.ExecutableManager;
import org.apache.kylin.job.execution.ExecutableState;
import org.apache.kylin.job.impl.threadpool.DefaultScheduler;
import org.apache.kylin.job.streaming.Kafka10DataLoader;
import org.apache.kylin.metadata.model.SegmentStatusEnum;
import org.apache.kylin.metadata.streaming.StreamingConfig;
import org.apache.kylin.metadata.streaming.StreamingManager;
import org.apache.kylin.source.ISource;
import org.apache.kylin.source.SourceFactory;
import org.apache.kylin.source.SourcePartition;
import org.apache.kylin.source.kafka.KafkaConfigManager;
import org.apache.kylin.source.kafka.config.BrokerConfig;
import org.apache.kylin.source.kafka.config.KafkaConfig;
import org.apache.kylin.storage.hbase.util.ZookeeperJobLock;
import org.apache.kylin.storage.hbase.util.ZookeeperUtil;
import org.apache.kylin.rest.job.StorageCleanupJob;
import org.junit.Assert;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;

/**
 * Integration test for the streaming cubing case "test_streaming_table".
 */
public class BuildCubeWithStream {

    private static final Logger logger = LoggerFactory.getLogger(BuildCubeWithStream.class);

    protected CubeManager cubeManager;
    private DefaultScheduler scheduler;
    protected ExecutableManager jobService;
    static final String cubeName = "test_streaming_table_cube";

    private KafkaConfig kafkaConfig;
    private MockKafka kafkaServer;
    private ZkConnection zkConnection;
    private final String kafkaZkPath = "/" + UUID.randomUUID().toString();

    protected static boolean fastBuildMode = false;
    private volatile boolean generateData = true;
    private volatile boolean generateDataDone = false;

    private static final int BUILD_ROUND = 5;

    public void before() throws Exception {
        deployEnv();

        String fastModeStr = System.getProperty("fastBuildMode");
        if (fastModeStr != null && fastModeStr.equalsIgnoreCase("true")) {
            fastBuildMode = true;
            logger.info("Will use fast build mode");
        } else {
            logger.info("Will not use fast build mode");
        }

        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        jobService = ExecutableManager.getInstance(kylinConfig);
        scheduler = DefaultScheduler.createInstance();
        scheduler.init(new JobEngineConfig(kylinConfig), new ZookeeperJobLock());
        if (!scheduler.hasStarted()) {
            throw new RuntimeException("scheduler has not been started");
        }
        cubeManager = CubeManager.getInstance(kylinConfig);

        final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);
        final String factTable = cubeInstance.getRootFactTable();

        final StreamingManager streamingManager = StreamingManager.getInstance(kylinConfig);
        final StreamingConfig streamingConfig = streamingManager.getStreamingConfig(factTable);
        kafkaConfig = KafkaConfigManager.getInstance(kylinConfig).getKafkaConfig(streamingConfig.getName());

        String topicName = UUID.randomUUID().toString();
        String localIp = NetworkUtils.getLocalIp();
        BrokerConfig brokerConfig = kafkaConfig.getKafkaClusterConfigs().get(0).getBrokerConfigs().get(0);
        brokerConfig.setHost(localIp);
        kafkaConfig.setTopic(topicName);
        KafkaConfigManager.getInstance(kylinConfig).saveKafkaConfig(kafkaConfig);

        startEmbeddedKafka(topicName, brokerConfig);
    }

    private void startEmbeddedKafka(String topicName, BrokerConfig brokerConfig) {
        //Start mock Kafka
        String zkConnectionStr = ZookeeperUtil.getZKConnectString() + kafkaZkPath;
        System.out.println("zkConnectionStr: " + zkConnectionStr);
        zkConnection = new ZkConnection(zkConnectionStr);
        // Assert.assertEquals(ZooKeeper.States.CONNECTED, zkConnection.getZookeeperState());
        kafkaServer = new MockKafka(zkConnection, brokerConfig.getPort(), brokerConfig.getId());
        kafkaServer.start();

        kafkaServer.createTopic(topicName, 3, 1);
        kafkaServer.waitTopicUntilReady(topicName);

        MetadataResponse.TopicMetadata topicMetadata = kafkaServer.fetchTopicMeta(topicName);
        Assert.assertEquals(topicName, topicMetadata.topic());
    }

    protected void generateStreamData(long startTime, long endTime, int numberOfRecords) throws IOException {
        Kafka10DataLoader dataLoader = new Kafka10DataLoader(kafkaConfig);
        DeployUtil.prepareTestDataForStreamingCube(startTime, endTime, numberOfRecords, cubeName, dataLoader);
        logger.info("Test data inserted into Kafka");
    }

    protected void clearSegment(String cubeName) throws Exception {
        CubeInstance cube = cubeManager.getCube(cubeName);
        // remove all existing segments
        CubeUpdate cubeBuilder = new CubeUpdate(cube);
        cubeBuilder.setToRemoveSegs(cube.getSegments().toArray(new CubeSegment[cube.getSegments().size()]));
        cubeManager.updateCube(cubeBuilder);
    }

    public void build() throws Exception {
        clearSegment(cubeName);
        new Thread(new Runnable() {
            @Override
            public void run() {
                SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd");
                f.setTimeZone(TimeZone.getTimeZone("GMT"));
                long dateStart = 0;
                try {
                    dateStart = f.parse("2012-01-01").getTime();
                } catch (ParseException e) {
                    // not expected for a fixed, valid date string; fall back to epoch start
                }
                Random rand = new Random();
                while (generateData == true) {
                    long dateEnd = dateStart + 7 * 24 * 3600000;
                    try {
                        generateStreamData(dateStart, dateEnd, rand.nextInt(100));
                        dateStart = dateEnd;
                        sleep(rand.nextInt(rand.nextInt(30) + 1) * 1000L); // wait a random time before the next batch
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                generateDataDone = true;
            }
        }).start();
        ExecutorService executorService = Executors.newCachedThreadPool();

        List<FutureTask<ExecutableState>> futures = Lists.newArrayList();
        for (int i = 0; i < BUILD_ROUND; i++) {
            if (i == (BUILD_ROUND - 1)) {
                // stop generating message to kafka
                generateData = false;
                int waittime = 0;
                while (generateDataDone == false && waittime < 100) {
                    Thread.sleep(1000);
                    waittime++;
                }
                if (generateDataDone == false) {
                    throw new IllegalStateException("Timeout when wait all messages be sent to Kafka"); // ensure all messages have been flushed.
                }
            } else {
                Thread.sleep(30 * 1000); // wait for new messages
            }

            FutureTask<ExecutableState> futureTask = new FutureTask<>(new Callable<ExecutableState>() {
                @Override
                public ExecutableState call() {
                    ExecutableState result = null;
                    try {
                        result = buildSegment(cubeName, 0, Long.MAX_VALUE);
                    } catch (Exception e) {
                        // previous build hasn't been started, or other case.
                        e.printStackTrace();
                    }

                    return result;
                }
            });

            executorService.submit(futureTask);
            futures.add(futureTask);
        }

        generateData = false;
        executorService.shutdown();
        int succeedBuild = 0;
        for (int i = 0; i < futures.size(); i++) {
            ExecutableState result = futures.get(i).get(20, TimeUnit.MINUTES);
            logger.info("Checking building task " + i + " whose state is " + result);
            Assert.assertTrue(
                    result == null || result == ExecutableState.SUCCEED || result == ExecutableState.DISCARDED);
            if (result == ExecutableState.SUCCEED)
                succeedBuild++;
        }

        logger.info(succeedBuild + " build jobs have been successfully completed.");
        List<CubeSegment> segments = cubeManager.getCube(cubeName).getSegments(SegmentStatusEnum.READY);
        Assert.assertTrue(segments.size() == succeedBuild);

        if (fastBuildMode == false) {
            long endOffset = segments.get(segments.size() - 1).getSourceOffsetEnd();
            //merge
            ExecutableState result = mergeSegment(cubeName, 0, endOffset);
            Assert.assertTrue(result == ExecutableState.SUCCEED);

            segments = cubeManager.getCube(cubeName).getSegments();
            Assert.assertTrue(segments.size() == 1);

            CubeSegment toRefreshSeg = segments.get(0);

            refreshSegment(cubeName, toRefreshSeg.getSourceOffsetStart(), toRefreshSeg.getSourceOffsetEnd());
            segments = cubeManager.getCube(cubeName).getSegments();
            Assert.assertTrue(segments.size() == 1);
        }

        logger.info("Build is done");
    }

    private ExecutableState mergeSegment(String cubeName, long startOffset, long endOffset) throws Exception {
        CubeSegment segment = cubeManager.mergeSegments(cubeManager.getCube(cubeName), 0, 0, startOffset, endOffset,
                false);
        DefaultChainedExecutable job = EngineFactory.createBatchMergeJob(segment, "TEST");
        jobService.addJob(job);
        waitForJob(job.getId());
        return job.getStatus();
    }

    private String refreshSegment(String cubeName, long startOffset, long endOffset) throws Exception {
        CubeSegment segment = cubeManager.refreshSegment(cubeManager.getCube(cubeName), 0, 0, startOffset,
                endOffset);
        DefaultChainedExecutable job = EngineFactory.createBatchCubingJob(segment, "TEST");
        jobService.addJob(job);
        waitForJob(job.getId());
        return job.getId();
    }

    protected ExecutableState buildSegment(String cubeName, long startOffset, long endOffset) throws Exception {
        CubeInstance cubeInstance = cubeManager.getCube(cubeName);
        ISource source = SourceFactory.getSource(cubeInstance);
        SourcePartition partition = source.enrichSourcePartitionBeforeBuild(cubeInstance,
                new SourcePartition(0, 0, startOffset, endOffset, null, null));
        CubeSegment segment = cubeManager.appendSegment(cubeManager.getCube(cubeName), partition);
        DefaultChainedExecutable job = EngineFactory.createBatchCubingJob(segment, "TEST");
        jobService.addJob(job);
        waitForJob(job.getId());
        return job.getStatus();
    }

    protected void deployEnv() throws IOException {
        DeployUtil.overrideJobJarLocations();
        //                DeployUtil.initCliWorkDir();
        //                DeployUtil.deployMetadata();
    }

    public static void beforeClass() throws Exception {
        logger.info("Adding to classpath: " + new File(HBaseMetadataTestCase.SANDBOX_TEST_DATA).getAbsolutePath());
        ClassUtil.addClasspath(new File(HBaseMetadataTestCase.SANDBOX_TEST_DATA).getAbsolutePath());
        System.setProperty(KylinConfig.KYLIN_CONF, HBaseMetadataTestCase.SANDBOX_TEST_DATA);
        if (StringUtils.isEmpty(System.getProperty("hdp.version"))) {
            throw new RuntimeException(
                    "No hdp.version set; Please set hdp.version in your jvm option, for example: -Dhdp.version=2.4.0.0-169");
        }
        HBaseMetadataTestCase.staticCreateTestMetadata(HBaseMetadataTestCase.SANDBOX_TEST_DATA);
    }

    public void after() {
        kafkaServer.stop();
        cleanKafkaZkPath(kafkaZkPath);
        DefaultScheduler.destroyInstance();
    }

    private void cleanKafkaZkPath(String path) {
        RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3);
        CuratorFramework zkClient = CuratorFrameworkFactory.newClient(ZookeeperUtil.getZKConnectString(),
                retryPolicy);
        zkClient.start();

        try {
            zkClient.delete().deletingChildrenIfNeeded().forPath(path);
        } catch (Exception e) {
            logger.warn("Failed to delete zookeeper path: " + path, e);
        } finally {
            zkClient.close();
        }
    }

    protected void waitForJob(String jobId) {
        while (true) {
            AbstractExecutable job = jobService.getJob(jobId);
            if (job.getStatus() == ExecutableState.SUCCEED || job.getStatus() == ExecutableState.ERROR
                    || job.getStatus() == ExecutableState.DISCARDED) {
                break;
            } else {
                try {
                    Thread.sleep(5000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public void cleanup() throws Exception {
        cleanupOldStorage();
        HBaseMetadataTestCase.staticCleanupTestMetadata();
    }

    protected void cleanupOldStorage() throws Exception {
        String[] args = { "--delete", "true" };
        StorageCleanupJob cli = new StorageCleanupJob();
        cli.execute(args);
    }

    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        int exitCode = 0;

        BuildCubeWithStream buildCubeWithStream = null;
        try {
            beforeClass();
            buildCubeWithStream = new BuildCubeWithStream();
            buildCubeWithStream.before();
            buildCubeWithStream.build();
            logger.info("Build is done");

            buildCubeWithStream.after();
            buildCubeWithStream.cleanup();
            logger.info("Going to exit");
        } catch (Throwable e) {
            logger.error("error", e);
            exitCode = 1;
        }

        long millis = System.currentTimeMillis() - start;
        System.out.println("Time elapsed: " + (millis / 1000) + " sec - in " + BuildCubeWithStream.class.getName());

        System.exit(exitCode);
    }
}
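
For reference, the sketch below is not part of the file above; it only summarizes the lifecycle that main() drives, in case you want to call the class from your own code. The runner class name is an illustrative assumption, and beforeClass() additionally requires -Dhdp.version to be set on the JVM, as the code above enforces.

package org.apache.kylin.provision;

// Illustrative sketch only: mirrors the sequence in BuildCubeWithStream.main() above.
public class BuildCubeWithStreamRunner {
    public static void main(String[] args) throws Exception {
        // Optional: skip the merge/refresh steps that follow the incremental builds.
        System.setProperty("fastBuildMode", "true");

        BuildCubeWithStream.beforeClass();       // sets KYLIN_CONF; requires -Dhdp.version
        BuildCubeWithStream test = new BuildCubeWithStream();
        test.before();                           // load cube/Kafka config, start embedded Kafka
        test.build();                            // generate stream data, run BUILD_ROUND builds
        test.after();                            // stop Kafka, delete the temporary ZK path
        test.cleanup();                          // storage cleanup and metadata teardown
    }
}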