org.apache.helix.taskexecution.TaskExecutionDemo.java Source code

Introduction

Here is the source code for org.apache.helix.taskexecution.TaskExecutionDemo.java. The class demonstrates executing a DAG of analytics tasks over impression and click events on an Apache Helix cluster, using the "OnlineOffline" state model and a pool of workers.
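
Before the full listing, here is a minimal sketch of the core API the demo exercises (the ZooKeeper address, node names, and parallelism values are illustrative assumptions, not taken from the demo): each Dag.Node declares an ID, a desired parallelism, and a comma-separated list of parent IDs, and the assembled Dag is submitted through TaskCluster.

    // Minimal sketch; assumes ZooKeeper is already reachable at localhost:2199.
    import org.apache.helix.taskexecution.Dag;
    import org.apache.helix.taskexecution.Dag.Node;
    import org.apache.helix.taskexecution.TaskCluster;

    public class SubmitDagSketch {
        public static void main(String[] args) throws Exception {
            TaskCluster taskCluster =
                    new TaskCluster("localhost:2199", TaskCluster.DEFAULT_CLUSTER_NAME);
            taskCluster.setup(); // create the Helix cluster for the tasks

            Dag dag = new Dag();
            dag.addNode(new Node("filter", 4, ""));     // root node: empty parent list
            dag.addNode(new Node("join", 2, "filter")); // runs once "filter" is Online
            taskCluster.submitDag(dag);                 // one Helix resource per node
        }
    }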

Source

package org.apache.helix.taskexecution;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.File;
import java.util.Random;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;

import org.I0Itec.zkclient.IDefaultNameSpace;
import org.I0Itec.zkclient.ZkClient;
import org.I0Itec.zkclient.ZkServer;
import org.apache.commons.io.FileUtils;
import org.apache.helix.HelixManager;
import org.apache.helix.controller.HelixControllerMain;
import org.apache.helix.taskexecution.Dag.Node;

/**
 * Demo for executing a task DAG using primitives provided by Helix. The demo sets up a DAG of
 * tasks that computes analytics over impression and click events. Each node in the DAG has an ID
 * and declares its desired parallelism and the IDs of the nodes it depends on. When the DAG is
 * submitted for execution via {@link TaskCluster#submitDag(Dag)}, a Helix resource is created for
 * each node, with the number of partitions determined by the desired parallelism, using the
 * "OnlineOffline" state model.
 * The demo starts NUM_WORKERS workers. Each worker is given a {@link TaskFactory} and a
 * {@link TaskResultStore}. Helix assigns each worker a set of task partitions and sends it state
 * transition messages for those partitions. When a worker receives a state transition message for
 * a task partition, it first checks that all upstream dependencies are satisfied, i.e. that the
 * corresponding partitions have transitioned to the "Online" state. It then creates a Task object
 * through the TaskFactory based on the resource name (since task IDs are mapped to resource
 * names).
 * The demo has the following steps:
 * <OL>
 * <LI>Start ZooKeeper</LI>
 * <LI>Set up the task cluster {@link TaskCluster}</LI>
 * <LI>Start the Helix controller</LI>
 * <LI>Populate dummy impression and click data</LI>
 * <LI>Start the workers</LI>
 * <LI>Submit the DAG</LI>
 * </OL>
 */
public class TaskExecutionDemo {

    private static final int NUM_WORKERS = 10;
    private static final int NUM_IMP_EVENTS = 10000;

    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.err.println("USAGE: java TaskExecutionDemo zkPort redisHost redisPort");
            System.exit(1);
        }

        String redisHost = args[1];
        int redisPort = Integer.parseInt(args[2]);
        ZkServer server = null;
        try {
            String baseDir = "/tmp/TaskExecutionDemo/";
            final String dataDir = baseDir + "zk/dataDir";
            final String logDir = baseDir + "zk/logDir";
            // Start from clean ZooKeeper data and log directories.
            FileUtils.deleteDirectory(new File(dataDir));
            FileUtils.deleteDirectory(new File(logDir));

            IDefaultNameSpace defaultNameSpace = new IDefaultNameSpace() {
                @Override
                public void createDefaultNameSpace(ZkClient zkClient) {
                    // no default namespace is needed for this demo
                }
            };

            int zkPort = Integer.parseInt(args[0]);
            // Start an embedded ZooKeeper server for the demo.
            server = new ZkServer(dataDir, logDir, defaultNameSpace, zkPort);
            server.start();

            String zkAddr = "localhost:" + zkPort;
            String clusterName = TaskCluster.DEFAULT_CLUSTER_NAME;

            TaskCluster taskCluster = new TaskCluster(zkAddr, clusterName);
            taskCluster.setup();

            startController(zkAddr, clusterName);

            TaskFactory taskFactory = new AnalyticsTaskFactory();
            // Task results (impression and click lists, aggregated counts) live in Redis.
            TaskResultStore taskResultStore = new RedisTaskResultStore(redisHost, redisPort, 1000);

            populateDummyData(taskResultStore);

            startWorkers(zkAddr, clusterName, taskFactory, taskResultStore);

            Dag dag = getAnalyticsDag();
            taskCluster.submitDag(dag);
        } catch (Exception e) {
            e.printStackTrace();
            throw e;
        } finally {
            if (server != null) {
                // server.shutdown() is left commented out: the workers keep running
                // after main() returns and still need ZooKeeper to execute the DAG.
                // server.shutdown();
            }
        }
    }

    private static void populateDummyData(TaskResultStore taskResultStore) throws Exception {
        float fraudProbability = 0.01f; // ~1% of events are marked fraudulent
        float clickProbability = 0.01f; // ~1% of impressions generate a click
        int numImps = NUM_IMP_EVENTS;
        Random rand = new Random();
        String[] countries = { "US", "CANADA", "UK", "CHINA", "UNKNOWN" };
        String[] genders = { "M", "F", "UNKNOWN" };
        // Each impression event is a CSV record: "eventId,isFraudulent,country,gender".
        for (int i = 0; i < numImps; i++) {
            boolean isFraudulent = (rand.nextFloat() <= fraudProbability);
            String impEventId = "" + Math.abs(rand.nextLong());
            String impEvent = impEventId; // event id
            impEvent += "," + isFraudulent;
            impEvent += "," + countries[rand.nextInt(countries.length)];
            impEvent += "," + genders[rand.nextInt(genders.length)];
            taskResultStore.rpush(FilterTask.IMPRESSIONS, impEvent);

            // A click event references the impression that caused it:
            // "eventId,isFraudulent,impressionEventId".
            boolean isClick = (rand.nextFloat() <= clickProbability);
            if (isClick) {
                String clickEvent = "" + Math.abs(rand.nextLong()); // event id
                isFraudulent = (rand.nextFloat() <= fraudProbability);
                clickEvent += "," + isFraudulent;
                clickEvent += "," + impEventId;
                taskResultStore.rpush(FilterTask.CLICKS, clickEvent);
            }
        }
        System.out.println("Done populating dummy data");
    }

    private static void startController(String zkAddr, String clusterName) throws Exception {
        // Start a standalone Helix controller; it computes partition assignments and
        // fires the OnlineOffline state transitions that drive the workers.
        final HelixManager manager = HelixControllerMain.startHelixController(zkAddr, clusterName, null,
                HelixControllerMain.STANDALONE);

        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                System.out.println("Shutting down cluster manager: " + manager.getInstanceName());
                manager.disconnect();
            }
        });
    }

    private static void startWorkers(String zkAddr, String clusterName, TaskFactory taskFactory,
            TaskResultStore taskResultStore) {
        int numWorkers = NUM_WORKERS;
        // Run each worker on its own thread; Helix assigns each worker a set of task
        // partitions and sends it state transition messages for those partitions.
        Executor executor = Executors.newFixedThreadPool(numWorkers);

        for (int i = 0; i < numWorkers; i++) {
            Worker worker = new Worker(zkAddr, clusterName, "" + i, taskFactory, taskResultStore);
            executor.execute(worker);
        }
    }

    private static Dag getAnalyticsDag() {
        // Node(id, desiredParallelism, commaSeparatedParentIds); an empty parent list
        // marks a root node with no upstream dependencies.
        Dag dag = new Dag();
        dag.addNode(new Node("filterImps", 10, ""));
        dag.addNode(new Node("filterClicks", 5, ""));
        dag.addNode(new Node("impClickJoin", 10, "filterImps,filterClicks"));
        dag.addNode(new Node("impCountsByGender", 10, "filterImps"));
        dag.addNode(new Node("impCountsByCountry", 10, "filterImps"));
        dag.addNode(new Node("clickCountsByGender", 5, "impClickJoin"));
        dag.addNode(new Node("clickCountsByCountry", 5, "impClickJoin"));

        dag.addNode(new Node("report", 1,
                "impCountsByGender,impCountsByCountry,clickCountsByGender,clickCountsByCountry"));

        return dag;
    }

}
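
The demo is launched with a ZooKeeper port plus a Redis host and port, per the USAGE string in main(); for example (the port numbers here are illustrative, and a Redis server must already be running at the given address, since RedisTaskResultStore connects to it):

    java org.apache.helix.taskexecution.TaskExecutionDemo 2199 localhost 6379

ZooKeeper itself is started by the demo on the given port, so only Redis needs to be provisioned beforehand.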