hws.core.JobMaster.java Source code

Introduction

Here is the source code for hws.core.JobMaster.java. JobMaster is the YARN ApplicationMaster of Hadoop-Watershed: it requests one container for every filter instance in a ModulePipeline, launches hws.core.InstanceDriver in each allocated container, and coordinates filter start/finish/halt signaling through ZooKeeper znodes under /hadoop-watershed/<appId>/.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package hws.core;

import java.io.File;
import java.io.PrintWriter;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;

import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;

import org.I0Itec.zkclient.ZkClient;
import org.I0Itec.zkclient.IZkChildListener;
import org.I0Itec.zkclient.exception.ZkInterruptedException;

import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.StringUtils;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.LocatedFileStatus;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
import org.apache.hadoop.yarn.api.records.*;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.NMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.util.Records;
import org.apache.hadoop.yarn.util.Apps;
import org.apache.hadoop.yarn.util.ConverterUtils;

import hws.core.info.ModuleInfo;
import hws.core.info.FilterInfo;
import hws.core.info.ChannelInfo;
import hws.core.info.StubInfo;
import hws.core.info.InstanceInfo;
import hws.core.info.ModulePipeline;

import hws.util.Json;
import hws.util.Logger;

/**
 * This class implements a simple asynchronous application master.
 * In real usage, the callbacks should execute in a separate thread or thread pool.
 */
public class JobMaster implements AMRMClientAsync.CallbackHandler {
    private Configuration configuration;
    private NMClient nmClient;
    private String appIdStr;
    private ModulePipeline modulePipeline;
    private Map<String, InstanceInfo.Builder> instances;
    private Map<String, IZkChildListener> finishListeners;
    private Map<String, List<String>> haltedProducers;

    private String zksArgs;
    private String[] zkServers;
    private ZkClient zk;
    //TODO 
    private int numContainersToWaitFor;
    private int currentModuleIndex = 0;

    public JobMaster(ModulePipeline modulePipeline, String appIdStr, String zksArgs, String[] zkServers) {
        this.numContainersToWaitFor = 0; //TODO remove

        configuration = new YarnConfiguration();
        this.appIdStr = appIdStr;
        this.modulePipeline = modulePipeline;
        nmClient = NMClient.createNMClient();
        nmClient.init(configuration);
        nmClient.start();
        this.instances = this.modulePipeline.instances();
        this.finishListeners = new ConcurrentHashMap<String, IZkChildListener>();
        this.haltedProducers = new ConcurrentHashMap<String, List<String>>();
        this.zksArgs = zksArgs;
        this.zkServers = zkServers;
        //Logger setup
        try {
            FSDataOutputStream writer = FileSystem.get(configuration)
                    .create(new Path("hdfs:///hws/apps/" + appIdStr + "/logs/jobMaster.log"));
            Logger.addOutputStream(writer);
        } catch (IOException e) {
            Logger.severe("Could not create the HDFS log file for the job master: " + e);
        }

        zk = new ZkClient(zkServers[0]); //TODO choose the ZooKeeper server
    }

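    /**
     * For each newly allocated container: picks the next instance of the current
     * module (advancing to the next module once all of its instances are built),
     * serializes its InstanceInfo as Base64-encoded JSON, localizes the
     * YARN-Watershed and application jars from HDFS, and launches
     * hws.core.InstanceDriver inside the container. Once the last instance of a
     * module has been launched, the module's "start" znode is created in ZooKeeper.
     */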
    public void onContainersAllocated(List<Container> containers) {
        FileSystem fs = null;
        try {
            fs = FileSystem.get(getConfiguration());
        } catch (IOException e) {
            Logger.severe(e.toString());
            return; //without a FileSystem nothing below can be localized; avoid NullPointerExceptions
        }
        for (Container container : containers) {
            try {
                //PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("/home/yarn/rcor/yarn/app-master-log.out")));
                Logger.info("Selecting instance to container: " + container.getId().toString());
                //dado o container, escolher a instancia que tem dado de entrada mais perto daquele container
                InstanceInfo instanceInfo = null;
                if (instances.get(modulePipeline.get(currentModuleIndex).filterInfo().name())
                        .instancesBuilt() >= modulePipeline.get(currentModuleIndex).numFilterInstances()) {
                    currentModuleIndex++;
                }
                if (currentModuleIndex < modulePipeline.size()) {
                    instanceInfo = instances.get(modulePipeline.get(currentModuleIndex).filterInfo().name())
                            .build();
                } else
                    break;

                String instanceInfoBase64 = Base64
                        .encodeBase64String(StringUtils.getBytesUtf8(Json.dumps(instanceInfo)))
                        .replaceAll("\\s", "");
                // Launch the container by creating a ContainerLaunchContext
                ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class);
                ctx.setCommands(Collections.singletonList(
                        "$JAVA_HOME/bin/java -Xmx256M hws.core.InstanceDriver --load " + instanceInfoBase64
                                + " -aid " + this.appIdStr + " -cid " + container.getId().toString() + " "
                                + this.zksArgs + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
                                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr"));

                Logger.info("Listing YARN-Watershed files for app-id: " + this.appIdStr);
                RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("hdfs:///hws/bin/"), false);
                Map<String, LocalResource> resources = new HashMap<String, LocalResource>();
                Logger.info("Setup YARN-Watershed files as resources");
                while (files.hasNext()) {
                    LocatedFileStatus fileStatus = files.next();
                    // Set up the jar as a local resource for the worker container
                    LocalResource containerJar = Records.newRecord(LocalResource.class);
                    ContainerUtils.setupContainerJar(fs, fileStatus.getPath(), containerJar);
                    resources.put(fileStatus.getPath().getName(), containerJar);
                }

                Logger.info("Listing application files for app-id: " + this.appIdStr);
                files = fs.listFiles(new Path("hdfs:///hws/apps/" + this.appIdStr + "/"), false);
                Logger.info("Setup application files as resources");
                while (files.hasNext()) {
                    LocatedFileStatus fileStatus = files.next();
                    // Set up the jar as a local resource for the worker container
                    LocalResource containerJar = Records.newRecord(LocalResource.class);
                    ContainerUtils.setupContainerJar(fs, fileStatus.getPath(), containerJar);
                    resources.put(fileStatus.getPath().getName(), containerJar);
                }
                Logger.info("container resource setup");
                ctx.setLocalResources(resources);

                Logger.info("Environment setup");
                // Set up the CLASSPATH and environment for the worker container
                Map<String, String> containerEnv = new HashMap<String, String>();
                ContainerUtils.setupContainerEnv(containerEnv, getConfiguration());
                ctx.setEnvironment(containerEnv);
                Logger.info("Starting containers");

                Logger.info("[AM] Launching container " + container.getId());
                nmClient.startContainer(container, ctx);
                Logger.info("Container started!");
                /*String znode = "/hadoop-watershed/"+this.appIdStr+"/"+instanceInfo.filterInfo().name()+"/"+instanceInfo.instanceId();
                out.println("Saving instance znode: "+znode);
                out.flush();
                zk.createPersistent(znode, "");
                zk.createPersistent(znode+"/host", container.getNodeId().getHost());
                out.println("saved location: "+container.getNodeId().getHost());
                out.flush();
                */
                if (instances.get(modulePipeline.get(currentModuleIndex).filterInfo().name())
                        .instancesBuilt() >= modulePipeline.get(currentModuleIndex).numFilterInstances()) {
                    Logger.info("Starting via ZooKeeper filter: " + instanceInfo.filterInfo().name());
                    zk.createPersistent("/hadoop-watershed/" + this.appIdStr + "/"
                            + instanceInfo.filterInfo().name() + "/start", "");
                }
                //out.close();
            } catch (Exception e) {
                Logger.severe("[AM] Error launching container " + container.getId() + " " + e);
            }
        }
        try {
            fs.close();
        } catch (IOException e) {
            Logger.severe(e.toString());
        }
    }

    public void onContainersCompleted(List<ContainerStatus> statuses) {
        for (ContainerStatus status : statuses) {
            Logger.info("[AM] Completed container " + status.getContainerId());
            synchronized (this) {
                numContainersToWaitFor--;
            }
        }
    }

    public void onNodesUpdated(List<NodeReport> updated) {
    }

    public void onReboot() {
    }

    public void onShutdownRequest() {
    }

    public void onError(Throwable t) {
    }

    public float getProgress() {
        return 0;
    }

    public boolean doneWithContainers() {
        return numContainersToWaitFor == 0;
    }

    public Configuration getConfiguration() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        Options options = new Options();
        options.addOption(OptionBuilder.withLongOpt("app-id").withDescription("String of the Application Id ")
                .hasArg().withArgName("AppId").create("aid"));
        options.addOption(OptionBuilder.withLongOpt("load").withDescription("load module pipeline").hasArg()
                .withArgName("Json-Base64").create());
        options.addOption(OptionBuilder.withLongOpt("remove").withDescription("remove modules").hasArgs()
                .withArgName("ModuleNames").create("rm"));
        options.addOption(OptionBuilder.withLongOpt("zk-servers").withDescription("List of the ZooKeeper servers")
                .hasArgs().withArgName("zkAddrs").create("zks"));
        CommandLineParser parser = new BasicParser();
        CommandLine cmd = parser.parse(options, args);

        String appIdStr = null;
        String modulePipelineBase64 = null;
        String modulePipelineJson = null;
        ModulePipeline modulePipeline = null;
        String[] moduleNames = null;
        if (cmd.hasOption("aid")) {
            appIdStr = cmd.getOptionValue("aid");
        }
        String zksArgs = "";
        String[] zkServers = null;
        if (cmd.hasOption("zks")) {
            zksArgs = "-zks";
            zkServers = cmd.getOptionValues("zks");
            for (String zks : zkServers) {
                zksArgs += " " + zks;
            }
        }
        if (cmd.hasOption("load")) {
            modulePipelineBase64 = cmd.getOptionValue("load");
            modulePipelineJson = StringUtils.newStringUtf8(Base64.decodeBase64(modulePipelineBase64));
            modulePipeline = Json.loads(modulePipelineJson, ModulePipeline.class);
        } else if (cmd.hasOption("rm")) {
            moduleNames = cmd.getOptionValues("rm");
        }

        JobMaster master = new JobMaster(modulePipeline, appIdStr, zksArgs, zkServers);

        if (modulePipelineJson != null) {
            Logger.info("Module Pipeline: " + modulePipelineJson);
            Logger.info("Instances: " + Json.dumps(modulePipeline.instances()));
        }
        master.runMainLoop();
    }

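    /**
     * Creates a ZooKeeper child listener for a filter's ".../finish" znode.
     * Once all instances of the filter have registered a child there, the filter
     * is recorded as a halted producer for each of its output channels; when
     * every producer bound to a channel has halted, a ".../halted/<channelName>"
     * znode is created under each consumer of that channel, and the job latch
     * is counted down for this filter.
     */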
    public IZkChildListener createFinishListener(final String filterName, final int numFilterInstances,
            final CountDownLatch doneLatch) {
        IZkChildListener childListener = new IZkChildListener() {
            private String _filterName = filterName;
            private int _numFilterInstances = numFilterInstances;
            private CountDownLatch _doneLatch = doneLatch;

            public void handleChildChange(String parentPath, List<String> currentChilds) throws Exception {
                if (currentChilds.size() == _numFilterInstances) {
                    //TODO if all producers for an input port have halted, send a signal znode (/hadoop-watershed/appId/consumerFilter/halted/channelName) to all consumers
                    try {
                        Logger.info("Halting producer filter " + _filterName);
                        Logger.info(parentPath);
                        StringBuilder children = new StringBuilder();
                        for (String child : currentChilds) {
                            children.append(child).append(";");
                        }
                        Logger.info(children.toString());

                        for (String channelName : modulePipeline.get(_filterName).outputChannels().keySet()) {
                            if (!haltedProducers.containsKey(channelName)) {
                                haltedProducers.put(channelName, new ArrayList<String>());
                            }
                            Logger.info("Halting producer for channel: " + channelName);
                            haltedProducers.get(channelName).add(_filterName);
                            Logger.info("Halted producers: " + Json.dumps(haltedProducers));
                            if (modulePipeline.outputBindings() == null
                                    || modulePipeline.outputBindings().get(channelName) == null) {
                                Logger.info("No output bindings for channel: " + channelName);
                                continue; //guard against a NullPointerException below
                            }
                            Logger.info("Output bindings size: "
                                    + modulePipeline.outputBindings().get(channelName).size());
                            Logger.info("Output bindings: "
                                    + Json.dumps(modulePipeline.outputBindings().get(channelName)));
                            if (haltedProducers.get(channelName).size() == modulePipeline.outputBindings()
                                    .get(channelName).size()) {
                                Logger.info("All producers halted");
                                //signal every consumer of this channel that all of its producers have halted
                                if (modulePipeline.inputBindings() == null
                                        || modulePipeline.inputBindings().get(channelName) == null) {
                                    Logger.info("No input bindings for channel: " + channelName);
                                } else {
                                    for (String consumerName : modulePipeline.inputBindings().get(channelName)) {
                                        Logger.info("Sending halt signal to consumer: " + consumerName);
                                        zk.createPersistent("/hadoop-watershed/" + appIdStr + "/" + consumerName
                                                + "/halted/" + channelName, "");
                                    }
                                }
                            }
                        }
                        Logger.info("Producer halted: " + _filterName);
                        _doneLatch.countDown();
                    } catch (Exception e) {
                        Logger.severe(e.toString());
                    }
                }
            }
        };
        this.finishListeners.put(filterName, childListener);
        return childListener;
    }

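    /**
     * Registers with the ResourceManager, creates the per-filter coordination
     * znodes, subscribes the finish listeners, requests one container per filter
     * instance (128 MB, 1 vcore each), waits until every filter has finished,
     * then marks the job as done and unregisters.
     *
     * ZooKeeper layout used for coordination:
     *   /hadoop-watershed/<appId>/<filter>                   per-filter root
     *   /hadoop-watershed/<appId>/<filter>/start             created once all instances are launched
     *   /hadoop-watershed/<appId>/<filter>/finish            instances register here on completion
     *   /hadoop-watershed/<appId>/<filter>/halted/<channel>  "all producers halted" signal to consumers
     *   /hadoop-watershed/<appId>/done                       whole job finished
     */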
    public void runMainLoop() throws Exception {

        AMRMClientAsync<ContainerRequest> rmClient = AMRMClientAsync.createAMRMClientAsync(100, this);
        rmClient.init(getConfiguration());
        rmClient.start();

        // Register with ResourceManager
        Logger.info("[AM] registerApplicationMaster 0");
        rmClient.registerApplicationMaster("", 0, "");
        Logger.info("[AM] registerApplicationMaster 1");

        // Priority for worker containers - priorities are intra-application
        Priority priority = Records.newRecord(Priority.class);
        priority.setPriority(0);

        // Resource requirements for worker containers
        Resource capability = Records.newRecord(Resource.class);
        capability.setMemory(128);
        capability.setVirtualCores(1);

        final CountDownLatch doneLatch = new CountDownLatch(this.modulePipeline.size()); //one count per module, released by its finish listener
        // Make container requests to ResourceManager
        for (ModuleInfo moduleInfo : this.modulePipeline) { //create containers for each instance of each module
            zk.createPersistent("/hadoop-watershed/" + this.appIdStr + "/" + moduleInfo.filterInfo().name(), "");
            zk.createPersistent(
                    "/hadoop-watershed/" + this.appIdStr + "/" + moduleInfo.filterInfo().name() + "/finish", "");
            zk.createPersistent(
                    "/hadoop-watershed/" + this.appIdStr + "/" + moduleInfo.filterInfo().name() + "/halted", "");
            zk.subscribeChildChanges(
                    "/hadoop-watershed/" + this.appIdStr + "/" + moduleInfo.filterInfo().name() + "/finish",
                    createFinishListener(moduleInfo.filterInfo().name(), moduleInfo.numFilterInstances(),
                            doneLatch));
            for (int i = 0; i < moduleInfo.numFilterInstances(); i++) {
                this.numContainersToWaitFor++;
                ContainerRequest containerAsk = new ContainerRequest(capability, null, null, priority);
                Logger.info("[AM] Making res-req for " + moduleInfo.filterInfo().name() + " " + i);
                rmClient.addContainerRequest(containerAsk);
            }
        }
        //TODO: process for starting the whole application
        //create containers
        // -> create instances
        // -> start output channels and filters
        // -> start input channels in reverse topological order (assuming there is no cycle)
        //    * if there is a cycle, then initially start them in any order
        //TODO "send" the start signal via ZooKeeper

        Logger.info("[AM] waiting for containers to finish");
        try {
            doneLatch.await(); //wait until every filter in the pipeline has finished
        } catch (InterruptedException e) {
            Logger.fatal(e.toString());
            //e.printStackTrace();
        }
        /*while(!doneWithContainers()) {
        Thread.sleep(50);
        }*/

        zk.createPersistent("/hadoop-watershed/" + appIdStr + "/done", "");

        Logger.info("[AM] unregisterApplicationMaster 0");
        // Un-register with ResourceManager
        rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "", "");
        Logger.info("[AM] unregisterApplicationMaster 1");
    }
}
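
Usage

For reference, main() expects the module pipeline as Base64-encoded JSON (--load), the application id (-aid), and the ZooKeeper servers (-zks). The sketch below shows how a submitting client might prepare these arguments; the pipeline JSON, application id, and ZooKeeper address are placeholders, not values from the original source.

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.StringUtils;

public class JobMasterLaunchSketch {
    public static void main(String[] args) {
        // Placeholder: a JSON encoding of hws.core.info.ModulePipeline.
        String pipelineJson = "{\"modules\":[]}";
        // JobMaster decodes this with Base64.decodeBase64 and StringUtils.newStringUtf8.
        String pipelineBase64 = Base64.encodeBase64String(StringUtils.getBytesUtf8(pipelineJson))
                .replaceAll("\\s", ""); //strip line breaks, as JobMaster does for instance infos
        // In a real deployment these arguments are rendered into the AM container's
        // launch command rather than passed to JobMaster.main() directly.
        System.out.println("$JAVA_HOME/bin/java hws.core.JobMaster"
                + " --load " + pipelineBase64
                + " -aid application_1400000000000_0001" //placeholder application id
                + " -zks zkhost:2181");                  //placeholder ZooKeeper address
    }
}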