org.apache.hadoop.distributedloadsimulator.sls.nodemanager.NMSimulator.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.distributedloadsimulator.sls.nodemanager.NMSimulator.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.distributedloadsimulator.sls.nodemanager;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.distributedloadsimulator.sls.SLSRunner;

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.NodeAction;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.api.records.NodeStatus;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.Records;

import org.apache.hadoop.distributedloadsimulator.sls.scheduler.ContainerSimulator;
import org.apache.hadoop.distributedloadsimulator.sls.scheduler.TaskRunner;
import org.apache.hadoop.distributedloadsimulator.sls.utils.SLSUtils;
import org.apache.hadoop.yarn.client.ConfiguredLeastLoadedRMFailoverHAProxyProvider;
import org.apache.hadoop.yarn.client.RMFailoverProxyProvider;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.api.ServerRMProxy;

public class NMSimulator extends TaskRunner.Task {

    private ResourceTracker resourceTracker;
    // node resource
    private RMNode node;
    // master key
    private MasterKey masterKey;
    private MasterKey containerMasterKey;
    // containers with various STATE
    private List<ContainerId> completedContainerList;
    private List<ContainerId> releasedContainerList;
    private DelayQueue<ContainerSimulator> containerQueue;
    private Map<ContainerId, ContainerSimulator> runningContainers;
    private List<ContainerId> amContainerList;
    // resource manager
    private ResourceManager rm;
    private AtomicInteger totalHeartBeat = new AtomicInteger(0);
    private AtomicInteger trueHeartBeat = new AtomicInteger(0);
    // heart beat response id
    private int RESPONSE_ID = 0;
    private final static Log LOG = LogFactory.getLog(NMSimulator.class);
    private boolean isFirstBeat = true;
    private AtomicInteger usedResources = new AtomicInteger(0);

    public void init(String nodeIdStr, int memory, int cores, int dispatchTime, int heartBeatInterval,
            ResourceManager rm, Configuration conf) throws IOException, YarnException, ClassNotFoundException {
        super.init(dispatchTime, dispatchTime + 1000000L * heartBeatInterval, heartBeatInterval);
        conf.setClass(YarnConfiguration.LEADER_CLIENT_FAILOVER_PROXY_PROVIDER,
                ConfiguredLeastLoadedRMFailoverHAProxyProvider.class, RMFailoverProxyProvider.class);
        Class<? extends RMFailoverProxyProvider> defaultProviderClass = (Class<? extends RMFailoverProxyProvider>) Class
                .forName(YarnConfiguration.DEFAULT_LEADER_CLIENT_FAILOVER_PROXY_PROVIDER);
        this.resourceTracker = ServerRMProxy.createRMProxy(conf, ResourceTracker.class,
                conf.getBoolean(YarnConfiguration.DISTRIBUTED_RM, YarnConfiguration.DEFAULT_DISTRIBUTED_RM));
        // create resource
        String rackHostName[] = SLSUtils.getRackHostName(nodeIdStr);
        this.node = NodeInfo.newNodeInfo(rackHostName[0], rackHostName[1], BuilderUtils.newResource(memory, cores));
        //this.nodeId = NodeId.newInstance(InetAddress.getLocalHost().getHostName(),port);
        //this.rm = rm;
        // init data structures
        completedContainerList = Collections.synchronizedList(new ArrayList<ContainerId>());
        releasedContainerList = Collections.synchronizedList(new ArrayList<ContainerId>());
        containerQueue = new DelayQueue<ContainerSimulator>();
        amContainerList = Collections.synchronizedList(new ArrayList<ContainerId>());
        runningContainers = new ConcurrentHashMap<ContainerId, ContainerSimulator>();
        // register NM with RM
        RegisterNodeManagerRequest req = Records.newRecord(RegisterNodeManagerRequest.class);
        req.setNodeId(node.getNodeID());
        req.setResource(node.getTotalCapability());
        req.setHttpPort(80);
        LOG.info("send registration request " + node.getNodeID());
        RegisterNodeManagerResponse response = resourceTracker.registerNodeManager(req);
        LOG.info("registration done " + node.getNodeID());
        masterKey = response.getNMTokenMasterKey();
        containerMasterKey = response.getContainerTokenMasterKey();
    }

    @Override
    public void firstStep() throws YarnException, IOException {
        // do nothing
    }

    public int getTotalHeartBeat() {
        return totalHeartBeat.get();
    }

    public int getTotalTrueHeartBeat() {
        return trueHeartBeat.get();
    }

    static AtomicLong hbduration = new AtomicLong(0);
    static AtomicInteger nbhb = new AtomicInteger(0);

    private long startXP = 0;
    private int nbHb = 0;
    private int nbTrueHb = 0;
    private long last = 0;
    private long totalInterval = 0;
    private long totalHBDuration = 0;

    @Override
    public void middleStep() throws YarnException, IOException {
        if (startXP == 0) {
            startXP = System.currentTimeMillis();
            last = System.currentTimeMillis();
        }
        // we check the lifetime for each running containers
        ContainerSimulator cs = null;
        synchronized (completedContainerList) {
            while ((cs = containerQueue.poll()) != null) {
                runningContainers.remove(cs.getId());
                completedContainerList.add(cs.getId());
                usedResources.decrementAndGet();
            }
        }

        // send heart beat
        NodeHeartbeatRequest beatRequest = Records.newRecord(NodeHeartbeatRequest.class);
        beatRequest.setLastKnownNMTokenMasterKey(masterKey);
        beatRequest.setLastKnownContainerTokenMasterKey(containerMasterKey);
        NodeStatus ns = Records.newRecord(NodeStatus.class);

        ns.setContainersStatuses(generateContainerStatusList());
        ns.setNodeId(node.getNodeID());
        ns.setKeepAliveApplications(new ArrayList<ApplicationId>());
        ns.setResponseId(RESPONSE_ID);
        ns.setNodeHealthStatus(NodeHealthStatus.newInstance(true, "", System.currentTimeMillis()));
        beatRequest.setNodeStatus(ns);
        // only first time , this NM thread will update the beat start sec
        if (isFirstBeat) {
            SLSRunner.measureFirstBeat();
            isFirstBeat = false;
        }
        long start = System.currentTimeMillis();
        try {
            NodeHeartbeatResponse beatResponse = resourceTracker.nodeHeartbeat(beatRequest);
            long duration = System.currentTimeMillis() - start;
            long totalDuration = hbduration.addAndGet(duration);
            int totalNbHb = nbhb.incrementAndGet();
            long avg = totalDuration / totalNbHb;
            nbHb++;
            long theoric = (System.currentTimeMillis() - startXP) / 1000;
            float percentHb = (float) nbHb / theoric;
            long dif = nbHb - theoric;
            long interval = System.currentTimeMillis() - last;
            totalHBDuration += duration;
            totalInterval += interval;
            long avgHBDuration = totalHBDuration / nbHb;
            long avgInterval = totalInterval / nbHb;
            //      if(percentHb<0.97){
            //        LOG.error("this node is running behind: " + node.getNodeID() + " : " + percentHb);
            //      }

            if (interval > 1500) {
                LOG.debug("this hb was too slow: " + node.getNodeID() + " : " + RESPONSE_ID + " " + interval + " "
                        + duration);
            }
            last = System.currentTimeMillis();
            if (duration > 1000) {
                LOG.error(node.getNodeID() + " hb duration: " + duration + " avg: " + avg);
            }
            totalHeartBeat.incrementAndGet();
            if (beatResponse.getNextheartbeat()) {
                trueHeartBeat.incrementAndGet();
                nbTrueHb++;
            }
            float percentTrueHb = (float) nbTrueHb / theoric;
            LOG.debug("percent hb " + node.getNodeID() + " " + percentHb + ", truehb " + percentTrueHb);
            if (!beatResponse.getContainersToCleanup().isEmpty()) {
                // remove from queue
                synchronized (releasedContainerList) {
                    for (ContainerId containerId : beatResponse.getContainersToCleanup()) {
                        if (amContainerList.contains(containerId)) {
                            // AM container (not killed?, only release)
                            synchronized (amContainerList) {
                                amContainerList.remove(containerId);
                            }
                        } else {
                            cs = runningContainers.remove(containerId);
                            usedResources.decrementAndGet();
                            if (cs != null) {
                                LOG.error("in the simulated scenario the container should not be"
                                        + "freed until they have completed their task");
                                throw new YarnException("the failover should be transparent");
                            }
                            containerQueue.remove(cs);
                            releasedContainerList.add(containerId);
                        }
                    }
                }
            }
            if (beatResponse.getNodeAction() == NodeAction.SHUTDOWN) {
                LOG.error("we should not get a sutdown: " + node.getNodeID() + " message: "
                        + beatResponse.getDiagnosticsMessage());
                throw new YarnException("there should be no shutdown " + beatResponse.getDiagnosticsMessage());
                //      lastStep();
            } else if (beatResponse.getNodeAction() == NodeAction.RESYNC) {
                LOG.error("the failover should be transparent, node: " + node.getNodeID() + " message: "
                        + beatResponse.getDiagnosticsMessage());
                throw new YarnException(
                        "the failover should be transparent " + beatResponse.getDiagnosticsMessage());
            }
            if (beatResponse.getContainerTokenMasterKey() != null) {
                masterKey = beatResponse.getContainerTokenMasterKey();
            }
            if (beatResponse.getContainerTokenMasterKey() != null) {
                containerMasterKey = beatResponse.getContainerTokenMasterKey();
            }
            RESPONSE_ID = beatResponse.getResponseId();
        } catch (Exception ex) {
            if (ex instanceof InterruptedException) {
                LOG.warn(ex, ex);
            } else if (ex instanceof IOException) {
                throw (IOException) ex;
            } else {
                LOG.error(ex, ex);
                throw new YarnException(ex);
            }
        }
    }

    @Override
    public void lastStep() {
        LOG.info("Last step for nodemanager " + node.getNodeID());
    }

    /**
     * catch status of all containers located on current node
     */
    private ArrayList<ContainerStatus> generateContainerStatusList() {
        ArrayList<ContainerStatus> csList = new ArrayList<ContainerStatus>();
        // add running containers
        for (ContainerSimulator container : runningContainers.values()) {
            csList.add(newContainerStatus(container.getId(), ContainerState.RUNNING, ContainerExitStatus.SUCCESS));
        }
        synchronized (amContainerList) {
            for (ContainerId cId : amContainerList) {
                csList.add(newContainerStatus(cId, ContainerState.RUNNING, ContainerExitStatus.SUCCESS));
            }
        }
        int completed = 0;
        // add complete containers
        synchronized (completedContainerList) {
            for (ContainerId cId : completedContainerList) {
                csList.add(newContainerStatus(cId, ContainerState.COMPLETE, ContainerExitStatus.SUCCESS));
            }
            completed += completedContainerList.size();
            completedContainerList.clear();
        }
        // released containers
        synchronized (releasedContainerList) {
            for (ContainerId cId : releasedContainerList) {
                csList.add(newContainerStatus(cId, ContainerState.COMPLETE, ContainerExitStatus.ABORTED));
            }
            completed += releasedContainerList.size();
            if (releasedContainerList.size() > 0) {
                LOG.debug(node.getNodeID() + "aborted containers " + releasedContainerList.size());
            }
            releasedContainerList.clear();
            if (completed > 0) {
                LOG.debug(node.getNodeID() + " completed containers " + completed);
            }
        }
        return csList;
    }

    private ContainerStatus newContainerStatus(ContainerId cId, ContainerState state, int exitState) {
        ContainerStatus cs = Records.newRecord(ContainerStatus.class);
        cs.setContainerId(cId);
        cs.setState(state);
        cs.setExitStatus(exitState);
        return cs;
    }

    public RMNode getNode() {
        return node;
    }

    /**
     * launch a new container with the given life time
     *
     * @param container
     * @param lifeTimeMS
     */
    public void addNewContainer(Container container, long lifeTimeMS) {
        if (lifeTimeMS != -1) {
            // normal container
            ContainerSimulator cs = new ContainerSimulator(container.getId(), container.getResource(),
                    lifeTimeMS + System.currentTimeMillis(), lifeTimeMS);
            containerQueue.add(cs);
            runningContainers.put(cs.getId(), cs);
            usedResources.incrementAndGet();
        } else {
            // AM container
            // -1 means AMContainer
            synchronized (amContainerList) {
                amContainerList.add(container.getId());
            }
        }
    }

    /**
     * clean up an AM container and add to completed list
     *
     * @param containerId id of the container to be cleaned
     */
    public void cleanupContainer(ContainerId containerId) {
        synchronized (amContainerList) {
            amContainerList.remove(containerId);
        }
        synchronized (completedContainerList) {
            completedContainerList.add(containerId);
        }
    }

    public int getUsedResources() {
        return usedResources.get();
    }
}