org.openspaces.grid.gsm.containers.DefaultContainersSlaEnforcementEndpoint.java Source code

Java tutorial

Introduction

Here is the source code for org.openspaces.grid.gsm.containers.DefaultContainersSlaEnforcementEndpoint.java

Source

/*******************************************************************************
 * 
 * Copyright (c) 2012 GigaSpaces Technologies Ltd. All rights reserved
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *  
 ******************************************************************************/
package org.openspaces.grid.gsm.containers;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.openspaces.admin.Admin;
import org.openspaces.admin.AdminException;
import org.openspaces.admin.gsa.GridServiceAgent;
import org.openspaces.admin.gsa.GridServiceAgents;
import org.openspaces.admin.gsc.GridServiceContainer;
import org.openspaces.admin.internal.admin.InternalAdmin;
import org.openspaces.admin.internal.gsa.InternalGridServiceAgent;
import org.openspaces.admin.internal.gsc.InternalGridServiceContainer;
import org.openspaces.admin.internal.support.InternalAgentGridComponent;
import org.openspaces.admin.pu.ProcessingUnit;
import org.openspaces.grid.esm.EsmSystemProperties;
import org.openspaces.grid.gsm.LogPerProcessingUnit;
import org.openspaces.grid.gsm.SingleThreadedPollingLog;
import org.openspaces.grid.gsm.capacity.CapacityRequirements;
import org.openspaces.grid.gsm.capacity.CapacityRequirementsPerAgent;
import org.openspaces.grid.gsm.capacity.MemoryCapacityRequirement;
import org.openspaces.grid.gsm.containers.exceptions.ContainerNotDiscoveredException;
import org.openspaces.grid.gsm.containers.exceptions.ContainersSlaEnforcementInProgressException;
import org.openspaces.grid.gsm.containers.exceptions.ContainersSlaEnforcementPendingProcessingUnitDeallocationException;
import org.openspaces.grid.gsm.containers.exceptions.FailedToStartNewGridServiceContainersException;

import com.gigaspaces.grid.gsa.AgentProcessDetails;

class DefaultContainersSlaEnforcementEndpoint implements ContainersSlaEnforcementEndpoint {

    /**
     * Seconds allowed for a new container to start before the attempt counts as a failure.
     * Enough time for the GSC also to register with the Lookup Service.
     * Read once, at class initialization, from the system property named by
     * {@code EsmSystemProperties.ESM_START_CONTAINER_TIMEOUT_FAILURE_SECONDS}.
     */
    private static final long START_CONTAINER_TIMEOUT_FAILURE_SECONDS = Long.getLong(
            EsmSystemProperties.ESM_START_CONTAINER_TIMEOUT_FAILURE_SECONDS,
            EsmSystemProperties.ESM_START_CONTAINER_TIMEOUT_FAILURE_SECONDS_DEFAULT);

    /**
     * Age, in seconds, after which a failed start attempt is forgotten: the start timeout plus
     * the value of the system property named by
     * {@code EsmSystemProperties.ESM_WAIT_BEFORE_START_CONTAINER_AGAIN_SECONDS}.
     */
    private static final long START_CONTAINER_TIMEOUT_FAILURE_FORGET_SECONDS = START_CONTAINER_TIMEOUT_FAILURE_SECONDS
            + Long.getLong(EsmSystemProperties.ESM_WAIT_BEFORE_START_CONTAINER_AGAIN_SECONDS,
                    EsmSystemProperties.ESM_WAIT_BEFORE_START_CONTAINER_AGAIN_SECONDS_DEFAULT);

    /** The processing unit whose containers this endpoint manages. */
    private final ProcessingUnit pu;

    /** Logger used for every message emitted by this endpoint. */
    private final Log logger;

    /** Enforcement state read and updated by this endpoint; assigned only in the constructor. */
    private final ContainersSlaEnforcementState state;

    public DefaultContainersSlaEnforcementEndpoint(ProcessingUnit pu, ContainersSlaEnforcementState state) {
        this.pu = pu;
        this.logger = new LogPerProcessingUnit(
                new SingleThreadedPollingLog(LogFactory.getLog(DefaultContainersSlaEnforcementEndpoint.class)), pu);
        this.state = state;
    }

    /**
     * Lists the containers in the zone of this processing unit, leaving out the ones
     * that are marked for deallocation.
     *
     * @return the containers matching the zone that are not marked for deallocation
     * @throws IllegalStateException if the endpoint has been destroyed
     */
    @Override
    public GridServiceContainer[] getContainers() {
        validateEndpointNotDestroyed(pu);

        final Admin admin = pu.getAdmin();
        final String zone = ContainersSlaUtils.getContainerZone(pu);
        final Collection<GridServiceContainer> candidates = ContainersSlaUtils.getContainersByZone(admin, zone);

        // containers awaiting deallocation are not offered to callers
        candidates.removeAll(state.getContainersMarkedForDeallocation(pu));

        final GridServiceContainer[] result = new GridServiceContainer[candidates.size()];
        return candidates.toArray(result);
    }

    /**
     * Tells whether at least one container of this processing unit is marked for deallocation.
     *
     * @return {@code true} if the set of containers marked for deallocation is not empty
     * @throws IllegalStateException if the endpoint has been destroyed
     */
    public boolean isContainersPendingDeallocation() throws ContainersSlaEnforcementInProgressException {
        validateEndpointNotDestroyed(pu);

        final boolean nothingMarked = state.getContainersMarkedForDeallocation(pu).isEmpty();
        return !nothingMarked;
    }

    /**
     * Runs one enforcement pass of the given policy for this processing unit.
     * <p>
     * The pass validates the endpoint and the policy, checks for containers that the agents
     * still report as unconfirmed-removed, and only then marks, kills and starts containers.
     *
     * @param sla the policy to enforce
     * @throws ContainersSlaEnforcementInProgressException if enforcement has not completed yet
     * @throws IllegalArgumentException if the policy is {@code null} or names the wrong zone
     * @throws IllegalStateException if the endpoint has been destroyed
     */
    @Override
    public void enforceSla(ContainersSlaPolicy sla) throws ContainersSlaEnforcementInProgressException {
        // preconditions first: nothing below runs against a destroyed endpoint or an invalid policy
        validateEndpointNotDestroyed(pu);
        validateSla(sla, pu);

        checkAllUndiscoveredContainersAreNotRunning(sla);
        enforceSlaInternal(sla);
    }

    /**
     * Verifies that the policy is usable for the given processing unit.
     *
     * @param sla the policy to check
     * @param pu the processing unit whose container zone the policy must name
     * @throws IllegalArgumentException if {@code sla} is {@code null}, or if the zones of its
     *         new-container configuration are anything other than exactly the container zone
     *         of {@code pu}
     */
    private static void validateSla(ContainersSlaPolicy sla, ProcessingUnit pu) {
        if (sla == null) {
            throw new IllegalArgumentException("sla cannot be null");
        }
        sla.validate();

        final String[] slaZones = sla.getNewContainerConfig().getZones();
        final String expectedZone = ContainersSlaUtils.getContainerZone(pu);

        final boolean namesExactlyExpectedZone = slaZones.length == 1 && slaZones[0].equals(expectedZone);
        if (namesExactlyExpectedZone) {
            return;
        }

        throw new IllegalArgumentException(
                "sla zone is " + Arrays.toString(slaZones) + " and instead it should be " + expectedZone);
    }

    /**
     * @return the processing unit this endpoint was created for
     */
    public ProcessingUnit getProcessingUnit() {
        return this.pu;
    }

    /**
     * Performs the enforcement steps in order: cleans up containers marked for shutdown and
     * completed start attempts, marks containers for deallocation, starts missing containers,
     * and finally reports whether work is still outstanding.
     *
     * @param sla the policy being enforced
     * @throws ContainersSlaEnforcementInProgressException if containers are still being started
     * @throws ContainersSlaEnforcementPendingProcessingUnitDeallocationException if containers
     *         are still marked for deallocation
     */
    private void enforceSlaInternal(final ContainersSlaPolicy sla)
            throws ContainersSlaEnforcementInProgressException {

        // clean up the outcome of earlier passes
        cleanContainersMarkedForShutdown(pu);
        cleanFutureContainers(sla);

        // reconcile running containers with the requested allocation
        markForDeallocationContainersOnUnallocatedMachines(sla);
        markForDeallocationContainersOnMachineWithAllocatedCapacityShortage(sla);
        startContainersOnMachineWithAllocatedCapacitySurplus(sla);

        final boolean containersStillStarting = state.getNumberOfFutureContainers(pu) > 0;
        if (containersStillStarting) {
            // Throwing here keeps the rebalancing enforcement from being invoked.
            // That is intended: we don't want to relocate instances to existing containers while
            // containers are still starting, because for the most part new containers are a
            // better choice for re-allocation.
            throw new ContainersSlaEnforcementInProgressException(pu, "Containers still being started.");
        }

        final boolean deallocationPending = !state.getContainersMarkedForDeallocation(pu).isEmpty();
        if (deallocationPending) {
            throw new ContainersSlaEnforcementPendingProcessingUnitDeallocationException(getProcessingUnit(),
                    state.getContainersMarkedForDeallocation(pu));
        }
    }

    /**
     * Marks for deallocation every container in this processing unit's zone whose agent has
     * no capacity allocated in the policy.
     *
     * @param sla the policy holding the per-agent capacity allocation
     */
    private void markForDeallocationContainersOnUnallocatedMachines(final ContainersSlaPolicy sla) {
        final Collection<String> agentUidsWithAllocation = sla.getClusterCapacityRequirements().getAgentUids();
        final String zone = ContainersSlaUtils.getContainerZone(pu);

        for (final GridServiceContainer gsc : ContainersSlaUtils.getContainersByZone(pu.getAdmin(), zone)) {
            final String hostingAgentUid = gsc.getGridServiceAgent().getUid();
            if (agentUidsWithAllocation.contains(hostingAgentUid)) {
                // the agent of this container still has an allocation
                continue;
            }

            if (logger.isInfoEnabled()) {
                logger.info("Grid Service Container " + ContainersSlaUtils.gscToString(gsc)
                        + " is marked for shutdown since there is no allocation for pu " + pu.getName()
                        + " on this machine. Machine is currently running "
                        + ContainersSlaUtils.gscsToString(
                                gsc.getMachine().getGridServiceContainers().getContainers()));
            }
            state.markContainerForDeallocation(pu, gsc);
        }
    }

    /**
     * Marks for deallocation the containers that no longer fit in the memory allocated on
     * their agent.
     * <p>
     * For each allocated agent, the containers in this processing unit's zone that are not
     * already marked are visited in the order returned by {@code ContainersSlaUtils}. Each one
     * consumes one container's worth of the agent's allocated memory; once the remainder is
     * smaller than a container, the remaining containers are marked for deallocation.
     *
     * @param sla the policy holding the per-agent capacity allocation and the container size
     */
    private void markForDeallocationContainersOnMachineWithAllocatedCapacityShortage(
            final ContainersSlaPolicy sla) {
        final Collection<String> allocatedAgentUids = sla.getClusterCapacityRequirements().getAgentUids();
        final String zone = ContainersSlaUtils.getContainerZone(pu);
        // mark for deallocation all containers that do not fit to the allocated memory on agent
        final Collection<GridServiceContainer> containersMarkedForDeallocation = state
                .getContainersMarkedForDeallocation(pu);
        // the container size does not depend on the agent, so it is read once
        final long containerMemoryInMB = sla.getNewContainerConfig().getMaximumMemoryCapacityInMB();
        for (final String agentUid : allocatedAgentUids) {
            final long allocatedMemory = getMemoryInMB(
                    sla.getClusterCapacityRequirements().getAgentCapacity(agentUid));
            long remainingAllocatedMemory = allocatedMemory;
            final List<GridServiceContainer> containersByZoneOnAgent = ContainersSlaUtils
                    .getContainersByZoneOnAgentUid(pu.getAdmin(), zone, agentUid);
            for (final GridServiceContainer container : containersByZoneOnAgent) {

                if (containersMarkedForDeallocation.contains(container)) {
                    // already marked; it does not consume allocated memory
                    continue;
                }

                if (remainingAllocatedMemory >= containerMemoryInMB) {
                    // guarded like the other log statements so the message is not built on every pass
                    if (logger.isDebugEnabled()) {
                        logger.debug("Grid Service Container " + ContainersSlaUtils.gscToString(container) + " "
                                + "is running and allocated for pu " + pu.getName());
                    }
                    remainingAllocatedMemory -= containerMemoryInMB;
                } else {
                    if (logger.isInfoEnabled()) {
                        logger.info("Grid Service Container " + ContainersSlaUtils.gscToString(container) + " "
                                + "is marked for shutdown since there is not enough memory allocated for pu "
                                + pu.getName() + " " + "on this machine. " + "Allocated memory "
                                + allocatedMemory + " " + "Containers on machine in zone " + zone + " "
                                + ContainersSlaUtils.gscsToString(containersByZoneOnAgent) + " "
                                + "All container on machine "
                                + ContainersSlaUtils.gscsToString(
                                        container.getMachine().getGridServiceContainers().getContainers())
                                // separator was missing, gluing the container list to the next label
                                + " " + "Cluster allocated capacity: "
                                + sla.getClusterCapacityRequirements().toDetailedString() + " "
                                + "Container memory in MB: " + containerMemoryInMB);
                    }
                    state.markContainerForDeallocation(pu, container);
                }
            }
        }
    }

    /**
     * Checks the allocated agents for containers in this processing unit's zone that the agent
     * still lists among its unconfirmed-removed grid components.
     *
     * @param sla the policy naming the allocated agents
     * @throws ContainerNotDiscoveredException for the first such container found
     * @throws IllegalStateException if an allocated agent is not discovered
     */
    private void checkAllUndiscoveredContainersAreNotRunning(final ContainersSlaPolicy sla)
            throws ContainerNotDiscoveredException {
        final CapacityRequirementsPerAgent allocation = sla.getClusterCapacityRequirements();
        final Collection<String> agentUidsWithAllocation = allocation.getAgentUids();
        final String zone = ContainersSlaUtils.getContainerZone(pu);
        final Admin admin = pu.getAdmin();

        for (final String agentUid : agentUidsWithAllocation) {
            final InternalGridServiceAgent agent = (InternalGridServiceAgent) admin.getGridServiceAgents()
                    .getAgentByUID(agentUid);
            if (agent == null) {
                throw new IllegalStateException("agent " + agentUid + " is not discovered");
            }

            for (final InternalAgentGridComponent component : agent.getUnconfirmedRemovedAgentGridComponents()) {
                if (!(component instanceof GridServiceContainer)) {
                    continue;
                }
                final GridServiceContainer container = (GridServiceContainer) component;
                if (!ContainersSlaUtils.isContainerMatchesZone(container, zone)) {
                    continue;
                }

                final ContainerNotDiscoveredException notDiscovered = new ContainerNotDiscoveredException(
                        getProcessingUnit(), container);
                if (logger.isDebugEnabled()) {
                    logger.debug("Admin API undiscovered container validation failed", notDiscovered);
                }
                throw notDiscovered;
            }
        }
    }

    /**
     * Starts containers on every allocated agent that has fewer containers than its allocated
     * memory calls for.
     * <p>
     * The number of containers to start on an agent is
     * {@code ceil(allocatedMemory / containerMemory)} minus the containers already running in
     * this processing unit's zone (not counting those marked for deallocation) minus the
     * containers whose start is still pending on that agent.
     * <p>
     * NOTE(review): {@code ceil} rounds a partial container up, whereas
     * {@code markForDeallocationContainersOnMachineWithAllocatedCapacityShortage} keeps only
     * containers that fit entirely in the allocated memory. Presumably allocations are always a
     * multiple of the container size; confirm against the code that builds the policy.
     *
     * @param sla the policy holding the per-agent capacity allocation and the container size
     * @throws IllegalStateException if an allocated agent is not discovered
     * @throws IllegalArgumentException if the container size of the policy is not positive
     */
    private void startContainersOnMachineWithAllocatedCapacitySurplus(final ContainersSlaPolicy sla) {
        final String zone = ContainersSlaUtils.getContainerZone(pu);
        final Collection<String> allocatedAgentUids = sla.getClusterCapacityRequirements().getAgentUids();
        final Collection<GridServiceContainer> containersMarkedForDeallocation = state
                .getContainersMarkedForDeallocation(pu);
        final Collection<FutureGridServiceContainer> futureContainers = state.getFutureContainers(pu);
        final GridServiceAgents agents = pu.getAdmin().getGridServiceAgents();
        // the container size does not depend on the agent, so it is read once
        final long containerMemoryInMB = sla.getNewContainerConfig().getMaximumMemoryCapacityInMB();

        for (final String agentUid : allocatedAgentUids) {

            final long allocatedMemory = getMemoryInMB(
                    sla.getClusterCapacityRequirements().getAgentCapacity(agentUid));

            int numberOfRunningContainers = 0;
            for (final GridServiceContainer container : ContainersSlaUtils
                    .getContainersByZoneOnAgentUid(pu.getAdmin(), zone, agentUid)) {
                if (!containersMarkedForDeallocation.contains(container)) {
                    numberOfRunningContainers++;
                }
            }

            int numberOfFutureContainers = 0;
            for (final FutureGridServiceContainer futureContainer : futureContainers) {
                if (futureContainer.getGridServiceAgent().getUid().equals(agentUid)) {
                    numberOfFutureContainers++;
                }
            }

            final GridServiceAgent agent = agents.getAgentByUID(agentUid);
            if (agent == null) {
                throw new IllegalStateException("agent " + agentUid + " is not discovered");
            }

            // Dividing by zero yields infinity, which the int cast below turns into
            // Integer.MAX_VALUE containers to start; reject a non-positive size instead.
            if (containerMemoryInMB <= 0) {
                throw new IllegalArgumentException(
                        "sla container memory must be positive but is " + containerMemoryInMB + " MB");
            }

            final int numberOfContainersToStart = (int) Math.ceil(1.0 * allocatedMemory / containerMemoryInMB)
                    - numberOfRunningContainers - numberOfFutureContainers;
            if (numberOfContainersToStart > 0) {

                if (logger.isInfoEnabled()) {
                    // separators were missing around the machine name and the '=' signs
                    logger.info("Starting " + numberOfContainersToStart + " containers on machine "
                            + ContainersSlaUtils.machineToString(agent.getMachine())
                            + " = ceil(allocatedMemory/containerMemory) - runningContainers - futureContainers = "
                            + "ceil(" + allocatedMemory + "/" + containerMemoryInMB + ") - "
                            + numberOfRunningContainers + " - " + numberOfFutureContainers + " = "
                            + numberOfContainersToStart);
                }

                for (int i = 0; i < numberOfContainersToStart; i++) {
                    startContainer(sla, agent);
                }
            }
        }
    }

    /**
     * Extracts the memory part of a capacity requirement.
     *
     * @param capacityRequirements the requirements to read
     * @return the memory requirement in megabytes
     */
    private long getMemoryInMB(CapacityRequirements capacityRequirements) {
        final MemoryCapacityRequirement memoryRequirement = capacityRequirements
                .getRequirement(new MemoryCapacityRequirement().getType());
        return memoryRequirement.getMemoryInMB();
    }

    /**
     * Starts one container asynchronously on the given agent and records the pending start
     * in the enforcement state.
     *
     * @param sla the policy supplying the configuration of the new container
     * @param gsa the agent that starts the container
     */
    private void startContainer(final ContainersSlaPolicy sla, final GridServiceAgent gsa) {
        final InternalAdmin admin = (InternalAdmin) pu.getAdmin();
        final InternalGridServiceAgent agent = (InternalGridServiceAgent) gsa;

        final FutureGridServiceContainer pendingStart = ContainersSlaUtils.startGridServiceContainerAsync(
                admin, agent, sla.getNewContainerConfig(), logger,
                START_CONTAINER_TIMEOUT_FAILURE_SECONDS, TimeUnit.SECONDS);

        state.addFutureContainer(pu, pendingStart);
    }

    /**
     * Drains the container start attempts that have completed.
     * <p>
     * A started container that is discovered is logged. A start that ended with a timeout, an
     * {@link AdminException} or an interruption is recorded in the state as a failed future and
     * reported to the caller, unless the policy is undeploying, in which case the failure is
     * only logged. Once every completed attempt has been drained without a reported failure,
     * the failed futures are cleaned up.
     *
     * @param sla the policy being enforced; consulted for {@code isUndeploying()}
     * @throws FailedToStartNewGridServiceContainersException if a start attempt failed and the
     *         policy is not undeploying
     * @throws IllegalStateException if a start attempt failed with an unexpected cause
     */
    private void cleanFutureContainers(ContainersSlaPolicy sla)
            throws FailedToStartNewGridServiceContainersException {

        for (FutureGridServiceContainer done = state.removeNextDoneFutureContainer(pu);
                done != null;
                done = state.removeNextDoneFutureContainer(pu)) {

            final Exception failure = findStartFailure(done);
            if (failure == null) {
                continue;
            }

            state.failedFutureContainer(done);
            final FailedToStartNewGridServiceContainersException startFailed =
                    new FailedToStartNewGridServiceContainersException(
                            done.getGridServiceAgent().getMachine(), pu, failure);

            if (!sla.isUndeploying()) {
                throw startFailed;
            }
            logger.info("Ignoring failure to start new container since undeploying.", startFailed);
        }

        cleanFailedFutureContainers();
    }

    /**
     * Resolves a completed start attempt.
     *
     * @param future the completed start attempt
     * @return {@code null} if the container was obtained, otherwise the expected failure
     * @throws IllegalStateException if the attempt failed with a cause other than a timeout,
     *         an {@link AdminException} or an interruption
     */
    private Exception findStartFailure(FutureGridServiceContainer future) {
        try {
            final GridServiceContainer started = future.get();
            if (started.isDiscovered()) {
                logger.info("Container started successfully " + ContainersSlaUtils.gscToString(started));
            }
            return null;
        } catch (ExecutionException e) {
            final Throwable cause = e.getCause();
            final boolean expectedCause = cause instanceof TimeoutException
                    || cause instanceof AdminException
                    || cause instanceof InterruptedException;
            if (!expectedCause) {
                // anything else is surfaced wrapped, with the original exception as its cause
                throw new IllegalStateException("Unexpected Exception when starting a new container.", e);
            }
            return e;
        } catch (TimeoutException e) {
            return e;
        }
    }

    /**
     * Processes the containers marked for deallocation.
     * <p>
     * A container that is no longer discovered is un-marked. A container that still reports
     * processing unit instances is left alone. Any other container gets an admin operation
     * scheduled that re-checks for processing unit instances and, if there are none, kills it.
     *
     * @param pu the processing unit whose marked containers are processed
     */
    private void cleanContainersMarkedForShutdown(ProcessingUnit pu) {

        for (final GridServiceContainer container : state.getContainersMarkedForDeallocation(pu)) {

            if (!container.isDiscovered()) {
                // container kill completed
                if (logger.isDebugEnabled()) {
                    logger.debug("Container " + ContainersSlaUtils.gscToString(container)
                            + " has shutdown. Un-marking it from containers marked for shutdown list.");
                }
                state.unmarkForShutdownContainer(pu, container);
            } else if (container.getProcessingUnitInstances().length > 0) {
                // cannot kill container since it still has pu instances on it.
                // The message lists every container in the grid, so it is only built when needed.
                if (logger.isDebugEnabled()) {
                    logger.debug("Cannot kill container " + ContainersSlaUtils.gscToString(container)
                            + " since there are still processing unit instances running."
                            + " An instance of this container may be awaiting relocation until more gsc's are available. current running gsc's are : "
                            + ContainersSlaUtils.gscsToString(
                                    getProcessingUnit().getAdmin().getGridServiceContainers().getContainers()));
                }
            } else {
                // kill container
                ((InternalAdmin) pu.getAdmin()).scheduleAdminOperation(new Runnable() {
                    public void run() {
                        // re-check when the operation runs; the earlier check was made when it was scheduled
                        boolean hasProcessingUnitInstances;
                        try {
                            hasProcessingUnitInstances = ((InternalGridServiceContainer) container)
                                    .hasProcessingUnitInstances();
                        } catch (AdminException e) {
                            logger.info("Cannot determine number of processing unit instances running on container "
                                    + ContainersSlaUtils.gscToString(container), e);
                            return;
                        }

                        if (hasProcessingUnitInstances) {
                            if (logger.isDebugEnabled()) {
                                logger.debug("Processing unit instances in container "
                                        + ContainersSlaUtils.gscToString(container) + " are shutting down. "
                                        + "Suspect instance uids:"
                                        + Arrays.toString(((InternalGridServiceContainer) container)
                                                .getUnconfirmedRemovedProcessingUnitInstancesUid()));
                            }
                        } else {
                            logger.info("Killing container " + ContainersSlaUtils.gscToString(container)
                                    + " since it is not running any processing unit instances.");
                            try {
                                container.kill();
                            } catch (AdminException e) {
                                logger.info("Cannot kill container " + ContainersSlaUtils.gscToString(container),
                                        e);
                            } catch (IllegalArgumentException e) {
                                //GsaImpl throws IllegalArgumentException instead of AdminException in case the process no longer exists
                                logger.info("Cannot kill container " + ContainersSlaUtils.gscToString(container),
                                        e);
                            }
                        }
                    }
                });
            }
        }
    }

    /**
     * Guards the public operations of this endpoint.
     *
     * @param pu the processing unit to check
     * @throws IllegalArgumentException if {@code pu} is {@code null}
     * @throws IllegalStateException if the state reports the processing unit as destroyed
     */
    private void validateEndpointNotDestroyed(ProcessingUnit pu) {
        if (pu == null) {
            throw new IllegalArgumentException("pu cannot be null");
        }

        final boolean destroyed = state.isProcessingUnitDestroyed(pu);
        if (destroyed) {
            throw new IllegalStateException("endpoint destroyed");
        }
    }

    /**
     * Reviews the failed start attempts held in the state.
     * <p>
     * A failure whose agent is no longer discovered is forgotten at once. Otherwise a possible
     * orphan container of the failed attempt is terminated, and the failure is forgotten once
     * it is older than {@code START_CONTAINER_TIMEOUT_FAILURE_FORGET_SECONDS}.
     */
    private void cleanFailedFutureContainers() {

        for (final FutureGridServiceContainer failed : state.getFailedFutureContainers()) {

            final long elapsedMillis = System.currentTimeMillis() - failed.getTimestamp().getTime();
            final int passedSeconds = (int) (elapsedMillis / 1000);
            final GridServiceAgent agent = failed.getGridServiceAgent();

            if (agent.isDiscovered()) {
                terminateOrphanContainersOfAgent(agent, failed);
                if (passedSeconds <= START_CONTAINER_TIMEOUT_FAILURE_FORGET_SECONDS) {
                    // still too recent to forget
                    continue;
                }
                logger.info("Forgetting failure to start container on machine "
                        + ContainersSlaUtils.machineToString(agent.getMachine()) + " that occurred "
                        + passedSeconds + " seconds ago due to timeout.");
            } else {
                logger.info("Forgetting failure to start container on machine "
                        + ContainersSlaUtils.machineToString(agent.getMachine()) + " that occurred " + passedSeconds
                        + " seconds ago since grid service agent no longer exists.");
            }

            state.removeFailedFuture(failed);
        }
    }

    /**
     * Kills the process started by a failed start attempt when it is still running on the agent
     * but is unknown to everyone else.
     * <p>
     * The agent ids of the agent's "gsc" processes are collected, then the ids of the agent's
     * discovered containers and of the started pending futures on that agent are removed. If
     * the agent id of the suspected future is still in the set, a kill by agent id is scheduled
     * as an admin operation.
     *
     * @param agent the agent on which the failed start was attempted
     * @param suspectedFuture the failed start attempt whose process may be orphaned
     */
    private void terminateOrphanContainersOfAgent(final GridServiceAgent agent,
            final FutureGridServiceContainer suspectedFuture) {
        final int suspectedOrphanContainerAgentId;
        try {
            suspectedOrphanContainerAgentId = suspectedFuture.getAgentId();
        } catch (TimeoutException e) {
            return; // container not even started
        } catch (ExecutionException e) {
            return; // container not even started
        }

        final Set<Integer> agentIds = new HashSet<Integer>();
        // add all agent's containers process ids.
        for (final AgentProcessDetails processDetails : agent.getProcessesDetails()) {
            // case-insensitive match that does not depend on the default locale
            if ("gsc".equalsIgnoreCase(processDetails.getServiceType())) {
                agentIds.add(processDetails.getAgentId());
            }
        }
        // remove all agent's containers process ids that registered with lus.
        for (final GridServiceContainer container : agent.getAdmin().getGridServiceContainers()) {
            // agent is the receiver so that a container without an agent (null) is skipped
            // instead of raising a NullPointerException
            if (agent.equals(container.getGridServiceAgent())) {
                agentIds.remove(container.getAgentId());
            }
        }

        // remove the process ids of starts that are still pending on this agent
        for (final FutureGridServiceContainer future : state.getFutureContainers()) {
            if (agent.equals(future.getGridServiceAgent()) && future.isStarted()) {
                try {
                    agentIds.remove(future.getAgentId());
                } catch (ExecutionException ignored) {
                    // the agent id is unavailable, so there is nothing to remove
                } catch (TimeoutException ignored) {
                    // the agent id is unavailable, so there is nothing to remove
                }
            }
        }

        if (agentIds.contains(suspectedOrphanContainerAgentId)) {
            ((InternalAdmin) pu.getAdmin()).scheduleAdminOperation(new Runnable() {
                public void run() {

                    try {
                        agent.killByAgentId(suspectedOrphanContainerAgentId);
                        logger.warn(
                                "Terminated orphan container that did not register with lookup service on machine "
                                        + ContainersSlaUtils.machineToString(agent.getMachine()) + " agentId="
                                        + suspectedOrphanContainerAgentId);
                    } catch (final AdminException e) {
                        logger.warn(
                                "Error terminating orphan container that did not register with lookup service on machine "
                                        + ContainersSlaUtils.machineToString(agent.getMachine()) + " agentId="
                                        + suspectedOrphanContainerAgentId,
                                e);
                    }

                }
            });
        }
    }

}