org.apache.brooklyn.entity.group.DynamicClusterImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.brooklyn.entity.group.DynamicClusterImpl.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.brooklyn.entity.group;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;

import javax.annotation.Nullable;

import org.apache.brooklyn.api.entity.Entity;
import org.apache.brooklyn.api.entity.EntitySpec;
import org.apache.brooklyn.api.entity.Group;
import org.apache.brooklyn.api.location.Location;
import org.apache.brooklyn.api.location.MachineProvisioningLocation;
import org.apache.brooklyn.api.location.NoMachinesAvailableException;
import org.apache.brooklyn.api.mgmt.Task;
import org.apache.brooklyn.api.policy.Policy;
import org.apache.brooklyn.api.sensor.AttributeSensor;
import org.apache.brooklyn.core.config.Sanitizer;
import org.apache.brooklyn.core.config.render.RendererHints;
import org.apache.brooklyn.core.effector.Effectors;
import org.apache.brooklyn.core.entity.Entities;
import org.apache.brooklyn.core.entity.EntityPredicates;
import org.apache.brooklyn.core.entity.factory.EntityFactory;
import org.apache.brooklyn.core.entity.factory.EntityFactoryForLocation;
import org.apache.brooklyn.core.entity.lifecycle.Lifecycle;
import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic;
import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic.ServiceProblemsLogic;
import org.apache.brooklyn.core.entity.trait.Resizable;
import org.apache.brooklyn.core.entity.trait.Startable;
import org.apache.brooklyn.core.entity.trait.StartableMethods;
import org.apache.brooklyn.core.location.Locations;
import org.apache.brooklyn.core.location.cloud.AvailabilityZoneExtension;
import org.apache.brooklyn.core.sensor.Sensors;
import org.apache.brooklyn.entity.stock.DelegateEntity;
import org.apache.brooklyn.feed.function.FunctionFeed;
import org.apache.brooklyn.feed.function.FunctionPollConfig;
import org.apache.brooklyn.util.collections.MutableList;
import org.apache.brooklyn.util.collections.MutableMap;
import org.apache.brooklyn.util.collections.QuorumCheck.QuorumChecks;
import org.apache.brooklyn.util.core.flags.TypeCoercions;
import org.apache.brooklyn.util.core.task.DynamicTasks;
import org.apache.brooklyn.util.core.task.TaskTags;
import org.apache.brooklyn.util.core.task.Tasks;
import org.apache.brooklyn.util.exceptions.Exceptions;
import org.apache.brooklyn.util.exceptions.ReferenceWithError;
import org.apache.brooklyn.util.guava.Maybe;
import org.apache.brooklyn.util.javalang.JavaClassNames;
import org.apache.brooklyn.util.javalang.Reflections;
import org.apache.brooklyn.util.text.StringPredicates;
import org.apache.brooklyn.util.text.Strings;
import org.apache.brooklyn.util.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.reflect.TypeToken;

/**
 * A cluster of entities that can dynamically increase or decrease the number of entities.
 */
public class DynamicClusterImpl extends AbstractGroupImpl implements DynamicCluster {

    /**
     * Sensor holding the supplier that hands out the ID number for the next member to be added.
     * It is populated by {@code initialiseMemberId()} when no value has been set yet.
     */
    @SuppressWarnings("serial")
    private static final AttributeSensor<Supplier<Integer>> NEXT_CLUSTER_MEMBER_ID = Sensors
            .newSensor(new TypeToken<Supplier<Integer>>() {
            }, "next.cluster.member.id", "Returns the ID number of the next member to be added");

    /**
     * Feed polling {@code CLUSTER_ONE_AND_ALL_MEMBERS_UP}; built in {@code connectAllMembersUp()}
     * and stopped at the end of {@code stop()}.
     */
    private volatile FunctionFeed clusterOneAndAllMembersUp;

    // TODO better mechanism for arbitrary class name to instance type coercion
    static {
        // String -> NodePlacementStrategy: the string is handed to Reflections.invokeConstructorWithArgs
        // together with the class loader of NodePlacementStrategy.
        TypeCoercions.registerAdapter(String.class, NodePlacementStrategy.class,
                new Function<String, NodePlacementStrategy>() {
                    @Override
                    public NodePlacementStrategy apply(final String input) {
                        Optional<NodePlacementStrategy> created = Reflections
                                .<NodePlacementStrategy>invokeConstructorWithArgs(
                                        NodePlacementStrategy.class.getClassLoader(), input);
                        if (!created.isPresent()) {
                            throw new IllegalStateException("Failed to create NodePlacementStrategy " + input);
                        }
                        return created.get();
                    }
                });

        // String -> ZoneFailureDetector: same approach, using the class loader of ZoneFailureDetector.
        TypeCoercions.registerAdapter(String.class, ZoneFailureDetector.class,
                new Function<String, ZoneFailureDetector>() {
                    @Override
                    public ZoneFailureDetector apply(final String input) {
                        Optional<ZoneFailureDetector> created = Reflections
                                .<ZoneFailureDetector>invokeConstructorWithArgs(
                                        ZoneFailureDetector.class.getClassLoader(), input);
                        if (!created.isPresent()) {
                            throw new IllegalStateException("Failed to create ZoneFailureDetector " + input);
                        }
                        return created.get();
                    }
                });
    }

    // Registers a renderer hint for each of FIRST and CLUSTER: a named action labelled "Open"
    // whose URL is supplied by DelegateEntity.EntityUrl.entityUrl().
    static {
        RendererHints.register(FIRST,
                RendererHints.namedActionWithUrl("Open", DelegateEntity.EntityUrl.entityUrl()));
        RendererHints.register(CLUSTER,
                RendererHints.namedActionWithUrl("Open", DelegateEntity.EntityUrl.entityUrl()));
    }

    /** Logger for this class. */
    private static final Logger LOG = LoggerFactory.getLogger(DynamicClusterImpl.class);

    /**
     * Mutex for synchronizing during re-size operations.
     * Sub-classes should use with great caution, to not introduce deadlocks!
     * <p>
     * The lock object is a zero-length {@code Object[]} rather than a plain {@code Object}.
     * NOTE(review): presumably chosen so that the lock object is serializable - confirm.
     */
    protected final Object mutex = new Object[0];

    /**
     * Removal strategy used when {@code REMOVAL_STRATEGY} is not configured
     * (see {@code getRemovalStrategy()}).
     * <p>
     * Given the candidate entities, returns the one to remove, or {@code null} if no
     * candidate is selected (for example when the collection is empty).
     */
    private static final Function<Collection<Entity>, Entity> defaultRemovalStrategy = new Function<Collection<Entity>, Entity>() {
        @Override
        public Entity apply(Collection<Entity> contenders) {
            /*
             * Choose the newest entity (largest cluster member ID or latest timestamp) that is stoppable.
             * If none are stoppable, take the newest non-stoppable.
             *
             * Both cluster member ID and timestamp must be taken into consideration to account for legacy
             * clusters that were created before the addition of the cluster member ID config value.
             */
            // Running "best so far" values; -1 and 0 are below any real member ID / creation time
            // NOTE(review): assumes member IDs are non-negative and creation times positive - confirm.
            int largestClusterMemberId = -1;
            long newestTime = 0L;
            Entity newest = null;

            for (Entity contender : contenders) {
                // May be null for members without a CLUSTER_MEMBER_ID config value.
                Integer contenderClusterMemberId = contender.config().get(CLUSTER_MEMBER_ID);
                long contenderCreationTime = contender.getCreationTime();

                // "Newer" if it beats the best ID seen so far, or the best creation time seen so far.
                boolean newer = (contenderClusterMemberId != null
                        && contenderClusterMemberId > largestClusterMemberId) || contenderCreationTime > newestTime;

                // Take the contender when it is a newer Startable, or when the current pick is not
                // Startable (including no pick yet) and the contender is either Startable or newer.
                if ((contender instanceof Startable && newer)
                        || (!(newest instanceof Startable) && ((contender instanceof Startable) || newer))) {
                    newest = contender;

                    // Only overwrite the tracked ID when this contender actually has one.
                    if (contenderClusterMemberId != null)
                        largestClusterMemberId = contenderClusterMemberId;
                    newestTime = contenderCreationTime;
                }
            }

            return newest;
        }
    };

    /**
     * Supplier of sequential member IDs, starting at zero.
     * Each call to {@link #get()} returns the current value and then advances the counter.
     */
    private static class NextClusterMemberIdSupplier implements Supplier<Integer> {
        /** Counter backing the supplier; the reference is never reassigned, so it is final. */
        private final AtomicInteger nextId = new AtomicInteger(0);

        /**
         * @return the next unused ID; the first call returns 0
         */
        @Override
        public Integer get() {
            return nextId.getAndIncrement();
        }
    }

    /** No-argument constructor; it does no work itself (set-up happens in {@code init()}). */
    public DynamicClusterImpl() {
    }

    /**
     * Initialises the cluster: runs the superclass initialisation, then ensures the
     * member-ID supplier sensor is set and builds the feed for {@code CLUSTER_ONE_AND_ALL_MEMBERS_UP}.
     */
    @Override
    public void init() {
        super.init();
        initialiseMemberId();
        connectAllMembersUp();
    }

    /**
     * Ensures the {@code NEXT_CLUSTER_MEMBER_ID} sensor holds a supplier, creating a fresh
     * {@link NextClusterMemberIdSupplier} only when the sensor currently has no value.
     * The check-and-set is done while holding {@link #mutex}.
     */
    private void initialiseMemberId() {
        synchronized (mutex) {
            boolean supplierAlreadySet = sensors().get(NEXT_CLUSTER_MEMBER_ID) != null;
            if (supplierAlreadySet) {
                return;
            }
            sensors().set(NEXT_CLUSTER_MEMBER_ID, new NextClusterMemberIdSupplier());
        }
    }

    /**
     * Builds the feed that publishes {@code CLUSTER_ONE_AND_ALL_MEMBERS_UP} and stores it in
     * {@link #clusterOneAndAllMembersUp}. The feed is configured with a five-second period, and
     * maps any exception from the callable to {@code false}.
     */
    private void connectAllMembersUp() {
        Callable<Boolean> allUpCheck = new ClusterOneAndAllMembersUpCallable(this);
        Duration pollPeriod = Duration.FIVE_SECONDS;

        clusterOneAndAllMembersUp = FunctionFeed.builder()
                .entity(this)
                .period(pollPeriod)
                .poll(new FunctionPollConfig<Boolean, Boolean>(CLUSTER_ONE_AND_ALL_MEMBERS_UP)
                        .onException(Functions.constant(Boolean.FALSE))
                        .callable(allUpCheck))
                .build();
    }

    /**
     * Computes whether a group is "up" in the strict sense: it has at least one member, its
     * {@code SERVICE_STATE_ACTUAL} is {@code RUNNING}, and every member reports {@code SERVICE_UP} as true.
     */
    private static class ClusterOneAndAllMembersUpCallable implements Callable<Boolean> {

        private final Group cluster;

        public ClusterOneAndAllMembersUpCallable(Group cluster) {
            this.cluster = cluster;
        }

        /**
         * @return {@code true} only if the group is non-empty, running, and all members are up
         */
        @Override
        public Boolean call() throws Exception {
            boolean hasMembers = !cluster.getMembers().isEmpty();
            if (!hasMembers) {
                return false;
            }

            boolean running = Lifecycle.RUNNING == cluster.sensors().get(SERVICE_STATE_ACTUAL);
            if (!running) {
                return false;
            }

            for (Entity member : cluster.getMembers()) {
                // A missing (null) SERVICE_UP value counts as "not up".
                boolean memberUp = Boolean.TRUE.equals(member.sensors().get(SERVICE_UP));
                if (!memberUp) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * Sets the initial {@code SERVICE_UP} value and installs the enrichers.
     * <p>
     * When no {@code UP_QUORUM_CHECK} has been configured and {@code INITIAL_SIZE} is zero, the
     * quorum check is set to "at least one unless empty" and the cluster starts out as up;
     * otherwise it starts out as not up.
     */
    @Override
    protected void initEnrichers() {
        boolean emptyClusterCountsAsUp = config().getRaw(UP_QUORUM_CHECK).isAbsent() && getConfig(INITIAL_SIZE) == 0;
        if (emptyClusterCountsAsUp) {
            // if initial size is 0 then override up check to allow zero if empty
            config().set(UP_QUORUM_CHECK, QuorumChecks.atLeastOneUnlessEmpty());
        }
        sensors().set(SERVICE_UP, emptyClusterCountsAsUp);

        super.initEnrichers();

        // override previous enricher so that only members are checked
        ServiceStateLogic.newEnricherFromChildrenUp()
                .checkMembersOnly()
                .requireUpChildren(getConfig(UP_QUORUM_CHECK))
                .addTo(this);
    }

    /**
     * Stores the given removal strategy under {@code REMOVAL_STRATEGY}.
     *
     * @param val the strategy; must not be null
     * @throws NullPointerException if {@code val} is null
     */
    @Override
    public void setRemovalStrategy(Function<Collection<Entity>, Entity> val) {
        Function<Collection<Entity>, Entity> strategy = checkNotNull(val, "removalStrategy");
        config().set(REMOVAL_STRATEGY, strategy);
    }

    /**
     * @return the configured {@code REMOVAL_STRATEGY}, or {@code defaultRemovalStrategy} when none is configured
     */
    protected Function<Collection<Entity>, Entity> getRemovalStrategy() {
        Function<Collection<Entity>, Entity> configured = getConfig(REMOVAL_STRATEGY);
        if (configured == null) {
            return defaultRemovalStrategy;
        }
        return configured;
    }

    /**
     * Stores the given strategy under {@code ZONE_PLACEMENT_STRATEGY}.
     *
     * @param val the strategy; must not be null
     * @throws NullPointerException if {@code val} is null
     */
    @Override
    public void setZonePlacementStrategy(NodePlacementStrategy val) {
        NodePlacementStrategy strategy = checkNotNull(val, "zonePlacementStrategy");
        config().set(ZONE_PLACEMENT_STRATEGY, strategy);
    }

    /**
     * @return the configured {@code ZONE_PLACEMENT_STRATEGY}
     * @throws NullPointerException if no strategy is configured
     */
    protected NodePlacementStrategy getZonePlacementStrategy() {
        NodePlacementStrategy strategy = getConfig(ZONE_PLACEMENT_STRATEGY);
        return checkNotNull(strategy, "zonePlacementStrategy config");
    }

    /**
     * Stores the given detector under {@code ZONE_FAILURE_DETECTOR}.
     *
     * @param val the detector; must not be null
     * @throws NullPointerException if {@code val} is null
     */
    @Override
    public void setZoneFailureDetector(ZoneFailureDetector val) {
        ZoneFailureDetector detector = checkNotNull(val, "zoneFailureDetector");
        config().set(ZONE_FAILURE_DETECTOR, detector);
    }

    /**
     * @return the configured {@code ZONE_FAILURE_DETECTOR}
     * @throws NullPointerException if no detector is configured
     */
    protected ZoneFailureDetector getZoneFailureDetector() {
        ZoneFailureDetector detector = getConfig(ZONE_FAILURE_DETECTOR);
        return checkNotNull(detector, "zoneFailureDetector config");
    }

    /** @return the value configured under {@code FIRST_MEMBER_SPEC}, possibly null */
    protected EntitySpec<?> getFirstMemberSpec() {
        EntitySpec<?> firstSpec = getConfig(FIRST_MEMBER_SPEC);
        return firstSpec;
    }

    /** @return the value configured under {@code MEMBER_SPEC}, possibly null */
    protected EntitySpec<?> getMemberSpec() {
        EntitySpec<?> memberSpec = getConfig(MEMBER_SPEC);
        return memberSpec;
    }

    /**
     * Returns the value configured under {@code FACTORY}.
     *
     * @deprecated since 0.7.0; use {@link #getMemberSpec()}
     */
    @Deprecated
    protected EntityFactory<?> getFactory() {
        EntityFactory<?> factory = getConfig(FACTORY);
        return factory;
    }

    /**
     * Stores the given spec under {@code MEMBER_SPEC}, delegating to {@code setConfigEvenIfOwned}.
     *
     * @param memberSpec the spec to store
     */
    @Override
    public void setMemberSpec(EntitySpec<?> memberSpec) {
        setConfigEvenIfOwned(MEMBER_SPEC, memberSpec);
    }

    /**
     * Stores the given factory under {@code FACTORY}, delegating to {@code setConfigEvenIfOwned}.
     *
     * @param factory the factory to store
     * @deprecated since 0.7.0; use {@link #setMemberSpec(EntitySpec)}
     */
    @Deprecated
    @Override
    public void setFactory(EntityFactory<?> factory) {
        setConfigEvenIfOwned(FACTORY, factory);
    }

    /**
     * Resolves the single location of this cluster, as reported by
     * {@code Locations.getLocationsCheckingAncestors}.
     *
     * @param required whether the absence of a location is an error
     * @return the only location, or {@code null} if there is none and {@code required} is false
     * @throws IllegalStateException if there is no location and {@code required} is true,
     *         or if more than one location is found
     */
    private Location getLocation(boolean required) {
        Collection<? extends Location> candidates = Locations.getLocationsCheckingAncestors(getLocations(), this);

        if (candidates.isEmpty()) {
            if (required) {
                throw new IllegalStateException("No location available for " + this);
            }
            return null;
        }

        boolean ambiguous = candidates.size() > 1;
        if (ambiguous) {
            throw new IllegalStateException(
                    "Ambiguous location for " + this + "; expected one but had " + candidates);
        }
        return Iterables.getOnlyElement(candidates);
    }

    /** @return the value configured under {@code ENABLE_AVAILABILITY_ZONES} */
    protected boolean isAvailabilityZoneEnabled() {
        Boolean zonesEnabled = getConfig(ENABLE_AVAILABILITY_ZONES);
        return zonesEnabled;
    }

    /** @return the value configured under {@code QUARANTINE_FAILED_ENTITIES} */
    protected boolean isQuarantineEnabled() {
        Boolean quarantineEnabled = getConfig(QUARANTINE_FAILED_ENTITIES);
        return quarantineEnabled;
    }

    /** @return the current value of the {@code QUARANTINE_GROUP} attribute, possibly null */
    protected QuarantineGroup getQuarantineGroup() {
        QuarantineGroup quarantine = getAttribute(QUARANTINE_GROUP);
        return quarantine;
    }

    /**
     * Returns the predicate configured under {@code QUARANTINE_FILTER}.
     * <p>
     * When none is configured, a default predicate is returned which accepts a throwable only if
     * {@code Exceptions.getFirstThrowableOfType} finds no {@link NoMachinesAvailableException} in it.
     *
     * @return the configured or default filter, never null
     */
    protected Predicate<? super Throwable> getQuarantineFilter() {
        Predicate<? super Throwable> configured = getConfig(QUARANTINE_FILTER);
        if (configured != null) {
            return configured;
        }

        return new Predicate<Throwable>() {
            @Override
            public boolean apply(Throwable input) {
                NoMachinesAvailableException noMachines = Exceptions.getFirstThrowableOfType(input,
                        NoMachinesAvailableException.class);
                return noMachines == null;
            }
        };
    }

    /**
     * Computes the quorum size to require on start, derived from {@code INITIAL_QUORUM_SIZE}
     * and bounded by {@code INITIAL_SIZE}.
     * <ul>
     * <li>A negative configured quorum size is replaced by the initial size.</li>
     * <li>A quorum size larger than the initial size is logged as a misconfiguration and
     *     clamped to the initial size.</li>
     * </ul>
     *
     * @return the effective initial quorum size, never greater than the initial size
     */
    protected int getInitialQuorumSize() {
        int initialSize = getConfig(INITIAL_SIZE).intValue();
        int initialQuorumSize = getConfig(INITIAL_QUORUM_SIZE).intValue();
        if (initialQuorumSize < 0) {
            initialQuorumSize = initialSize;
        }
        if (initialQuorumSize > initialSize) {
            // Four placeholders need four arguments: the cluster itself, the bad quorum size,
            // the initial size, and the value actually used.
            LOG.warn(
                    "On start of cluster {}, misconfigured initial quorum size {} greater than initial size {}; using {}",
                    new Object[] { this, initialQuorumSize, initialSize, initialSize });
            initialQuorumSize = initialSize;
        }
        return initialQuorumSize;
    }

    /**
     * Starts the cluster in the given locations.
     * <p>
     * Adds the locations, sets a default display name from the member spec (if one is configured),
     * records the sub-locations when availability zones are enabled, and then runs {@link #doStart()}
     * and waits for the last queued task. A failure is recorded as a problem indicator against
     * {@code START} and re-thrown. The expected state is set to {@code RUNNING} in all cases.
     *
     * @param locsO the locations to add to this cluster before starting
     * @throws IllegalStateException if availability zones are enabled but no location is available
     */
    @Override
    public void start(Collection<? extends Location> locsO) {
        addLocations(locsO);
        Location clusterLoc = getLocation(false);

        EntitySpec<?> memberSpec = getConfig(MEMBER_SPEC);
        if (memberSpec != null) {
            String typeName = JavaClassNames.simpleClassName(memberSpec.getType());
            String locSuffix = (clusterLoc == null) ? "" : " (" + clusterLoc + ")";
            setDefaultDisplayName("Cluster of " + typeName + locSuffix);
        }

        if (isAvailabilityZoneEnabled()) {
            if (clusterLoc == null) {
                throw new IllegalStateException(
                        "When using availability zones, a location must be specified on the cluster");
            }
            sensors().set(SUB_LOCATIONS, findSubLocations(clusterLoc));
        }

        ServiceStateLogic.setExpectedState(this, Lifecycle.STARTING);
        ServiceProblemsLogic.clearProblemsIndicator(this, START);
        try {
            doStart();
            DynamicTasks.waitForLast();
        } catch (Exception startFailure) {
            ServiceProblemsLogic.updateProblemsIndicator(this, START,
                    "start failed with error: " + startFailure);
            throw Exceptions.propagate(startFailure);
        } finally {
            // Runs on both success and failure.
            ServiceStateLogic.setExpectedState(this, Lifecycle.RUNNING);
        }
    }

    /**
     * Performs the actual start: ensures a quarantine group exists (when quarantine is enabled),
     * resizes to {@code INITIAL_SIZE}, checks the result against the initial quorum size, and
     * resumes all policies.
     *
     * @throws IllegalStateException if, after the resize attempt, the current size is below the
     *         initial quorum size; the first failure found (if any) is attached as the cause
     */
    protected void doStart() {
        if (isQuarantineEnabled()) {
            QuarantineGroup quarantineGroup = getAttribute(QUARANTINE_GROUP);
            // Create the quarantine group if there is none yet, or the recorded one is no longer managed.
            if (quarantineGroup == null || !Entities.isManaged(quarantineGroup)) {
                quarantineGroup = addChild(EntitySpec.create(QuarantineGroup.class).displayName("quarantine"));
                sensors().set(QUARANTINE_GROUP, quarantineGroup);
            }
        }

        int initialSize = getConfig(INITIAL_SIZE).intValue();
        int initialQuorumSize = getInitialQuorumSize();
        Exception internalError = null;

        try {
            resize(initialSize);
        } catch (Exception e) {
            Exceptions.propagateIfFatal(e);
            // Apart from logging, ignore problems here; we extract them below.
            // But if it was this thread that threw the exception (rather than a sub-task), then need
            // to record that failure here.
            LOG.debug("Error resizing " + this + " to size " + initialSize + " (collecting and handling): " + e, e);
            internalError = e;
        }

        // Failed child tasks of the current task, used below to build the error message.
        Iterable<Task<?>> failed = Tasks.failed(Tasks.children(Tasks.current()));
        boolean noFailed = Iterables.isEmpty(failed);
        boolean severalFailed = Iterables.size(failed) > 1;

        int currentSize = getCurrentSize().intValue();
        if (currentSize < initialQuorumSize) {
            // Quorum not reached: build a message describing the failure and throw.
            String message;
            if (currentSize == 0 && !noFailed) {
                // Nothing came up and at least one child task failed.
                if (severalFailed)
                    message = "All nodes in cluster " + this + " failed";
                else
                    message = "Node in cluster " + this + " failed";
            } else {
                message = "On start of cluster " + this + ", failed to get to initial size of " + initialSize
                        + "; size is " + getCurrentSize()
                        + (initialQuorumSize != initialSize ? " (initial quorum size is " + initialQuorumSize + ")"
                                : "");
            }
            // Error of the first failed child task, if there is one.
            Throwable firstError = Tasks.getError(Maybe.next(failed.iterator()).orNull());
            if (firstError == null && internalError != null) {
                // only use the internal error if there were no nested task failures
                // (otherwise the internal error should be a wrapper around the nested failures)
                firstError = internalError;
            }
            if (firstError != null) {
                if (severalFailed) {
                    message += "; first failure is: " + Exceptions.collapseText(firstError);
                } else {
                    message += ": " + Exceptions.collapseText(firstError);
                }
            }
            throw new IllegalStateException(message, firstError);

        } else if (currentSize < initialSize) {
            // Quorum reached but not the full desired size: warn and carry on.
            LOG.warn(
                    "On start of cluster {}, size {} reached initial minimum quorum size of {} but did not reach desired size {}; continuing",
                    new Object[] { this, currentSize, initialQuorumSize, initialSize });
        }

        for (Policy it : policies()) {
            it.resume();
        }
    }

    /**
     * Looks up the sub-locations (availability zones) of the given location, using its
     * {@link AvailabilityZoneExtension}.
     * <p>
     * Selection depends on configuration:
     * <ul>
     * <li>if {@code AVAILABILITY_ZONE_NAMES} is non-empty, the sub-locations whose names match are requested;</li>
     * <li>otherwise, if {@code NUM_AVAILABILITY_ZONES} is set, that many sub-locations are requested;</li>
     * <li>otherwise all sub-locations are returned.</li>
     * </ul>
     *
     * @param loc the location whose sub-locations are wanted
     * @return the selected sub-locations
     * @throws IllegalStateException if the location has no availability zone extension, or fewer
     *         sub-locations are available than were asked for
     * @throws IllegalArgumentException if {@code NUM_AVAILABILITY_ZONES} is set but not greater than zero
     */
    protected List<Location> findSubLocations(Location loc) {
        if (!loc.hasExtension(AvailabilityZoneExtension.class)) {
            throw new IllegalStateException("Availability zone extension not supported for location " + loc);
        }

        AvailabilityZoneExtension zoneExtension = loc.getExtension(AvailabilityZoneExtension.class);

        Collection<String> zoneNames = getConfig(AVAILABILITY_ZONE_NAMES);
        Integer numZones = getConfig(NUM_AVAILABILITY_ZONES);

        List<Location> subLocations;
        if (zoneNames == null || zoneNames.isEmpty()) {
            if (numZones != null) {
                // Validate the requested count before handing it to the extension, so that an
                // invalid value is reported as a configuration error rather than passed on.
                checkArgument(numZones > 0, "numZones must be greater than zero: %s", numZones);

                subLocations = zoneExtension.getSubLocations(numZones);
                if (numZones > subLocations.size()) {
                    throw new IllegalStateException("Number of required zones (" + numZones + ") not satisfied in "
                            + loc + "; only " + subLocations.size() + " available: " + subLocations);
                }
            } else {
                subLocations = zoneExtension.getAllSubLocations();
            }
        } else {
            // TODO check that these are valid region / availabilityZones?
            subLocations = zoneExtension.getSubLocationsByName(StringPredicates.equalToAny(zoneNames),
                    zoneNames.size());

            if (zoneNames.size() > subLocations.size()) {
                throw new IllegalStateException(
                        "Number of required zones (" + zoneNames.size() + " - " + zoneNames + ") not satisfied in "
                                + loc + "; only " + subLocations.size() + " available: " + subLocations);
            }
        }

        LOG.info("Returning {} sub-locations: {}", subLocations.size(), Iterables.toString(subLocations));
        return subLocations;
    }

    /**
     * Stops the cluster: suspends all policies, removes the members, stops any remaining
     * stoppable children, and finally stops the {@code clusterOneAndAllMembersUp} feed.
     * <p>
     * The expected state is set to {@code STOPPING} on entry and {@code STOPPED} on success.
     * If any step throws, the expected state is set to {@code ON_FIRE} and the exception is propagated.
     */
    @Override
    public void stop() {
        ServiceStateLogic.setExpectedState(this, Lifecycle.STOPPING);
        try {
            for (Policy it : policies()) {
                it.suspend();
            }

            // run shrink without mutex to make things stop even if starting
            int size = getCurrentSize();
            if (size > 0) {
                // shrink is passed a negative delta
                shrink(-size);
            }

            // run resize with mutex to prevent others from starting things
            resize(0);

            // also stop any remaining stoppable children -- eg those on fire
            // (this ignores the quarantine node which is not stoppable)
            StartableMethods.stop(this);

            ServiceStateLogic.setExpectedState(this, Lifecycle.STOPPED);
        } catch (Exception e) {
            ServiceStateLogic.setExpectedState(this, Lifecycle.ON_FIRE);
            throw Exceptions.propagate(e);
        } finally {
            // The feed is stopped whether or not the steps above succeeded.
            if (clusterOneAndAllMembersUp != null)
                clusterOneAndAllMembersUp.stop();
        }
    }

    /**
     * Restarts the managed {@link Startable} children, according to {@code RESTART_MODE}.
     * <p>
     * Supported modes (case-insensitive) are {@code "sequential"} and {@code "parallel"}. The expected
     * state is set to {@code STARTING} before the restart is queued and to {@code RUNNING} once the
     * queued task has been waited for.
     *
     * @throws UnsupportedOperationException if the mode is not configured or is {@code "off"}
     * @throws IllegalArgumentException if the mode is any other unrecognised value
     */
    @Override
    public void restart() {
        String mode = getConfig(RESTART_MODE);
        if (mode == null) {
            throw new UnsupportedOperationException(
                    "Restart not supported for this cluster: " + RESTART_MODE.getName() + " is not configured.");
        }
        if ("off".equalsIgnoreCase(mode)) {
            throw new UnsupportedOperationException("Restart not supported for this cluster.");
        }

        boolean sequential = "sequential".equalsIgnoreCase(mode);
        boolean parallel = !sequential && "parallel".equalsIgnoreCase(mode);
        if (!sequential && !parallel) {
            // Rejected before any state is changed.
            throw new IllegalArgumentException("Unknown " + RESTART_MODE.getName() + " '" + mode + "'");
        }

        ServiceStateLogic.setExpectedState(this, Lifecycle.STARTING);
        Iterable<? extends Entity> restartTargets = Iterables.filter(getChildren(),
                Predicates.and(Predicates.instanceOf(Startable.class), EntityPredicates.isManaged()));
        if (sequential) {
            DynamicTasks.queue(Effectors.invocationSequential(Startable.RESTART, null, restartTargets));
        } else {
            DynamicTasks.queue(Effectors.invocationParallel(Startable.RESTART, null, restartTargets));
        }

        DynamicTasks.waitForLast();
        ServiceStateLogic.setExpectedState(this, Lifecycle.RUNNING);
    }

    /**
     * Resizes the cluster to the desired number of members, holding {@link #mutex} while doing so.
     *
     * @param desiredSize the target size
     * @return the current size after the resize attempt
     * @throws Resizable.InsufficientCapacityException if resizing failed and the failure contains a
     *         {@link NoMachinesAvailableException}
     */
    @Override
    public Integer resize(Integer desiredSize) {
        synchronized (mutex) {
            int sizeBefore = getCurrentSize();
            int change = desiredSize - sizeBefore;
            if (change == 0) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Resize no-op {} from {} to {}", new Object[] { this, sizeBefore, desiredSize });
                }
            } else {
                LOG.info("Resize {} from {} to {}", new Object[] { this, sizeBefore, desiredSize });
            }

            // If we managed to grow at all, then expect no exception.
            // Otherwise, if failed because NoMachinesAvailable, then propagate as InsufficientCapacityException.
            // This tells things like the AutoScalerPolicy to not keep retrying.
            try {
                resizeByDelta(change);
            } catch (Exception e) {
                Exceptions.propagateIfFatal(e);
                boolean outOfMachines = Exceptions.getFirstThrowableOfType(e,
                        NoMachinesAvailableException.class) != null;
                if (!outOfMachines) {
                    throw Exceptions.propagate(e);
                }
                throw new Resizable.InsufficientCapacityException("Failed to resize", e);
            }
        }
        return getCurrentSize();
    }

    /**
     * {@inheritDoc}
     *
     * <strong>Note</strong> for sub-classes; this method can be called while synchronized on {@link #mutex}.
     * <p>
     * Resolves the member by ID, works out the location for its replacement, and delegates to
     * {@link #replaceMember(Entity, Location, Map)}.
     *
     * @param memberId the ID of the member to replace
     * @return the ID of the replacement entity
     * @throws NoSuchElementException if the ID cannot be resolved or the entity is not a member
     */
    @Override
    public String replaceMember(String memberId) {
        Entity member = getEntityManager().getEntity(memberId);
        LOG.info("In {}, replacing member {} ({})", new Object[] { this, memberId, member });

        if (member == null) {
            throw new NoSuchElementException(
                    "In " + this + ", entity " + memberId + " cannot be resolved, so not replacing");
        }

        synchronized (mutex) {
            if (!getMembers().contains(member)) {
                throw new NoSuchElementException(
                        "In " + this + ", entity " + member + " is not a member so not replacing");
            }

            Location memberLoc = null;
            if (isAvailabilityZoneEnabled()) {
                // this member's location could be a machine provisioned by a sub-location, or the actual sub-location
                List<Location> subLocations = findSubLocations(getLocation(true));
                Collection<Location> actualMemberLocs = member.getLocations();
                boolean foundMatch = false;
                // For each of the member's locations, walk up the parent chain until one of the
                // cluster's sub-locations is hit; stop at the first match.
                for (Iterator<Location> iter = actualMemberLocs.iterator(); !foundMatch && iter.hasNext();) {
                    Location actualMemberLoc = iter.next();
                    Location contenderMemberLoc = actualMemberLoc;
                    do {
                        if (subLocations.contains(contenderMemberLoc)) {
                            memberLoc = contenderMemberLoc;
                            foundMatch = true;
                            LOG.debug("In {} replacing member {} ({}), inferred its sub-location is {}",
                                    new Object[] { this, memberId, member, memberLoc });
                        }
                        contenderMemberLoc = contenderMemberLoc.getParent();
                    } while (!foundMatch && contenderMemberLoc != null);
                }
                if (!foundMatch) {
                    if (actualMemberLocs.isEmpty()) {
                        // Member has no locations at all: use the first sub-location.
                        memberLoc = subLocations.get(0);
                        LOG.warn(
                                "In {} replacing member {} ({}), has no locations; falling back to first availability zone: {}",
                                new Object[] { this, memberId, member, memberLoc });
                    } else {
                        // Prefer a MachineProvisioningLocation among the member's own locations,
                        // otherwise take the first of them.
                        memberLoc = Iterables
                                .tryFind(actualMemberLocs, Predicates.instanceOf(MachineProvisioningLocation.class))
                                .or(Iterables.getFirst(actualMemberLocs, null));
                        LOG.warn(
                                "In {} replacing member {} ({}), could not find matching sub-location; falling back to its actual location: {}",
                                new Object[] { this, memberId, member, memberLoc });
                    }
                } else if (memberLoc == null) {
                    // impossible to get here, based on logic above!
                    throw new IllegalStateException("Unexpected condition! cluster=" + this + "; member=" + member
                            + "; actualMemberLocs=" + actualMemberLocs);
                }
            } else {
                // Replacing member, so new member should be in the same location as that being replaced.
                // Expect this to agree with `getMemberSpec().getLocations()` (if set). If not, then
                // presumably there was a reason this specific member was started somewhere else!
                memberLoc = getLocation(false);
            }

            Entity replacement = replaceMember(member, memberLoc, ImmutableMap.of());
            return replacement.getId();
        }
    }

    /**
     * Adds a replacement entity in the given location and, only if that succeeds, stops and
     * removes the old member. The whole operation runs while holding {@link #mutex}.
     *
     * @param member the member to be replaced
     * @param memberLoc the location for the replacement; may be null
     * @param extraFlags flags passed on when adding the replacement
     * @return the replacement entity
     * @throws IllegalStateException if no replacement could be added (the old member is then kept)
     * @throws StopFailedRuntimeException If stop failed, after successfully starting replacement
     */
    protected Entity replaceMember(Entity member, @Nullable Location memberLoc, Map<?, ?> extraFlags) {
        synchronized (mutex) {
            ReferenceWithError<Optional<Entity>> growResult = addInSingleLocation(memberLoc, extraFlags);

            boolean replacementAdded = growResult.getWithoutError().isPresent();
            if (!replacementAdded) {
                String failureMsg = String.format("In %s, failed to grow, to replace %s; not removing", this,
                        member);
                if (!growResult.hasError()) {
                    throw new IllegalStateException(failureMsg);
                }
                throw new IllegalStateException(failureMsg, growResult.getError());
            }

            try {
                stopAndRemoveNode(member);
            } catch (Exception stopFailure) {
                Exceptions.propagateIfFatal(stopFailure);
                throw new StopFailedRuntimeException(
                        "replaceMember failed to stop and remove old member " + member.getId(), stopFailure);
            }

            return growResult.getWithError().get();
        }
    }

    /**
     * Groups the current members by location, using only the first location of each member.
     * Members with no location are left out.
     *
     * @return a new multimap from location to the members whose first location it is
     */
    protected Multimap<Location, Entity> getMembersByLocation() {
        Multimap<Location, Entity> byLocation = LinkedHashMultimap.create();
        for (Entity member : getMembers()) {
            Collection<Location> locs = member.getLocations();
            Location firstLoc = Iterables.getFirst(locs, null);
            if (firstLoc == null) {
                continue;
            }
            byLocation.put(firstLoc, member);
        }
        return byLocation;
    }

    /**
     * Splits the cluster's sub-locations into failed and non-failed, as judged by the zone
     * failure detector, and publishes both: the failed set to {@code FAILED_SUB_LOCATIONS} and the
     * non-failed list to {@code SUB_LOCATIONS}. Changes relative to the previously published
     * failed set are logged as warnings.
     *
     * @return the sub-locations not currently reported as failed
     */
    protected List<Location> getNonFailedSubLocations() {
        List<Location> healthy = Lists.newArrayList();
        Set<Location> failedNow = Sets.newLinkedHashSet();
        List<Location> allSubLocations = findSubLocations(getLocation(true));

        Set<Location> failedBefore = getAttribute(FAILED_SUB_LOCATIONS);
        if (failedBefore == null) {
            failedBefore = ImmutableSet.<Location>of();
        }

        for (Location candidate : allSubLocations) {
            boolean hasFailed = getZoneFailureDetector().hasFailed(candidate);
            if (hasFailed) {
                failedNow.add(candidate);
            } else {
                healthy.add(candidate);
            }
        }

        Set<Location> newlyFailed = Sets.difference(failedNow, failedBefore);
        Set<Location> newlyRecovered = Sets.difference(failedBefore, failedNow);
        sensors().set(FAILED_SUB_LOCATIONS, failedNow);
        sensors().set(SUB_LOCATIONS, healthy);

        if (newlyFailed.size() > 0) {
            LOG.warn("Detected probably zone failures for {}: {}", this, newlyFailed);
        }
        if (newlyRecovered.size() > 0) {
            LOG.warn("Detected probably zone recoveries for {}: {}", this, newlyRecovered);
        }

        return healthy;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Runs while holding {@link #mutex}: a positive {@code delta} is delegated to
     * {@link #grow(int)}, a negative one to {@link #shrink(int)}, and zero yields an empty list.
     * <p>
     * <strong>Note</strong> for sub-classes; this method can be called while synchronized on {@link #mutex}.
     */
    @Override
    public Collection<Entity> resizeByDelta(int delta) {
        synchronized (mutex) {
            if (delta == 0) {
                return ImmutableList.<Entity>of();
            }
            return (delta > 0) ? grow(delta) : shrink(delta);
        }
    }

    /**
     * Adds {@code delta} new members to the cluster and starts them.
     * <p>
     * The target location for each new member is picked as follows:
     * <ol>
     *   <li>if the member spec declares locations, the first of them is used for every new member
     *       (a warning is logged if availability zones are also enabled);</li>
     *   <li>otherwise, if availability zones are enabled, the zone placement strategy chooses among
     *       the non-failed sub-locations;</li>
     *   <li>otherwise the cluster's own location is used for every new member.</li>
     * </ol>
     * <strong>Note</strong> for sub-classes; this method can be called while synchronized on {@link #mutex}.
     *
     * @param delta the number of members to add; must be positive
     * @return the entities returned by {@link #addInEachLocation(Iterable, Map)}, accessed so that
     *         any recorded error is propagated
     * @throws IllegalArgumentException if {@code delta} is not positive
     * @throws IllegalStateException if the placement strategy does not return exactly {@code delta} locations
     */
    protected Collection<Entity> grow(int delta) {
        Preconditions.checkArgument(delta > 0, "Must call grow with positive delta.");

        List<Location> specLocations = null;
        if (getMemberSpec() != null) {
            specLocations = getMemberSpec().getLocations();
        }
        boolean specHasLocations = specLocations != null && specLocations.size() > 0;

        // work out where each of the new members should go
        List<Location> targets;
        if (specHasLocations) {
            // A location on the memberSpec takes precedence over the one given to cluster.start().
            if (isAvailabilityZoneEnabled()) {
                LOG.warn(
                        "Cluster {} has availability-zone enabled, but memberSpec overrides location with {}; using "
                                + "memberSpec's location; availability-zone behaviour will not apply",
                        this, specLocations);
            }
            targets = Collections.nCopies(delta, specLocations.get(0));
        } else if (!isAvailabilityZoneEnabled()) {
            targets = Collections.nCopies(delta, getLocation(false));
        } else {
            List<Location> healthyZones = getNonFailedSubLocations();
            Multimap<Location, Entity> currentPlacement = getMembersByLocation();
            targets = getZonePlacementStrategy().locationsForAdditions(currentPlacement, healthyZones, delta);
            if (targets.size() != delta) {
                throw new IllegalStateException("Node placement strategy chose " + Iterables.size(targets)
                        + ", when expected delta " + delta + " in " + this);
            }
        }

        // Create and start the members; getWithError() surfaces any failure to the caller.
        return addInEachLocation(targets, ImmutableMap.of()).getWithError();
    }

    /**
     * Removes members from the cluster, invokes {@code stop} on those that are {@link Startable},
     * and discards every removed member whether or not the stop succeeded.
     * <p>
     * If asked to remove more members than {@link #getCurrentSize()} reports, the request is
     * capped at the current size and a warning is logged.
     * <p>
     * <strong>Note</strong> for sub-classes; this method can be called while synchronized on {@link #mutex}.
     *
     * @param delta the (negative) change in size requested
     * @return the entities that were removed; empty if there was nothing to remove
     * @throws IllegalArgumentException if {@code delta} is not negative
     */
    @SuppressWarnings("unchecked")
    protected Collection<Entity> shrink(int delta) {
        Preconditions.checkArgument(delta < 0, "Must call shrink with negative delta.");

        int currentSize = getCurrentSize();
        int toRemove = -delta;
        if (toRemove > currentSize) {
            // some subclasses (esp in tests) use custom sizes without the members set always being accurate, so put a limit on the size
            LOG.warn("Call to shrink " + this + " by " + delta + " when size is " + currentSize + "; amending");
            toRemove = currentSize;
        }
        if (toRemove == 0) {
            return ImmutableList.<Entity>of();
        }

        Collection<Entity> removed = pickAndRemoveMembers(toRemove);

        // FIXME symmetry in order of added as child, managed, started, and added to group
        Iterable<Entity> stoppable = (Iterable<Entity>) (Iterable<?>) Iterables.filter(removed, Startable.class);
        Task<?> stopTask = Entities.invokeEffector(this, stoppable, Startable.STOP,
                Collections.<String, Object>emptyMap());
        try {
            stopTask.get();
        } catch (Exception e) {
            throw Exceptions.propagate(e);
        } finally {
            // discard the removed members regardless of how the stop went
            for (Entity gone : removed) {
                discardNode(gone);
            }
        }
        return removed;
    }

    /**
     * Adds and starts a single new member in the given location.
     * <p>
     * Delegates to {@link #addInEachLocation(Iterable, Map)} with a one-element list and converts
     * the resulting collection into an {@link Optional}: absent when no entity was started
     * successfully, otherwise the single started entity. Any error carried by the delegate's
     * result is carried over with the same masking/throwing mode.
     *
     * @param location the location to add the member in; may be null
     * @param flags extra flags passed through to {@link #addInEachLocation(Iterable, Map)}
     * @return a reference holding the started entity (if any) together with any error recorded
     */
    protected ReferenceWithError<Optional<Entity>> addInSingleLocation(@Nullable Location location,
            Map<?, ?> flags) {
        ReferenceWithError<Collection<Entity>> added = addInEachLocation(Arrays.asList(location), flags);

        // Read the payload exactly once and never through get(): for a reference that does not mask
        // its error, get() rethrows that error, which would bypass the re-wrapping below.
        Collection<Entity> started = added.getWithoutError();
        Optional<Entity> result = (started == null || started.isEmpty())
                ? Optional.<Entity>absent()
                : Optional.of(Iterables.getOnlyElement(started));

        if (!added.hasError()) {
            return ReferenceWithError.newInstanceWithoutError(result);
        }
        // Preserve the delegate's choice of whether the error is masked or thrown on access.
        if (added.masksErrorIfPresent()) {
            return ReferenceWithError.newInstanceMaskingError(result, added.getError());
        }
        return ReferenceWithError.newInstanceThrowingError(result, added.getError());
    }

    /**
     * Creates one new member per entry in {@code locations}, starts all {@link Startable} ones in
     * parallel, and waits for the start tasks to finish.
     * <p>
     * When availability zones are enabled, each start success or failure is reported to the zone
     * failure detector. Members that failed to start are quarantined if quarantine is enabled,
     * and discarded otherwise.
     *
     * @param locations the location for each new member, one member per entry
     * @param flags extra flags passed to {@link #addNode(Location, Map)} for every member
     * @return a reference to the members that started without error; when at least one member
     *         failed, it also carries the combined failure as a masked error
     */
    protected ReferenceWithError<Collection<Entity>> addInEachLocation(Iterable<Location> locations,
            Map<?, ?> flags) {
        List<Entity> created = Lists.newArrayList();
        Map<Entity, Location> placement = Maps.newLinkedHashMap();
        Map<Entity, Task<?>> startTasks = Maps.newLinkedHashMap();

        for (Location target : locations) {
            Entity node = addNode(target, flags);
            created.add(node);
            placement.put(node, target);
            if (!(node instanceof Startable)) {
                continue;
            }
            Map<String, ?> startArgs = ImmutableMap.of("locations", ImmutableList.of(target));
            Task<Void> startTask = Effectors.invocation(node, Startable.START, startArgs).asTask();
            startTasks.put(node, startTask);
        }

        // Marked inessential so an individual failure is collected below instead of failing the caller's task.
        Task<List<?>> parallel = Tasks.parallel(
                "starting " + startTasks.size() + " node" + Strings.s(startTasks.size()) + " (parallel)",
                startTasks.values());
        TaskTags.markInessential(parallel);
        DynamicTasks.queueIfPossible(parallel).orSubmitAsync(this);
        Map<Entity, Throwable> failures = waitForTasksOnEntityStart(startTasks);

        // if tracking, then report success/fail to the ZoneFailureDetector
        if (isAvailabilityZoneEnabled()) {
            for (Map.Entry<Entity, Location> placed : placement.entrySet()) {
                Entity node = placed.getKey();
                Location target = placed.getValue();
                Throwable failure = failures.get(node);
                if (failure != null) {
                    getZoneFailureDetector().onStartupFailure(target, node, failure);
                } else {
                    getZoneFailureDetector().onStartupSuccess(target, node);
                }
            }
        }

        Collection<Entity> succeeded = MutableList.<Entity>builder()
                .addAll(created)
                .removeAll(failures.keySet())
                .build();

        if (failures.isEmpty()) {
            return ReferenceWithError.newInstanceWithoutError(succeeded);
        }

        // quarantine/cleanup as necessary
        if (isQuarantineEnabled()) {
            quarantineFailedNodes(failures);
        } else {
            cleanupFailedNodes(failures.keySet());
        }
        return ReferenceWithError.newInstanceMaskingError(succeeded, Exceptions.create(failures.values()));
    }

    /**
     * Handles members that failed to start when quarantine is enabled.
     * <p>
     * A member is quarantined when its recorded cause is null or is accepted by the quarantine
     * filter: an {@code ENTITY_QUARANTINED} event is emitted, and the member is added to the
     * quarantine group and removed from this cluster's members. Any other member is discarded.
     *
     * @param failedEntities the failed members mapped to the cause of their failure
     */
    protected void quarantineFailedNodes(Map<Entity, Throwable> failedEntities) {
        for (Map.Entry<Entity, Throwable> failure : failedEntities.entrySet()) {
            Entity node = failure.getKey();
            Throwable reason = failure.getValue();

            // the filter is only consulted when there is a cause to show it
            boolean shouldQuarantine = reason == null || getQuarantineFilter().apply(reason);
            if (!shouldQuarantine) {
                LOG.info("Cluster {} discarding failed node {}, rather than quarantining", this, node);
                discardNode(node);
                continue;
            }

            sensors().emit(ENTITY_QUARANTINED, node);
            getQuarantineGroup().addMember(node);
            removeMember(node);
        }
    }

    /**
     * Discards every member that failed to start; used when quarantine is not enabled.
     *
     * @param failedEntities the members to discard
     */
    protected void cleanupFailedNodes(Collection<Entity> failedEntities) {
        // TODO Could also call stop on them?
        for (Entity failed : failedEntities) {
            discardNode(failed);
        }
    }

    /**
     * Blocks on each of the given start tasks in turn and collects the ones that failed.
     * <p>
     * An interruption while waiting is propagated immediately. Any other throwable is logged and
     * recorded against its entity, and waiting continues with the remaining tasks.
     *
     * @param tasks the start task for each entity being started
     * @return the entities whose task failed, mapped to the throwable raised by {@code Task.get()},
     *         in the iteration order of {@code tasks}; empty if every task succeeded
     */
    protected Map<Entity, Throwable> waitForTasksOnEntityStart(Map<? extends Entity, ? extends Task<?>> tasks) {
        // TODO Could have CompoundException, rather than propagating first
        Map<Entity, Throwable> failures = Maps.newLinkedHashMap();

        for (Map.Entry<? extends Entity, ? extends Task<?>> pending : tasks.entrySet()) {
            Entity node = pending.getKey();
            Task<?> startTask = pending.getValue();
            try {
                startTask.get();
            } catch (InterruptedException e) {
                throw Exceptions.propagate(e);
            } catch (Throwable failure) {
                // Log the most interesting cause prominently, and the full trace at debug only.
                Throwable rootCause = Exceptions.getFirstInteresting(failure);
                LOG.error("Cluster " + this + " failed to start entity " + node + " (removing): " + rootCause,
                        rootCause);
                LOG.debug("Trace for: Cluster " + this + " failed to start entity " + node + " (removing): "
                        + failure, failure);
                // the throwable is recorded as caught, without unwrapping
                failures.put(node, failure);
            }
        }
        return failures;
    }

    /**
     * Removes the child via the superclass and, if that changed anything, also removes it from
     * this cluster's members.
     *
     * @return the value returned by the superclass's {@code removeChild}
     */
    @Override
    public boolean removeChild(Entity child) {
        if (!super.removeChild(child)) {
            return false;
        }
        removeMember(child);
        return true;
    }

    /**
     * Returns the value configured for {@code CUSTOM_CHILD_FLAGS}.
     *
     * @return the configured flags map, as returned by {@code getConfig}
     */
    protected Map<?, ?> getCustomChildFlags() {
        Map<?, ?> childFlags = getConfig(CUSTOM_CHILD_FLAGS);
        return childFlags;
    }

    /**
     * Creates a new node, marks it as belonging to this cluster, manages it and adds it as a member.
     * <p>
     * The flags handed to {@link #createNode(Location, Map)} are the custom child flags, then
     * {@code extraFlags}, then a {@code CLUSTER_MEMBER_ID} entry taken from the
     * {@code NEXT_CLUSTER_MEMBER_ID} sensor.
     *
     * @param loc the location for the new node; may be null
     * @param extraFlags additional flags to apply when creating the node
     * @return the newly created node, already added as a member
     */
    @Override
    public Entity addNode(@Nullable Location loc, Map<?, ?> extraFlags) {
        // In case subclasses are foolish and do not call super.init() when overriding.
        initialiseMemberId();

        Map<?, ?> nodeFlags = MutableMap.builder()
                .putAll(getCustomChildFlags())
                .putAll(extraFlags)
                .put(CLUSTER_MEMBER_ID, sensors().get(NEXT_CLUSTER_MEMBER_ID).get())
                .build();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Creating and adding a node to cluster {}({}) with properties {}",
                    new Object[] { this, getId(), Sanitizer.sanitize(nodeFlags) });
        }

        // TODO should refactor to have a createNodeSpec; and spec should support initial sensor values
        Entity node = createNode(loc, nodeFlags);

        node.sensors().set(CLUSTER_MEMBER, true);
        node.sensors().set(CLUSTER, this);

        // Continue to call manage(), because some uses of NodeFactory (in tests) still instantiate the
        // entity via its constructor
        Entities.manage(node);

        addMember(node);
        return node;
    }

    /**
     * Creates (but does not start) a new node for this cluster.
     * <p>
     * The first-member spec is preferred while the cluster has no members, with the regular member
     * spec as the fallback. If a spec is found, the node is added as a child built from a copy of
     * that spec configured with {@code flags} and, when non-null, {@code loc}. If there is no spec,
     * the entity factory is used instead and the resulting entity is parented to this cluster
     * if it has no parent yet.
     *
     * @param loc the location for the node; may be null
     * @param flags configuration flags for the new node
     * @return the newly created entity
     * @throws IllegalStateException if neither a spec nor a factory is available, or if the
     *         factory returns null
     */
    protected Entity createNode(@Nullable Location loc, Map<?, ?> flags) {
        EntitySpec<?> spec = null;
        if (getMembers().isEmpty()) {
            spec = getFirstMemberSpec();
        }
        if (spec == null) {
            spec = getMemberSpec();
        }

        if (spec == null) {
            // no spec available: fall back to the entity factory
            EntityFactory<?> factory = getFactory();
            if (factory == null) {
                throw new IllegalStateException(
                        "No member spec nor entity factory supplied for dynamic cluster " + this);
            }
            EntityFactory<?> effectiveFactory = factory;
            if (factory instanceof EntityFactoryForLocation) {
                effectiveFactory = ((EntityFactoryForLocation<?>) factory).newFactoryForLocation(loc);
            }
            Entity created = effectiveFactory.newEntity(flags, this);
            if (created == null) {
                throw new IllegalStateException("EntityFactory factory routine returned null entity, in " + this);
            }
            if (created.getParent() == null) {
                created.setParent(this);
            }
            return created;
        }

        // configure a copy, so the stored spec itself is left untouched
        EntitySpec<?> configured = EntitySpec.create(spec).configure(flags);
        if (loc != null) {
            configured.location(loc);
        }
        return addChild(configured);
    }

    /**
     * Chooses up to {@code delta} members and removes them from this cluster's membership.
     * <p>
     * With availability zones enabled, the zone placement strategy picks the members and must
     * return exactly {@code delta} of them. Otherwise members are picked one at a time via the
     * removal strategy, and fewer than {@code delta} may be returned if the members run out.
     *
     * @param delta the number of members to remove
     * @return the members that were removed
     * @throws IllegalStateException if there are no members when more than one removal (or any
     *         zone-aware removal) is requested, or if the placement strategy returns the wrong
     *         number of entities
     */
    protected List<Entity> pickAndRemoveMembers(int delta) {
        if (delta == 0) {
            return Lists.newArrayList();
        }

        if (delta == 1 && !isAvailabilityZoneEnabled()) {
            Maybe<Entity> single = tryPickAndRemoveMember();
            if (single.isPresent()) {
                return ImmutableList.of(single.get());
            }
            return ImmutableList.<Entity>of();
        }

        // TODO inefficient impl
        Preconditions.checkState(getMembers().size() > 0,
                "Attempt to remove a node (delta " + delta + ") when members is empty, from cluster " + this);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Removing a node from {}", this);
        }

        if (!isAvailabilityZoneEnabled()) {
            List<Entity> removed = Lists.newArrayList();
            for (int remaining = delta; remaining > 0; remaining--) {
                // don't assume we have enough members; e.g. if shrinking to zero and someone else concurrently stops a member,
                // then just return what we were able to remove.
                Maybe<Entity> picked = tryPickAndRemoveMember();
                if (picked.isPresent()) {
                    removed.add(picked.get());
                }
            }
            return removed;
        }

        Multimap<Location, Entity> currentPlacement = getMembersByLocation();
        List<Entity> chosen = getZonePlacementStrategy().entitiesToRemove(currentPlacement, delta);

        Preconditions.checkState(chosen.size() == delta,
                "Incorrect num entity chosen for removal from %s (%s when expected %s)", getId(),
                chosen.size(), delta);

        for (Entity victim : chosen) {
            removeMember(victim);
        }
        return chosen;
    }

    /**
     * Uses the removal strategy to pick one member and removes it from this cluster's membership.
     * Not intended for use when availability zones are enabled.
     *
     * @return the removed member, or absent if the cluster currently has no members
     * @throws NullPointerException if the removal strategy returns null
     */
    private Maybe<Entity> tryPickAndRemoveMember() {
        assert !isAvailabilityZoneEnabled() : "should instead call pickAndRemoveMembers(int) if using availability zones";

        // TODO inefficient impl
        Collection<Entity> candidates = getMembers();
        if (candidates.isEmpty()) {
            return Maybe.absent();
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Removing a node from {}", this);
        }
        Entity chosen = getRemovalStrategy().apply(candidates);
        Preconditions.checkNotNull(chosen, "No entity chosen for removal from " + getId());

        removeMember(chosen);
        return Maybe.of(chosen);
    }

    /**
     * Removes the entity from this cluster's members and unmanages it.
     * <p>
     * An {@link IllegalStateException} from unmanaging is logged at debug level and otherwise
     * ignored, on the assumption that the entity was already unmanaged.
     *
     * @param entity the node to discard
     */
    protected void discardNode(Entity entity) {
        removeMember(entity);
        try {
            Entities.unmanage(entity);
        } catch (IllegalStateException alreadyUnmanaged) {
            // best effort: most likely somebody else has unmanaged the node already
            LOG.debug("Exception during removing member of cluster " + this + ", unmanaging node " + entity
                    + ". The node is probably already unmanaged.", alreadyUnmanaged);
        }
    }

    /**
     * Removes the member from this cluster, stops it if it is {@link Startable} (waiting for the
     * stop to complete), and finally unmanages it whether or not the stop succeeded.
     *
     * @param member the node to stop and remove
     */
    protected void stopAndRemoveNode(Entity member) {
        removeMember(member);

        try {
            if (!(member instanceof Startable)) {
                // nothing to stop; the finally block still unmanages the member
                return;
            }
            member.invoke(Startable.STOP, Collections.<String, Object>emptyMap()).getUnchecked();
        } finally {
            Entities.unmanage(member);
        }
    }
}