org.apache.bookkeeper.client.RackawareEnsemblePlacementPolicyImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.bookkeeper.client.RackawareEnsemblePlacementPolicyImpl.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.bookkeeper.client;

import static com.google.common.base.Preconditions.checkNotNull;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.BOOKIES_JOINED;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.BOOKIES_LEFT;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.FAILED_TO_RESOLVE_NETWORK_LOCATION_COUNTER;
import static org.apache.bookkeeper.client.BookKeeperClientStats.CLIENT_SCOPE;
import static org.apache.bookkeeper.client.BookKeeperClientStats.NUM_WRITABLE_BOOKIES_IN_DEFAULT_RACK;
import static org.apache.bookkeeper.client.BookKeeperClientStats.READ_REQUESTS_REORDERED;
import static org.apache.bookkeeper.client.RegionAwareEnsemblePlacementPolicy.UNKNOWN_REGION;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import io.netty.util.HashedWheelTimer;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Supplier;

import org.apache.bookkeeper.client.BKException.BKNotEnoughBookiesException;
import org.apache.bookkeeper.client.BookieInfoReader.BookieInfo;
import org.apache.bookkeeper.client.WeightedRandomSelection.WeightedObject;
import org.apache.bookkeeper.common.util.ReflectionUtils;
import org.apache.bookkeeper.conf.ClientConfiguration;
import org.apache.bookkeeper.conf.Configurable;
import org.apache.bookkeeper.feature.FeatureProvider;
import org.apache.bookkeeper.net.BookieSocketAddress;
import org.apache.bookkeeper.net.DNSToSwitchMapping;
import org.apache.bookkeeper.net.NetUtils;
import org.apache.bookkeeper.net.NetworkTopology;
import org.apache.bookkeeper.net.NetworkTopologyImpl;
import org.apache.bookkeeper.net.Node;
import org.apache.bookkeeper.net.NodeBase;
import org.apache.bookkeeper.net.ScriptBasedMapping;
import org.apache.bookkeeper.net.StabilizeNetworkTopology;
import org.apache.bookkeeper.stats.Counter;
import org.apache.bookkeeper.stats.Gauge;
import org.apache.bookkeeper.stats.OpStatsLogger;
import org.apache.bookkeeper.stats.StatsLogger;
import org.apache.bookkeeper.stats.annotations.StatsDoc;
import org.apache.commons.collections4.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Simple rackware ensemble placement policy.
 *
 * <p>Make most of the class and methods as protected, so it could be extended to implement other algorithms.
 */
@StatsDoc(name = CLIENT_SCOPE, help = "BookKeeper client stats")
public class RackawareEnsemblePlacementPolicyImpl extends TopologyAwareEnsemblePlacementPolicy {

    static final Logger LOG = LoggerFactory.getLogger(RackawareEnsemblePlacementPolicyImpl.class);
    boolean isWeighted;
    int maxWeightMultiple;
    private Map<BookieNode, WeightedObject> bookieInfoMap = new HashMap<BookieNode, WeightedObject>();
    private WeightedRandomSelection<BookieNode> weightedSelection;

    protected int minNumRacksPerWriteQuorum;
    protected boolean enforceMinNumRacksPerWriteQuorum;
    protected boolean ignoreLocalNodeInPlacementPolicy;

    public static final String REPP_DNS_RESOLVER_CLASS = "reppDnsResolverClass";
    public static final String REPP_RANDOM_READ_REORDERING = "ensembleRandomReadReordering";

    static final int RACKNAME_DISTANCE_FROM_LEAVES = 1;

    // masks for reordering
    static final int LOCAL_MASK = 0x01 << 24;
    static final int LOCAL_FAIL_MASK = 0x02 << 24;
    static final int REMOTE_MASK = 0x04 << 24;
    static final int REMOTE_FAIL_MASK = 0x08 << 24;
    static final int READ_ONLY_MASK = 0x10 << 24;
    static final int SLOW_MASK = 0x20 << 24;
    static final int UNAVAIL_MASK = 0x40 << 24;
    static final int MASK_BITS = 0xFFF << 20;

    static class DefaultResolver implements DNSToSwitchMapping {

        final Supplier<String> defaultRackSupplier;

        public DefaultResolver(Supplier<String> defaultRackSupplier) {
            checkNotNull(defaultRackSupplier, "defaultRackSupplier should not be null");
            this.defaultRackSupplier = defaultRackSupplier;
        }

        @Override
        public List<String> resolve(List<String> names) {
            List<String> rNames = new ArrayList<String>(names.size());
            for (@SuppressWarnings("unused")
            String name : names) {
                final String defaultRack = defaultRackSupplier.get();
                checkNotNull(defaultRack, "defaultRack cannot be null");
                rNames.add(defaultRack);
            }
            return rNames;
        }

        @Override
        public void reloadCachedMappings() {
            // nop
        }

    }

    /**
     * Decorator for any existing dsn resolver.
     * Backfills returned data with appropriate default rack info.
     */
    static class DNSResolverDecorator implements DNSToSwitchMapping {

        final Supplier<String> defaultRackSupplier;
        final DNSToSwitchMapping resolver;
        @StatsDoc(name = FAILED_TO_RESOLVE_NETWORK_LOCATION_COUNTER, help = "total number of times Resolver failed to resolve rack information of a node")
        final Counter failedToResolveNetworkLocationCounter;

        DNSResolverDecorator(DNSToSwitchMapping resolver, Supplier<String> defaultRackSupplier,
                Counter failedToResolveNetworkLocationCounter) {
            checkNotNull(resolver, "Resolver cannot be null");
            checkNotNull(defaultRackSupplier, "defaultRackSupplier should not be null");
            this.defaultRackSupplier = defaultRackSupplier;
            this.resolver = resolver;
            this.failedToResolveNetworkLocationCounter = failedToResolveNetworkLocationCounter;
        }

        public List<String> resolve(List<String> names) {
            if (names == null) {
                return Collections.emptyList();
            }
            final String defaultRack = defaultRackSupplier.get();
            checkNotNull(defaultRack, "Default rack cannot be null");

            List<String> rNames = resolver.resolve(names);
            if (rNames != null && rNames.size() == names.size()) {
                for (int i = 0; i < rNames.size(); ++i) {
                    if (rNames.get(i) == null) {
                        LOG.warn("Failed to resolve network location for {}, using default rack for it : {}.",
                                names.get(i), defaultRack);
                        failedToResolveNetworkLocationCounter.inc();
                        rNames.set(i, defaultRack);
                    }
                }
                return rNames;
            }

            LOG.warn("Failed to resolve network location for {}, using default rack for them : {}.", names,
                    defaultRack);
            rNames = new ArrayList<>(names.size());

            for (int i = 0; i < names.size(); ++i) {
                failedToResolveNetworkLocationCounter.inc();
                rNames.add(defaultRack);
            }
            return rNames;
        }

        @Override
        public boolean useHostName() {
            return resolver.useHostName();
        }

        @Override
        public void reloadCachedMappings() {
            resolver.reloadCachedMappings();
        }
    }

    // for now, we just maintain the writable bookies' topology
    protected NetworkTopology topology;
    protected DNSToSwitchMapping dnsResolver;
    protected HashedWheelTimer timer;
    protected final Map<BookieSocketAddress, BookieNode> knownBookies;
    // Use a loading cache so slow bookies are expired. Use entryId as values.
    protected Cache<BookieSocketAddress, Long> slowBookies;
    protected BookieNode localNode;
    protected final ReentrantReadWriteLock rwLock;
    // Initialize to empty set
    protected ImmutableSet<BookieSocketAddress> readOnlyBookies = ImmutableSet.of();
    protected boolean reorderReadsRandom = false;
    protected boolean enforceDurability = false;
    protected int stabilizePeriodSeconds = 0;
    protected int reorderThresholdPendingRequests = 0;
    // looks like these only assigned in the same thread as constructor, immediately after constructor;
    // no need to make volatile
    protected StatsLogger statsLogger = null;
    @StatsDoc(name = BOOKIES_JOINED, help = "The distribution of number of bookies joined the cluster on each network topology change")
    protected OpStatsLogger bookiesJoinedCounter = null;
    @StatsDoc(name = BOOKIES_LEFT, help = "The distribution of number of bookies left the cluster on each network topology change")
    protected OpStatsLogger bookiesLeftCounter = null;
    @StatsDoc(name = READ_REQUESTS_REORDERED, help = "The distribution of number of bookies reordered on each read request")
    protected OpStatsLogger readReorderedCounter = null;
    @StatsDoc(name = FAILED_TO_RESOLVE_NETWORK_LOCATION_COUNTER, help = "Counter for number of times DNSResolverDecorator failed to resolve Network Location")
    protected Counter failedToResolveNetworkLocationCounter = null;
    @StatsDoc(name = NUM_WRITABLE_BOOKIES_IN_DEFAULT_RACK, help = "Gauge for the number of writable Bookies in default rack")
    protected Gauge<Integer> numWritableBookiesInDefaultRack;

    private String defaultRack = NetworkTopology.DEFAULT_RACK;

    RackawareEnsemblePlacementPolicyImpl() {
        this(false);
    }

    RackawareEnsemblePlacementPolicyImpl(boolean enforceDurability) {
        this.enforceDurability = enforceDurability;
        topology = new NetworkTopologyImpl();
        knownBookies = new HashMap<BookieSocketAddress, BookieNode>();

        rwLock = new ReentrantReadWriteLock();
    }

    protected BookieNode createBookieNode(BookieSocketAddress addr) {
        return new BookieNode(addr, resolveNetworkLocation(addr));
    }

    /**
     * Initialize the policy.
     *
     * @param dnsResolver the object used to resolve addresses to their network address
     * @return initialized ensemble placement policy
     */
    protected RackawareEnsemblePlacementPolicyImpl initialize(DNSToSwitchMapping dnsResolver,
            HashedWheelTimer timer, boolean reorderReadsRandom, int stabilizePeriodSeconds,
            int reorderThresholdPendingRequests, boolean isWeighted, int maxWeightMultiple,
            int minNumRacksPerWriteQuorum, boolean enforceMinNumRacksPerWriteQuorum,
            boolean ignoreLocalNodeInPlacementPolicy, StatsLogger statsLogger) {
        checkNotNull(statsLogger, "statsLogger should not be null, use NullStatsLogger instead.");
        this.statsLogger = statsLogger;
        this.bookiesJoinedCounter = statsLogger.getOpStatsLogger(BOOKIES_JOINED);
        this.bookiesLeftCounter = statsLogger.getOpStatsLogger(BOOKIES_LEFT);
        this.readReorderedCounter = statsLogger.getOpStatsLogger(READ_REQUESTS_REORDERED);
        this.failedToResolveNetworkLocationCounter = statsLogger
                .getCounter(FAILED_TO_RESOLVE_NETWORK_LOCATION_COUNTER);
        this.numWritableBookiesInDefaultRack = new Gauge<Integer>() {
            @Override
            public Integer getDefaultValue() {
                return 0;
            }

            @Override
            public Integer getSample() {
                rwLock.readLock().lock();
                try {
                    return topology.countNumOfAvailableNodes(getDefaultRack(), Collections.emptySet());
                } finally {
                    rwLock.readLock().unlock();
                }
            }
        };
        this.statsLogger.registerGauge(NUM_WRITABLE_BOOKIES_IN_DEFAULT_RACK, numWritableBookiesInDefaultRack);
        this.reorderReadsRandom = reorderReadsRandom;
        this.stabilizePeriodSeconds = stabilizePeriodSeconds;
        this.reorderThresholdPendingRequests = reorderThresholdPendingRequests;
        this.dnsResolver = new DNSResolverDecorator(dnsResolver, () -> this.getDefaultRack(),
                failedToResolveNetworkLocationCounter);
        this.timer = timer;
        this.minNumRacksPerWriteQuorum = minNumRacksPerWriteQuorum;
        this.enforceMinNumRacksPerWriteQuorum = enforceMinNumRacksPerWriteQuorum;
        this.ignoreLocalNodeInPlacementPolicy = ignoreLocalNodeInPlacementPolicy;

        // create the network topology
        if (stabilizePeriodSeconds > 0) {
            this.topology = new StabilizeNetworkTopology(timer, stabilizePeriodSeconds);
        } else {
            this.topology = new NetworkTopologyImpl();
        }

        BookieNode bn = null;
        if (!ignoreLocalNodeInPlacementPolicy) {
            try {
                bn = createBookieNode(new BookieSocketAddress(InetAddress.getLocalHost().getHostAddress(), 0));
            } catch (UnknownHostException e) {
                LOG.error("Failed to get local host address : ", e);
            }
        } else {
            LOG.info("Ignoring LocalNode in Placementpolicy");
        }
        localNode = bn;
        LOG.info("Initialize rackaware ensemble placement policy @ {} @ {} : {}.", localNode,
                null == localNode ? "Unknown" : localNode.getNetworkLocation(), dnsResolver.getClass().getName());

        this.isWeighted = isWeighted;
        if (this.isWeighted) {
            this.maxWeightMultiple = maxWeightMultiple;
            this.weightedSelection = new WeightedRandomSelection<BookieNode>(this.maxWeightMultiple);
            LOG.info("Weight based placement with max multiple of " + this.maxWeightMultiple);
        } else {
            LOG.info("Not weighted");
        }
        return this;
    }

    /*
     * sets default rack for the policy.
     * i.e. region-aware policy may want to have /region/rack while regular
     * rack-aware policy needs /rack only since we cannot mix both styles
     */
    public RackawareEnsemblePlacementPolicyImpl withDefaultRack(String rack) {
        checkNotNull(rack, "Default rack cannot be null");

        this.defaultRack = rack;
        return this;
    }

    public String getDefaultRack() {
        return defaultRack;
    }

    @Override
    public RackawareEnsemblePlacementPolicyImpl initialize(ClientConfiguration conf,
            Optional<DNSToSwitchMapping> optionalDnsResolver, HashedWheelTimer timer,
            FeatureProvider featureProvider, StatsLogger statsLogger) {
        DNSToSwitchMapping dnsResolver;
        if (optionalDnsResolver.isPresent()) {
            dnsResolver = optionalDnsResolver.get();
        } else {
            String dnsResolverName = conf.getString(REPP_DNS_RESOLVER_CLASS, ScriptBasedMapping.class.getName());
            try {
                dnsResolver = ReflectionUtils.newInstance(dnsResolverName, DNSToSwitchMapping.class);
                if (dnsResolver instanceof Configurable) {
                    ((Configurable) dnsResolver).setConf(conf);
                }

                if (dnsResolver instanceof RackChangeNotifier) {
                    ((RackChangeNotifier) dnsResolver).registerRackChangeListener(this);
                }
            } catch (RuntimeException re) {
                if (!conf.getEnforceMinNumRacksPerWriteQuorum()) {
                    LOG.error("Failed to initialize DNS Resolver {}, used default subnet resolver : {}",
                            dnsResolverName, re, re.getMessage());
                    dnsResolver = new DefaultResolver(() -> this.getDefaultRack());
                } else {
                    /*
                     * if minNumRacksPerWriteQuorum is enforced, then it
                     * shouldn't continue in the case of failure to create
                     * dnsResolver.
                     */
                    throw re;
                }
            }
        }
        slowBookies = CacheBuilder.newBuilder()
                .expireAfterWrite(conf.getBookieFailureHistoryExpirationMSec(), TimeUnit.MILLISECONDS)
                .build(new CacheLoader<BookieSocketAddress, Long>() {
                    @Override
                    public Long load(BookieSocketAddress key) throws Exception {
                        return -1L;
                    }
                });
        return initialize(dnsResolver, timer, conf.getBoolean(REPP_RANDOM_READ_REORDERING, false),
                conf.getNetworkTopologyStabilizePeriodSeconds(), conf.getReorderThresholdPendingRequests(),
                conf.getDiskWeightBasedPlacementEnabled(), conf.getBookieMaxWeightMultipleForWeightBasedPlacement(),
                conf.getMinNumRacksPerWriteQuorum(), conf.getEnforceMinNumRacksPerWriteQuorum(),
                conf.getIgnoreLocalNodeInPlacementPolicy(), statsLogger);
    }

    @Override
    public void uninitalize() {
        // do nothing
    }

    protected String resolveNetworkLocation(BookieSocketAddress addr) {
        return NetUtils.resolveNetworkLocation(dnsResolver, addr.getSocketAddress());
    }

    public void onBookieRackChange(List<BookieSocketAddress> bookieAddressList) {
        rwLock.writeLock().lock();
        try {
            for (BookieSocketAddress bookieAddress : bookieAddressList) {
                BookieNode node = knownBookies.get(bookieAddress);
                if (node != null) {
                    // refresh the rack info if its a known bookie
                    topology.remove(node);
                    BookieNode newNode = createBookieNode(bookieAddress);
                    topology.add(newNode);
                    knownBookies.put(bookieAddress, newNode);
                }
            }
        } finally {
            rwLock.writeLock().unlock();
        }
    }

    @Override
    public Set<BookieSocketAddress> onClusterChanged(Set<BookieSocketAddress> writableBookies,
            Set<BookieSocketAddress> readOnlyBookies) {
        rwLock.writeLock().lock();
        try {
            ImmutableSet<BookieSocketAddress> joinedBookies, leftBookies, deadBookies;
            Set<BookieSocketAddress> oldBookieSet = knownBookies.keySet();
            // left bookies : bookies in known bookies, but not in new writable bookie cluster.
            leftBookies = Sets.difference(oldBookieSet, writableBookies).immutableCopy();
            // joined bookies : bookies in new writable bookie cluster, but not in known bookies
            joinedBookies = Sets.difference(writableBookies, oldBookieSet).immutableCopy();
            // dead bookies.
            deadBookies = Sets.difference(leftBookies, readOnlyBookies).immutableCopy();
            LOG.debug("Cluster changed : left bookies are {}, joined bookies are {}, while dead bookies are {}.",
                    leftBookies, joinedBookies, deadBookies);
            handleBookiesThatLeft(leftBookies);
            handleBookiesThatJoined(joinedBookies);
            if (this.isWeighted && (leftBookies.size() > 0 || joinedBookies.size() > 0)) {
                this.weightedSelection.updateMap(this.bookieInfoMap);
            }
            if (!readOnlyBookies.isEmpty()) {
                this.readOnlyBookies = ImmutableSet.copyOf(readOnlyBookies);
            }

            return deadBookies;
        } finally {
            rwLock.writeLock().unlock();
        }
    }

    /*
     * this method should be called in writelock scope of 'rwLock'
     */
    @Override
    public void handleBookiesThatLeft(Set<BookieSocketAddress> leftBookies) {
        for (BookieSocketAddress addr : leftBookies) {
            try {
                BookieNode node = knownBookies.remove(addr);
                if (null != node) {
                    topology.remove(node);
                    if (this.isWeighted) {
                        this.bookieInfoMap.remove(node);
                    }

                    bookiesLeftCounter.registerSuccessfulValue(1L);

                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Cluster changed : bookie {} left from cluster.", addr);
                    }
                }
            } catch (Throwable t) {
                LOG.error("Unexpected exception while handling leaving bookie {}", addr, t);
                if (bookiesLeftCounter != null) {
                    bookiesLeftCounter.registerFailedValue(1L);
                }
                // no need to re-throw; we want to process the rest of the bookies
                // exception anyways will be caught/logged/suppressed in the ZK's event handler
            }
        }
    }

    /*
     * this method should be called in writelock scope of 'rwLock'
     */
    @Override
    public void handleBookiesThatJoined(Set<BookieSocketAddress> joinedBookies) {
        // node joined
        for (BookieSocketAddress addr : joinedBookies) {
            try {
                BookieNode node = createBookieNode(addr);
                topology.add(node);
                knownBookies.put(addr, node);
                if (this.isWeighted) {
                    this.bookieInfoMap.putIfAbsent(node, new BookieInfo());
                }

                bookiesJoinedCounter.registerSuccessfulValue(1L);

                if (LOG.isDebugEnabled()) {
                    LOG.debug("Cluster changed : bookie {} joined the cluster.", addr);
                }
            } catch (Throwable t) {
                // topology.add() throws unchecked exception
                LOG.error("Unexpected exception while handling joining bookie {}", addr, t);

                bookiesJoinedCounter.registerFailedValue(1L);
                // no need to re-throw; we want to process the rest of the bookies
                // exception anyways will be caught/logged/suppressed in the ZK's event handler
            }
        }
    }

    protected Set<Node> convertBookiesToNodes(Collection<BookieSocketAddress> excludeBookies) {
        Set<Node> nodes = new HashSet<Node>();
        for (BookieSocketAddress addr : excludeBookies) {
            BookieNode bn = knownBookies.get(addr);
            if (null == bn) {
                bn = createBookieNode(addr);
            }
            nodes.add(bn);
        }
        return nodes;
    }

    private static Set<String> getNetworkLocations(Set<Node> bookieNodes) {
        Set<String> networkLocs = new HashSet<>();
        for (Node bookieNode : bookieNodes) {
            networkLocs.add(bookieNode.getNetworkLocation());
        }
        return networkLocs;
    }

    /*
     * this method should be called in readlock scope of 'rwLock'
     */
    protected Set<BookieSocketAddress> addDefaultRackBookiesIfMinNumRacksIsEnforced(
            Set<BookieSocketAddress> excludeBookies) {
        Set<BookieSocketAddress> comprehensiveExclusionBookiesSet;
        if (enforceMinNumRacksPerWriteQuorum) {
            Set<BookieSocketAddress> bookiesInDefaultRack = null;
            Set<Node> defaultRackLeaves = topology.getLeaves(getDefaultRack());
            for (Node node : defaultRackLeaves) {
                if (node instanceof BookieNode) {
                    if (bookiesInDefaultRack == null) {
                        bookiesInDefaultRack = new HashSet<BookieSocketAddress>(excludeBookies);
                    }
                    bookiesInDefaultRack.add(((BookieNode) node).getAddr());
                } else {
                    LOG.error("found non-BookieNode: {} as leaf of defaultrack: {}", node, getDefaultRack());
                }
            }
            if ((bookiesInDefaultRack == null) || bookiesInDefaultRack.isEmpty()) {
                comprehensiveExclusionBookiesSet = excludeBookies;
            } else {
                comprehensiveExclusionBookiesSet = new HashSet<BookieSocketAddress>(excludeBookies);
                comprehensiveExclusionBookiesSet.addAll(bookiesInDefaultRack);
                LOG.info("enforceMinNumRacksPerWriteQuorum is enabled, so Excluding bookies of defaultRack: {}",
                        bookiesInDefaultRack);
            }
        } else {
            comprehensiveExclusionBookiesSet = excludeBookies;
        }
        return comprehensiveExclusionBookiesSet;
    }

    @Override
    public PlacementResult<List<BookieSocketAddress>> newEnsemble(int ensembleSize, int writeQuorumSize,
            int ackQuorumSize, Map<String, byte[]> customMetadata, Set<BookieSocketAddress> excludeBookies)
            throws BKNotEnoughBookiesException {
        rwLock.readLock().lock();
        try {
            Set<BookieSocketAddress> comprehensiveExclusionBookiesSet = addDefaultRackBookiesIfMinNumRacksIsEnforced(
                    excludeBookies);
            PlacementResult<List<BookieSocketAddress>> newEnsembleResult = newEnsembleInternal(ensembleSize,
                    writeQuorumSize, ackQuorumSize, comprehensiveExclusionBookiesSet, null, null);
            return newEnsembleResult;
        } finally {
            rwLock.readLock().unlock();
        }
    }

    @Override
    public PlacementResult<List<BookieSocketAddress>> newEnsemble(int ensembleSize, int writeQuorumSize,
            int ackQuorumSize, Set<BookieSocketAddress> excludeBookies, Ensemble<BookieNode> parentEnsemble,
            Predicate<BookieNode> parentPredicate) throws BKNotEnoughBookiesException {
        return newEnsembleInternal(ensembleSize, writeQuorumSize, ackQuorumSize, excludeBookies, parentEnsemble,
                parentPredicate);
    }

    protected PlacementResult<List<BookieSocketAddress>> newEnsembleInternal(int ensembleSize, int writeQuorumSize,
            int ackQuorumSize, Set<BookieSocketAddress> excludeBookies, Ensemble<BookieNode> parentEnsemble,
            Predicate<BookieNode> parentPredicate) throws BKNotEnoughBookiesException {
        rwLock.readLock().lock();
        try {
            Set<Node> excludeNodes = convertBookiesToNodes(excludeBookies);
            int minNumRacksPerWriteQuorumForThisEnsemble = Math.min(writeQuorumSize, minNumRacksPerWriteQuorum);
            RRTopologyAwareCoverageEnsemble ensemble = new RRTopologyAwareCoverageEnsemble(ensembleSize,
                    writeQuorumSize, ackQuorumSize, RACKNAME_DISTANCE_FROM_LEAVES, parentEnsemble, parentPredicate,
                    minNumRacksPerWriteQuorumForThisEnsemble);
            BookieNode prevNode = null;
            int numRacks = topology.getNumOfRacks();
            // only one rack, use the random algorithm.
            if (numRacks < 2) {
                if (enforceMinNumRacksPerWriteQuorum && (minNumRacksPerWriteQuorumForThisEnsemble > 1)) {
                    LOG.error("Only one rack available and minNumRacksPerWriteQuorum is enforced, so giving up");
                    throw new BKNotEnoughBookiesException();
                }
                List<BookieNode> bns = selectRandom(ensembleSize, excludeNodes, TruePredicate.INSTANCE, ensemble);
                ArrayList<BookieSocketAddress> addrs = new ArrayList<BookieSocketAddress>(ensembleSize);
                for (BookieNode bn : bns) {
                    addrs.add(bn.getAddr());
                }
                return PlacementResult.of(addrs, false);
            }

            for (int i = 0; i < ensembleSize; i++) {
                String curRack;
                if (null == prevNode) {
                    if ((null == localNode) || defaultRack.equals(localNode.getNetworkLocation())) {
                        curRack = NodeBase.ROOT;
                    } else {
                        curRack = localNode.getNetworkLocation();
                    }
                } else {
                    curRack = "~" + prevNode.getNetworkLocation();
                }
                boolean firstBookieInTheEnsemble = (null == prevNode);
                prevNode = selectFromNetworkLocation(curRack, excludeNodes, ensemble, ensemble,
                        !enforceMinNumRacksPerWriteQuorum || firstBookieInTheEnsemble);
            }
            List<BookieSocketAddress> bookieList = ensemble.toList();
            if (ensembleSize != bookieList.size()) {
                LOG.error("Not enough {} bookies are available to form an ensemble : {}.", ensembleSize,
                        bookieList);
                throw new BKNotEnoughBookiesException();
            }
            return PlacementResult.of(bookieList,
                    isEnsembleAdheringToPlacementPolicy(bookieList, writeQuorumSize, ackQuorumSize));
        } finally {
            rwLock.readLock().unlock();
        }
    }

    @Override
    public PlacementResult<BookieSocketAddress> replaceBookie(int ensembleSize, int writeQuorumSize,
            int ackQuorumSize, Map<String, byte[]> customMetadata, List<BookieSocketAddress> currentEnsemble,
            BookieSocketAddress bookieToReplace, Set<BookieSocketAddress> excludeBookies)
            throws BKNotEnoughBookiesException {
        rwLock.readLock().lock();
        try {
            excludeBookies = addDefaultRackBookiesIfMinNumRacksIsEnforced(excludeBookies);
            excludeBookies.addAll(currentEnsemble);
            BookieNode bn = knownBookies.get(bookieToReplace);
            if (null == bn) {
                bn = createBookieNode(bookieToReplace);
            }

            Set<Node> ensembleNodes = convertBookiesToNodes(currentEnsemble);
            Set<Node> excludeNodes = convertBookiesToNodes(excludeBookies);

            excludeNodes.addAll(ensembleNodes);
            excludeNodes.add(bn);
            ensembleNodes.remove(bn);

            Set<String> networkLocationsToBeExcluded = getNetworkLocations(ensembleNodes);

            if (LOG.isDebugEnabled()) {
                LOG.debug("Try to choose a new bookie to replace {} from ensemble {}, excluding {}.",
                        bookieToReplace, ensembleNodes, excludeNodes);
            }
            // pick a candidate from same rack to replace
            BookieNode candidate = selectFromNetworkLocation(bn.getNetworkLocation(), networkLocationsToBeExcluded,
                    excludeNodes, TruePredicate.INSTANCE, EnsembleForReplacementWithNoConstraints.INSTANCE,
                    !enforceMinNumRacksPerWriteQuorum);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Bookie {} is chosen to replace bookie {}.", candidate, bn);
            }
            BookieSocketAddress candidateAddr = candidate.getAddr();
            List<BookieSocketAddress> newEnsemble = new ArrayList<BookieSocketAddress>(currentEnsemble);
            if (currentEnsemble.isEmpty()) {
                /*
                 * in testing code there are test cases which would pass empty
                 * currentEnsemble
                 */
                newEnsemble.add(candidateAddr);
            } else {
                newEnsemble.set(currentEnsemble.indexOf(bookieToReplace), candidateAddr);
            }
            return PlacementResult.of(candidateAddr,
                    isEnsembleAdheringToPlacementPolicy(newEnsemble, writeQuorumSize, ackQuorumSize));
        } finally {
            rwLock.readLock().unlock();
        }
    }

    @Override
    public void updateBookieInfo(Map<BookieSocketAddress, BookieInfo> bookieInfoMap) {
        if (!isWeighted) {
            LOG.info("bookieFreeDiskInfo callback called even without weighted placement policy being used.");
            return;
        }
        List<BookieNode> allBookies = new ArrayList<BookieNode>(knownBookies.values());

        // create a new map to reflect the new mapping
        Map<BookieNode, WeightedObject> map = new HashMap<BookieNode, WeightedObject>();
        for (BookieNode bookie : allBookies) {
            if (bookieInfoMap.containsKey(bookie.getAddr())) {
                map.put(bookie, bookieInfoMap.get(bookie.getAddr()));
            } else {
                map.put(bookie, new BookieInfo());
            }
        }
        rwLock.writeLock().lock();
        try {
            this.bookieInfoMap = map;
            this.weightedSelection.updateMap(this.bookieInfoMap);
        } finally {
            rwLock.writeLock().unlock();
        }
    }

    @Override
    public BookieNode selectFromNetworkLocation(String networkLoc, Set<Node> excludeBookies,
            Predicate<BookieNode> predicate, Ensemble<BookieNode> ensemble, boolean fallbackToRandom)
            throws BKNotEnoughBookiesException {
        // select one from local rack
        try {
            return selectRandomFromRack(networkLoc, excludeBookies, predicate, ensemble);
        } catch (BKNotEnoughBookiesException e) {
            if (!fallbackToRandom) {
                LOG.error(
                        "Failed to choose a bookie from {} : "
                                + "excluded {}, enforceMinNumRacksPerWriteQuorum is enabled so giving up.",
                        networkLoc, excludeBookies);
                throw e;
            }
            LOG.warn(
                    "Failed to choose a bookie from {} : "
                            + "excluded {}, fallback to choose bookie randomly from the cluster.",
                    networkLoc, excludeBookies);
            // randomly choose one from whole cluster, ignore the provided predicate.
            return selectRandom(1, excludeBookies, predicate, ensemble).get(0);
        }
    }

    @Override
    public BookieNode selectFromNetworkLocation(String networkLoc, Set<String> excludeRacks,
            Set<Node> excludeBookies, Predicate<BookieNode> predicate, Ensemble<BookieNode> ensemble,
            boolean fallbackToRandom) throws BKNotEnoughBookiesException {
        // first attempt to select one from local rack
        try {
            return selectRandomFromRack(networkLoc, excludeBookies, predicate, ensemble);
        } catch (BKNotEnoughBookiesException e) {
            /*
             * there is no enough bookie from local rack, select bookies from
             * the whole cluster and exclude the racks specified at
             * <tt>excludeRacks</tt>.
             */
            return selectFromNetworkLocation(excludeRacks, excludeBookies, predicate, ensemble, fallbackToRandom);
        }
    }

    /**
     * It randomly selects a {@link BookieNode} that is not on the <i>excludeRacks</i> set, excluding the nodes in
     * <i>excludeBookies</i> set. If it fails to find one, it selects a random {@link BookieNode} from the whole
     * cluster.
     */
    @Override
    public BookieNode selectFromNetworkLocation(Set<String> excludeRacks, Set<Node> excludeBookies,
            Predicate<BookieNode> predicate, Ensemble<BookieNode> ensemble, boolean fallbackToRandom)
            throws BKNotEnoughBookiesException {

        List<BookieNode> knownNodes = new ArrayList<>(knownBookies.values());
        Set<Node> fullExclusionBookiesList = new HashSet<Node>(excludeBookies);
        for (BookieNode knownNode : knownNodes) {
            if (excludeRacks.contains(knownNode.getNetworkLocation())) {
                fullExclusionBookiesList.add(knownNode);
            }
        }

        try {
            return selectRandomInternal(knownNodes, 1, fullExclusionBookiesList, predicate, ensemble).get(0);
        } catch (BKNotEnoughBookiesException e) {
            if (!fallbackToRandom) {
                LOG.error(
                        "Failed to choose a bookie excluding Racks: {} "
                                + "Nodes: {}, enforceMinNumRacksPerWriteQuorum is enabled so giving up.",
                        excludeRacks, excludeBookies);
                throw e;
            }

            LOG.warn("Failed to choose a bookie: excluded {}, fallback to choose bookie randomly from the cluster.",
                    excludeBookies);
            // randomly choose one from whole cluster
            return selectRandom(1, excludeBookies, predicate, ensemble).get(0);
        }
    }

    private WeightedRandomSelection<BookieNode> prepareForWeightedSelection(List<Node> leaves) {
        // create a map of bookieNode->freeDiskSpace for this rack. The assumption is that
        // the number of nodes in a rack is of the order of 40, so it shouldn't be too bad
        // to build it every time during a ledger creation
        Map<BookieNode, WeightedObject> rackMap = new HashMap<BookieNode, WeightedObject>();
        for (Node n : leaves) {
            if (!(n instanceof BookieNode)) {
                continue;
            }
            BookieNode bookie = (BookieNode) n;
            if (this.bookieInfoMap.containsKey(bookie)) {
                rackMap.put(bookie, this.bookieInfoMap.get(bookie));
            } else {
                rackMap.put(bookie, new BookieInfo());
            }
        }
        if (rackMap.size() == 0) {
            return null;
        }

        WeightedRandomSelection<BookieNode> wRSelection = new WeightedRandomSelection<BookieNode>(
                maxWeightMultiple);
        wRSelection.updateMap(rackMap);
        return wRSelection;
    }

    /**
     * Choose random node under a given network path.
     *
     * @param netPath
     *          network path
     * @param excludeBookies
     *          exclude bookies
     * @param predicate
     *          predicate to check whether the target is a good target.
     * @param ensemble
     *          ensemble structure
     * @return chosen bookie.
     */
    protected BookieNode selectRandomFromRack(String netPath, Set<Node> excludeBookies,
            Predicate<BookieNode> predicate, Ensemble<BookieNode> ensemble) throws BKNotEnoughBookiesException {
        WeightedRandomSelection<BookieNode> wRSelection = null;
        List<Node> leaves = new ArrayList<Node>(topology.getLeaves(netPath));
        if (!this.isWeighted) {
            Collections.shuffle(leaves);
        } else {
            if (CollectionUtils.subtract(leaves, excludeBookies).size() < 1) {
                throw new BKNotEnoughBookiesException();
            }
            wRSelection = prepareForWeightedSelection(leaves);
            if (wRSelection == null) {
                throw new BKNotEnoughBookiesException();
            }
        }

        Iterator<Node> it = leaves.iterator();
        Set<Node> bookiesSeenSoFar = new HashSet<Node>();
        while (true) {
            Node n;
            if (isWeighted) {
                if (bookiesSeenSoFar.size() == leaves.size()) {
                    // Don't loop infinitely.
                    break;
                }
                n = wRSelection.getNextRandom();
                bookiesSeenSoFar.add(n);
            } else {
                if (it.hasNext()) {
                    n = it.next();
                } else {
                    break;
                }
            }
            if (excludeBookies.contains(n)) {
                continue;
            }
            if (!(n instanceof BookieNode) || !predicate.apply((BookieNode) n, ensemble)) {
                continue;
            }
            BookieNode bn = (BookieNode) n;
            // got a good candidate
            if (ensemble.addNode(bn)) {
                // add the candidate to exclude set
                excludeBookies.add(bn);
            }
            return bn;
        }
        throw new BKNotEnoughBookiesException();
    }

    /**
     * Choose a random node from whole cluster.
     *
     * @param numBookies
     *          number bookies to choose
     * @param excludeBookies
     *          bookies set to exclude.
     * @param ensemble
     *          ensemble to hold the bookie chosen.
     * @return the bookie node chosen.
     * @throws BKNotEnoughBookiesException
     */
    protected List<BookieNode> selectRandom(int numBookies, Set<Node> excludeBookies,
            Predicate<BookieNode> predicate, Ensemble<BookieNode> ensemble) throws BKNotEnoughBookiesException {
        return selectRandomInternal(null, numBookies, excludeBookies, predicate, ensemble);
    }

    protected List<BookieNode> selectRandomInternal(List<BookieNode> bookiesToSelectFrom, int numBookies,
            Set<Node> excludeBookies, Predicate<BookieNode> predicate, Ensemble<BookieNode> ensemble)
            throws BKNotEnoughBookiesException {
        WeightedRandomSelection<BookieNode> wRSelection = null;
        if (bookiesToSelectFrom == null) {
            // If the list is null, we need to select from the entire knownBookies set
            wRSelection = this.weightedSelection;
            bookiesToSelectFrom = new ArrayList<BookieNode>(knownBookies.values());
        }
        if (isWeighted) {
            if (CollectionUtils.subtract(bookiesToSelectFrom, excludeBookies).size() < numBookies) {
                throw new BKNotEnoughBookiesException();
            }
            if (wRSelection == null) {
                Map<BookieNode, WeightedObject> rackMap = new HashMap<BookieNode, WeightedObject>();
                for (BookieNode n : bookiesToSelectFrom) {
                    if (excludeBookies.contains(n)) {
                        continue;
                    }
                    if (this.bookieInfoMap.containsKey(n)) {
                        rackMap.put(n, this.bookieInfoMap.get(n));
                    } else {
                        rackMap.put(n, new BookieInfo());
                    }
                }
                wRSelection = new WeightedRandomSelection<BookieNode>(this.maxWeightMultiple);
                wRSelection.updateMap(rackMap);
            }
        } else {
            Collections.shuffle(bookiesToSelectFrom);
        }

        BookieNode bookie;
        List<BookieNode> newBookies = new ArrayList<BookieNode>(numBookies);
        Iterator<BookieNode> it = bookiesToSelectFrom.iterator();
        Set<BookieNode> bookiesSeenSoFar = new HashSet<BookieNode>();
        while (numBookies > 0) {
            if (isWeighted) {
                if (bookiesSeenSoFar.size() == bookiesToSelectFrom.size()) {
                    // If we have gone through the whole available list of bookies,
                    // and yet haven't been able to satisfy the ensemble request, bail out.
                    // We don't want to loop infinitely.
                    break;
                }
                bookie = wRSelection.getNextRandom();
                bookiesSeenSoFar.add(bookie);
            } else {
                if (it.hasNext()) {
                    bookie = it.next();
                } else {
                    break;
                }
            }
            if (excludeBookies.contains(bookie)) {
                continue;
            }

            // When durability is being enforced; we must not violate the
            // predicate even when selecting a random bookie; as durability
            // guarantee is not best effort; correctness is implied by it
            if (enforceDurability && !predicate.apply(bookie, ensemble)) {
                continue;
            }

            if (ensemble.addNode(bookie)) {
                excludeBookies.add(bookie);
                newBookies.add(bookie);
                --numBookies;
            }
        }
        if (numBookies == 0) {
            return newBookies;
        }
        LOG.warn("Failed to find {} bookies : excludeBookies {}, allBookies {}.", numBookies, excludeBookies,
                bookiesToSelectFrom);

        throw new BKNotEnoughBookiesException();
    }

    @Override
    public void registerSlowBookie(BookieSocketAddress bookieSocketAddress, long entryId) {
        if (reorderThresholdPendingRequests <= 0) {
            // only put bookies on slowBookies list if reorderThresholdPendingRequests is *not* set (0);
            // otherwise, rely on reordering of reads based on reorderThresholdPendingRequests
            slowBookies.put(bookieSocketAddress, entryId);
        }
    }

    @Override
    public DistributionSchedule.WriteSet reorderReadSequence(List<BookieSocketAddress> ensemble,
            BookiesHealthInfo bookiesHealthInfo, DistributionSchedule.WriteSet writeSet) {
        Map<Integer, String> writeSetWithRegion = new HashMap<>();
        for (int i = 0; i < writeSet.size(); i++) {
            writeSetWithRegion.put(writeSet.get(i), "");
        }
        return reorderReadSequenceWithRegion(ensemble, writeSet, writeSetWithRegion, bookiesHealthInfo, false, "",
                writeSet.size());
    }

    /**
     * This function orders the read sequence with a given region. For region-unaware policies (e.g.
     * RackAware), we pass in false for regionAware and an empty myRegion. When this happens, any
     * remote list will stay empty. The ordering is as follows (the R* at the beginning of each list item
     * is only present for region aware policies).
     *      1. available (local) bookies
     *      2. R* a remote bookie (based on remoteNodeInReorderSequence
     *      3. R* remaining (local) bookies
     *      4. R* remaining remote bookies
     *      5. read only bookies
     *      6. slow bookies
     *      7. unavailable bookies
     *
     * @param ensemble
     *          ensemble of bookies
     * @param writeSet
     *          write set
     * @param writeSetWithRegion
     *          write set with region information
     * @param bookiesHealthInfo
     *          heuristics about health of boookies
     * @param regionAware
     *          whether or not a region-aware policy is used
     * @param myRegion
     *          current region of policy
     * @param remoteNodeInReorderSequence
     *          number of local bookies to try before trying a remote bookie
     * @return ordering of bookies to send read to
     */
    DistributionSchedule.WriteSet reorderReadSequenceWithRegion(List<BookieSocketAddress> ensemble,
            DistributionSchedule.WriteSet writeSet, Map<Integer, String> writeSetWithRegion,
            BookiesHealthInfo bookiesHealthInfo, boolean regionAware, String myRegion,
            int remoteNodeInReorderSequence) {
        boolean useRegionAware = regionAware && (!myRegion.equals(UNKNOWN_REGION));
        int ensembleSize = ensemble.size();

        // For rack aware, If all the bookies in the write set are available, simply return the original write set,
        // to avoid creating more lists
        boolean isAnyBookieUnavailable = false;

        if (useRegionAware || reorderReadsRandom) {
            isAnyBookieUnavailable = true;
        } else {
            for (int i = 0; i < ensemble.size(); i++) {
                BookieSocketAddress bookieAddr = ensemble.get(i);
                if ((!knownBookies.containsKey(bookieAddr) && !readOnlyBookies.contains(bookieAddr))
                        || slowBookies.getIfPresent(bookieAddr) != null) {
                    // Found at least one bookie not available in the ensemble, or in slowBookies
                    isAnyBookieUnavailable = true;
                    break;
                }
            }
        }

        boolean reordered = false;
        if (reorderThresholdPendingRequests > 0) {
            // if there are no slow or unavailable bookies, capture each bookie's number of
            // pending request to reorder requests based on a threshold of pending requests

            // number of pending requests per bookie (same index as writeSet)
            long[] pendingReqs = new long[writeSet.size()];
            int bestBookieIdx = -1;

            for (int i = 0; i < writeSet.size(); i++) {
                pendingReqs[i] = bookiesHealthInfo.getBookiePendingRequests(ensemble.get(writeSet.get(i)));
                if (bestBookieIdx < 0 || pendingReqs[i] < pendingReqs[bestBookieIdx]) {
                    bestBookieIdx = i;
                }
            }

            // reorder the writeSet if the currently first bookie in our writeSet has at
            // least
            // reorderThresholdPendingRequests more outstanding request than the best bookie
            if (bestBookieIdx > 0
                    && pendingReqs[0] >= pendingReqs[bestBookieIdx] + reorderThresholdPendingRequests) {
                // We're not reordering the entire write set, but only move the best bookie
                // to the first place. Chances are good that this bookie will be fast enough
                // to not trigger the speculativeReadTimeout. But even if it hits that timeout,
                // things may have changed by then so much that whichever bookie we put second
                // may actually not be the second-best choice any more.
                if (LOG.isDebugEnabled()) {
                    LOG.debug("read set reordered from {} ({} pending) to {} ({} pending)",
                            ensemble.get(writeSet.get(0)), pendingReqs[0],
                            ensemble.get(writeSet.get(bestBookieIdx)), pendingReqs[bestBookieIdx]);
                }
                writeSet.moveAndShift(bestBookieIdx, 0);
                reordered = true;
            }
        }

        if (!isAnyBookieUnavailable) {
            if (reordered) {
                readReorderedCounter.registerSuccessfulValue(1);
            }
            return writeSet;
        }

        for (int i = 0; i < writeSet.size(); i++) {
            int idx = writeSet.get(i);
            BookieSocketAddress address = ensemble.get(idx);
            String region = writeSetWithRegion.get(idx);
            Long lastFailedEntryOnBookie = bookiesHealthInfo.getBookieFailureHistory(address);
            if (null == knownBookies.get(address)) {
                // there isn't too much differences between readonly bookies
                // from unavailable bookies. since there
                // is no write requests to them, so we shouldn't try reading
                // from readonly bookie prior to writable bookies.
                if ((null == readOnlyBookies) || !readOnlyBookies.contains(address)) {
                    writeSet.set(i, idx | UNAVAIL_MASK);
                } else {
                    if (slowBookies.getIfPresent(address) != null) {
                        long numPendingReqs = bookiesHealthInfo.getBookiePendingRequests(address);
                        // use slow bookies with less pending requests first
                        long slowIdx = numPendingReqs * ensembleSize + idx;
                        writeSet.set(i, (int) (slowIdx & ~MASK_BITS) | SLOW_MASK);
                    } else {
                        writeSet.set(i, idx | READ_ONLY_MASK);
                    }
                }
            } else if (lastFailedEntryOnBookie < 0) {
                if (slowBookies.getIfPresent(address) != null) {
                    long numPendingReqs = bookiesHealthInfo.getBookiePendingRequests(address);
                    long slowIdx = numPendingReqs * ensembleSize + idx;
                    writeSet.set(i, (int) (slowIdx & ~MASK_BITS) | SLOW_MASK);
                } else {
                    if (useRegionAware && !myRegion.equals(region)) {
                        writeSet.set(i, idx | REMOTE_MASK);
                    } else {
                        writeSet.set(i, idx | LOCAL_MASK);
                    }
                }
            } else {
                // use bookies with earlier failed entryIds first
                long failIdx = lastFailedEntryOnBookie * ensembleSize + idx;
                if (useRegionAware && !myRegion.equals(region)) {
                    writeSet.set(i, (int) (failIdx & ~MASK_BITS) | REMOTE_FAIL_MASK);
                } else {
                    writeSet.set(i, (int) (failIdx & ~MASK_BITS) | LOCAL_FAIL_MASK);
                }
            }
        }

        // Add a mask to ensure the sort is stable, sort,
        // and then remove mask. This maintains stability as
        // long as there are fewer than 16 bookies in the write set.
        for (int i = 0; i < writeSet.size(); i++) {
            writeSet.set(i, writeSet.get(i) | ((i & 0xF) << 20));
        }
        writeSet.sort();
        for (int i = 0; i < writeSet.size(); i++) {
            writeSet.set(i, writeSet.get(i) & ~((0xF) << 20));
        }

        if (reorderReadsRandom) {
            shuffleWithMask(writeSet, LOCAL_MASK, MASK_BITS);
            shuffleWithMask(writeSet, REMOTE_MASK, MASK_BITS);
            shuffleWithMask(writeSet, READ_ONLY_MASK, MASK_BITS);
            shuffleWithMask(writeSet, UNAVAIL_MASK, MASK_BITS);
        }

        // nodes within a region are ordered as follows
        // (Random?) list of nodes that have no history of failure
        // Nodes with Failure history are ordered in the reverse
        // order of the most recent entry that generated an error
        // The sort will have put them in correct order,
        // so remove the bits that sort by age.
        for (int i = 0; i < writeSet.size(); i++) {
            int mask = writeSet.get(i) & MASK_BITS;
            int idx = (writeSet.get(i) & ~MASK_BITS) % ensembleSize;
            if (mask == LOCAL_FAIL_MASK) {
                writeSet.set(i, LOCAL_MASK | idx);
            } else if (mask == REMOTE_FAIL_MASK) {
                writeSet.set(i, REMOTE_MASK | idx);
            } else if (mask == SLOW_MASK) {
                writeSet.set(i, SLOW_MASK | idx);
            }
        }

        // Insert a node from the remote region at the specified location so
        // we try more than one region within the max allowed latency
        int firstRemote = -1;
        for (int i = 0; i < writeSet.size(); i++) {
            if ((writeSet.get(i) & MASK_BITS) == REMOTE_MASK) {
                firstRemote = i;
                break;
            }
        }
        if (firstRemote != -1) {
            int i = 0;
            for (; i < remoteNodeInReorderSequence && i < writeSet.size(); i++) {
                if ((writeSet.get(i) & MASK_BITS) != LOCAL_MASK) {
                    break;
                }
            }
            writeSet.moveAndShift(firstRemote, i);
        }

        // remove all masks
        for (int i = 0; i < writeSet.size(); i++) {
            writeSet.set(i, writeSet.get(i) & ~MASK_BITS);
        }
        readReorderedCounter.registerSuccessfulValue(1);
        return writeSet;
    }

    /**
     * Shuffle all the entries of an array that matches a mask.
     * It assumes all entries with the same mask are contiguous in the array.
     */
    static void shuffleWithMask(DistributionSchedule.WriteSet writeSet, int mask, int bits) {
        int first = -1;
        int last = -1;
        for (int i = 0; i < writeSet.size(); i++) {
            if ((writeSet.get(i) & bits) == mask) {
                if (first == -1) {
                    first = i;
                }
                last = i;
            }
        }
        if (first != -1) {
            for (int i = last + 1; i > first; i--) {
                int swapWith = ThreadLocalRandom.current().nextInt(i);
                writeSet.set(swapWith, writeSet.set(i, writeSet.get(swapWith)));
            }
        }
    }

    // this method should be called in readlock scope of 'rwlock'
    @Override
    public boolean isEnsembleAdheringToPlacementPolicy(List<BookieSocketAddress> ensembleList, int writeQuorumSize,
            int ackQuorumSize) {
        int ensembleSize = ensembleList.size();
        int minNumRacksPerWriteQuorumForThisEnsemble = Math.min(writeQuorumSize, minNumRacksPerWriteQuorum);
        HashSet<String> racksInQuorum = new HashSet<String>();
        BookieSocketAddress bookie;
        for (int i = 0; i < ensembleList.size(); i++) {
            racksInQuorum.clear();
            for (int j = 0; j < writeQuorumSize; j++) {
                bookie = ensembleList.get((i + j) % ensembleSize);
                try {
                    racksInQuorum.add(knownBookies.get(bookie).getNetworkLocation());
                } catch (Exception e) {
                    /*
                     * any issue/exception in analyzing whether ensemble is
                     * strictly adhering to placement policy should be
                     * swallowed.
                     */
                    LOG.warn("Received exception while trying to get network location of bookie: {}", bookie, e);
                }
            }
            if ((racksInQuorum.size() < minNumRacksPerWriteQuorumForThisEnsemble)
                    || (enforceMinNumRacksPerWriteQuorum && racksInQuorum.contains(getDefaultRack()))) {
                return false;
            }
        }
        return true;
    }
}